-
Notifications
You must be signed in to change notification settings - Fork 9.2k
HADOOP-19096: [ABFS] [CST Optimization] Enhancing Client-Side Throttling Metrics Updating Logic #6276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HADOOP-19096: [ABFS] [CST Optimization] Enhancing Client-Side Throttling Metrics Updating Logic #6276
Changes from 8 commits
e043dd8
dc3938b
e2d43f2
ad8ce43
108d6de
fa18485
e7ba13b
b4df6fd
94e2559
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,7 +39,9 @@ | |
| import org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding; | ||
|
|
||
| import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_CONTINUE; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.CONNECTION_TIMEOUT_ABBREVIATION; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.EGRESS_LIMIT_BREACH_ABBREVIATION; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.INGRESS_LIMIT_BREACH_ABBREVIATION; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_LIMIT_BREACH_ABBREVIATION; | ||
|
|
||
| /** | ||
| * The AbfsRestOperation for Rest AbfsClient. | ||
|
|
@@ -283,7 +285,7 @@ String getClientLatency() { | |
| private boolean executeHttpOperation(final int retryCount, | ||
| TracingContext tracingContext) throws AzureBlobFileSystemException { | ||
| AbfsHttpOperation httpOperation; | ||
| boolean wasIOExceptionThrown = false; | ||
| boolean wasExceptionThrown = false; | ||
|
|
||
| try { | ||
| // initialize the HTTP request and open the connection | ||
|
|
@@ -321,7 +323,27 @@ private boolean executeHttpOperation(final int retryCount, | |
| } else if (httpOperation.getStatusCode() == HttpURLConnection.HTTP_UNAVAILABLE) { | ||
| incrementCounter(AbfsStatistic.SERVER_UNAVAILABLE, 1); | ||
| } | ||
|
|
||
| // If no exception occurred till here it means http operation was successfully complete and | ||
| // a response from server has been received which might be failure or success. | ||
| // If any kind of exception has occurred it will be caught below. | ||
| // If request failed determine failure reason and retry policy here. | ||
|
||
| // else simply return with success after saving the result. | ||
| LOG.debug("HttpRequest: {}: {}", operationType, httpOperation); | ||
|
|
||
| int status = httpOperation.getStatusCode(); | ||
| failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage()); | ||
| retryPolicy = client.getRetryPolicy(failureReason); | ||
|
|
||
| if (retryPolicy.shouldRetry(retryCount, httpOperation.getStatusCode())) { | ||
| return false; | ||
| } | ||
|
|
||
| // If the request has succeeded or failed with non-retrial error, save the operation and return. | ||
| result = httpOperation; | ||
|
|
||
| } catch (UnknownHostException ex) { | ||
| wasExceptionThrown = true; | ||
|
||
| String hostname = null; | ||
| hostname = httpOperation.getHost(); | ||
| failureReason = RetryReason.getAbbreviation(ex, null, null); | ||
|
|
@@ -333,57 +355,44 @@ private boolean executeHttpOperation(final int retryCount, | |
| } | ||
| return false; | ||
| } catch (IOException ex) { | ||
| wasExceptionThrown = true; | ||
| if (LOG.isDebugEnabled()) { | ||
| LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex); | ||
| } | ||
|
|
||
| failureReason = RetryReason.getAbbreviation(ex, -1, ""); | ||
| retryPolicy = client.getRetryPolicy(failureReason); | ||
| wasIOExceptionThrown = true; | ||
| if (!retryPolicy.shouldRetry(retryCount, -1)) { | ||
| throw new InvalidAbfsRestOperationException(ex, retryCount); | ||
| } | ||
|
|
||
| return false; | ||
| } finally { | ||
| int status = httpOperation.getStatusCode(); | ||
| /* | ||
| A status less than 300 (2xx range) or greater than or equal | ||
| to 500 (5xx range) should contribute to throttling metrics being updated. | ||
| Less than 200 or greater than or equal to 500 show failed operations. 2xx | ||
| range contributes to successful operations. 3xx range is for redirects | ||
| and 4xx range is for user errors. These should not be a part of | ||
| throttling backoff computation. | ||
| Updating Client Side Throttling Metrics for relevant response status codes. | ||
| 1. Status code in 2xx range: Successful Operations should contribute | ||
| 2. Status code in 3xx range: Redirection Operations should not contribute | ||
| 3. Status code in 4xx range: User Errors should not contribute | ||
| 4. Status code is 503: Throttling Error should contribute as following: | ||
| a. 503, Ingress Over Account Limit: Should Contribute | ||
| b. 503, Egress Over Account Limit: Should Contribute | ||
| c. 503, TPS Over Account Limit: Should Contribute | ||
| d. 503, Other Server Throttling: Should not contribute | ||
| 5. Status code in 5xx range other than 503: Should not contribute | ||
| 6. IOException and UnknownHostExceptions: Should not contribute | ||
| */ | ||
| boolean updateMetricsResponseCode = (status < HttpURLConnection.HTTP_MULT_CHOICE | ||
| || status >= HttpURLConnection.HTTP_INTERNAL_ERROR); | ||
|
|
||
| /* | ||
| Connection Timeout failures should not contribute to throttling | ||
| In case the current request fails with Connection Timeout we will have | ||
| ioExceptionThrown true and failure reason as CT | ||
| In case the current request failed with 5xx, failure reason will be | ||
| updated after finally block but wasIOExceptionThrown will be false; | ||
| */ | ||
| boolean isCTFailure = CONNECTION_TIMEOUT_ABBREVIATION.equals(failureReason) && wasIOExceptionThrown; | ||
|
|
||
| if (updateMetricsResponseCode && !isCTFailure) { | ||
| int statusCode = httpOperation.getStatusCode(); | ||
| boolean shouldUpdateCSTMetrics = (statusCode < HttpURLConnection.HTTP_MULT_CHOICE // Case 1 | ||
anujmodi2021 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| || INGRESS_LIMIT_BREACH_ABBREVIATION.equals(failureReason) // Case 4.a | ||
| || EGRESS_LIMIT_BREACH_ABBREVIATION.equals(failureReason) // Case 4.b | ||
| || OPERATION_LIMIT_BREACH_ABBREVIATION.equals(failureReason)) // Case 4.c | ||
| && !wasExceptionThrown; // Case 6 | ||
|
||
|
|
||
| if (shouldUpdateCSTMetrics) { | ||
| intercept.updateMetrics(operationType, httpOperation); | ||
| } | ||
| } | ||
|
|
||
| LOG.debug("HttpRequest: {}: {}", operationType, httpOperation); | ||
|
|
||
| int status = httpOperation.getStatusCode(); | ||
| failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage()); | ||
| retryPolicy = client.getRetryPolicy(failureReason); | ||
|
|
||
| if (retryPolicy.shouldRetry(retryCount, httpOperation.getStatusCode())) { | ||
| return false; | ||
| } | ||
|
|
||
| result = httpOperation; | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,10 +22,12 @@ | |
| import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_STATUS_CATEGORY_QUOTIENT; | ||
| import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.EGRESS_OVER_ACCOUNT_LIMIT; | ||
| import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.INGRESS_OVER_ACCOUNT_LIMIT; | ||
| import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.OTHER_SERVER_THROTTLING; | ||
| import static org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode.TPS_OVER_ACCOUNT_LIMIT; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.EGRESS_LIMIT_BREACH_ABBREVIATION; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.INGRESS_LIMIT_BREACH_ABBREVIATION; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_BREACH_MESSAGE; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OPERATION_LIMIT_BREACH_ABBREVIATION; | ||
| import static org.apache.hadoop.fs.azurebfs.services.RetryReasonConstants.OTHER_SERVER_THROTTLING_ABBREVIATION; | ||
|
|
||
| /** | ||
| * Category that can capture server-response errors for 5XX status-code. | ||
|
|
@@ -56,10 +58,14 @@ String getAbbreviation(final Integer statusCode, | |
| splitedServerErrorMessage)) { | ||
| return EGRESS_LIMIT_BREACH_ABBREVIATION; | ||
| } | ||
| if (OPERATION_BREACH_MESSAGE.equalsIgnoreCase( | ||
| if (TPS_OVER_ACCOUNT_LIMIT.getErrorMessage().equalsIgnoreCase( | ||
| splitedServerErrorMessage)) { | ||
| return OPERATION_LIMIT_BREACH_ABBREVIATION; | ||
|
||
| } | ||
| if (OTHER_SERVER_THROTTLING.getErrorMessage().equalsIgnoreCase( | ||
| splitedServerErrorMessage)) { | ||
| return OTHER_SERVER_THROTTLING_ABBREVIATION; | ||
| } | ||
| return HTTP_UNAVAILABLE + ""; | ||
| } | ||
| return statusCode + ""; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe add a comment on the usage of this,
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added comment