-
Notifications
You must be signed in to change notification settings - Fork 9.2k
HADOOP-18606. Add reason in in x-ms-client-request-id on a retry API call. #5299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
e70429c
2ddd8c9
cc2deb2
eece65f
40adc3e
77c9d21
b5d176b
c980762
34fe029
7acee90
b7d7121
1c3f2ee
9201ab7
77eb790
32e69cb
7f77ead
90fce12
06af705
62179bc
270545d
de424e3
94810b8
dad7b61
ac5593e
77aaa01
c49ab44
e1d38a0
e37d2e2
8c4ac80
9799dba
75594b9
e3ba294
836a352
a2a9a62
e312898
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,7 @@ | |
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import org.apache.hadoop.classification.VisibleForTesting; | ||
| import org.apache.hadoop.fs.azurebfs.AbfsStatistic; | ||
| import org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants; | ||
| import org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException; | ||
|
|
@@ -73,6 +74,8 @@ public class AbfsRestOperation { | |
| private AbfsHttpOperation result; | ||
| private AbfsCounters abfsCounters; | ||
|
|
||
| private String failureReason = null; | ||
|
|
||
| /** | ||
| * Checks if there is non-null HTTP response. | ||
| * @return true if there is a non-null HTTP response from the ABFS call. | ||
|
|
@@ -208,7 +211,7 @@ public void execute(TracingContext tracingContext) | |
| private void completeExecute(TracingContext tracingContext) | ||
| throws AzureBlobFileSystemException { | ||
| // see if we have latency reports from the previous requests | ||
| String latencyHeader = this.client.getAbfsPerfTracker().getClientLatency(); | ||
| String latencyHeader = getClientLatency(); | ||
| if (latencyHeader != null && !latencyHeader.isEmpty()) { | ||
| AbfsHttpHeader httpHeader = | ||
| new AbfsHttpHeader(HttpHeaderConfigurations.X_MS_ABFS_CLIENT_LATENCY, latencyHeader); | ||
|
|
@@ -237,6 +240,11 @@ private void completeExecute(TracingContext tracingContext) | |
| LOG.trace("{} REST operation complete", operationType); | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| String getClientLatency() { | ||
| return this.client.getAbfsPerfTracker().getClientLatency(); | ||
|
||
| } | ||
|
|
||
| /** | ||
| * Executes a single HTTP operation to complete the REST operation. If it | ||
| * fails, there may be a retry. The retryCount is incremented with each | ||
|
|
@@ -248,9 +256,9 @@ private boolean executeHttpOperation(final int retryCount, | |
|
|
||
| try { | ||
| // initialize the HTTP request and open the connection | ||
| httpOperation = new AbfsHttpOperation(url, method, requestHeaders); | ||
| httpOperation = getHttpOperation(); | ||
| incrementCounter(AbfsStatistic.CONNECTIONS_MADE, 1); | ||
| tracingContext.constructHeader(httpOperation); | ||
| tracingContext.constructHeader(httpOperation, failureReason); | ||
|
|
||
| switch(client.getAuthType()) { | ||
| case Custom: | ||
|
|
@@ -303,6 +311,7 @@ private boolean executeHttpOperation(final int retryCount, | |
| } catch (UnknownHostException ex) { | ||
| String hostname = null; | ||
| hostname = httpOperation.getHost(); | ||
| failureReason = RetryReason.UNKNOWN_HOST.getAbbreviation(ex, null, null); | ||
| LOG.warn("Unknown host name: {}. Retrying to resolve the host name...", | ||
| hostname); | ||
| if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) { | ||
|
|
@@ -314,6 +323,8 @@ private boolean executeHttpOperation(final int retryCount, | |
| LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex); | ||
| } | ||
|
|
||
| failureReason = RetryReason.getEnum(ex, -1).getAbbreviation(ex, -1, ""); | ||
|
|
||
| if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) { | ||
| throw new InvalidAbfsRestOperationException(ex); | ||
| } | ||
|
|
@@ -326,6 +337,8 @@ private boolean executeHttpOperation(final int retryCount, | |
| LOG.debug("HttpRequest: {}: {}", operationType, httpOperation); | ||
|
|
||
| if (client.getRetryPolicy().shouldRetry(retryCount, httpOperation.getStatusCode())) { | ||
| int status = httpOperation.getStatusCode(); | ||
| failureReason = RetryReason.getEnum(null, status).getAbbreviation(null, status, httpOperation.getStorageErrorMessage()); | ||
| return false; | ||
| } | ||
|
|
||
|
|
@@ -334,6 +347,11 @@ private boolean executeHttpOperation(final int retryCount, | |
| return true; | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| AbfsHttpOperation getHttpOperation() throws IOException { | ||
|
||
| return new AbfsHttpOperation(url, method, requestHeaders); | ||
| } | ||
|
|
||
| /** | ||
| * Incrementing Abfs counters with a long value. | ||
| * | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| package org.apache.hadoop.fs.azurebfs.services; | ||
|
|
||
| import java.io.IOException; | ||
| import java.net.SocketException; | ||
| import java.util.ArrayList; | ||
| import java.util.List; | ||
|
|
||
| public enum RetryReason { | ||
steveloughran marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| CONNECTION_TIMEOUT(((exceptionCaptured, statusCode) -> { | ||
| return exceptionCaptured != null && "connect timed out".equalsIgnoreCase( | ||
| exceptionCaptured.getMessage()); | ||
| }), 2, "CT"), | ||
| READ_TIMEOUT(((exceptionCaptured, statusCode) -> { | ||
| return exceptionCaptured != null && "Read timed out".equalsIgnoreCase( | ||
| exceptionCaptured.getMessage()); | ||
| }), 2, "RT"), | ||
| UNKNOWN_HOST("UH"), | ||
| CONNECTION_RESET(((exceptionCaptured, statusCode) -> { | ||
| return exceptionCaptured != null && exceptionCaptured.getMessage() != null | ||
| && exceptionCaptured.getMessage().contains("Connection reset"); | ||
| }), 2, "CR"), | ||
| STATUS_5XX(((exceptionCaptured, statusCode) -> { | ||
| return statusCode / 100 == 5; | ||
| }), 0, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (statusCode == 503) { | ||
| //ref: https://github.com/apache/hadoop/pull/4564/files#diff-75a2f54df6618d4015c63812e6a9916ddfb475d246850edfd2a6f57e36805e79 | ||
| serverErrorMessage = serverErrorMessage.split(System.lineSeparator(), | ||
| 2)[0]; | ||
| if ("Ingress is over the account limit.".equalsIgnoreCase( | ||
| serverErrorMessage)) { | ||
| return "ING"; | ||
| } | ||
| if ("Egress is over the account limit.".equalsIgnoreCase( | ||
| serverErrorMessage)) { | ||
| return "EGR"; | ||
| } | ||
| if ("Operations per second is over the account limit.".equalsIgnoreCase( | ||
| serverErrorMessage)) { | ||
| return "OPR"; | ||
| } | ||
| return "503"; | ||
| } | ||
| return statusCode + ""; | ||
| })), | ||
| STATUS_4XX(((exceptionCaptured, statusCode) -> { | ||
| return statusCode / 100 == 4; | ||
| }), 0, ((ex, statusCode, serverErrorMessage) -> { | ||
| return statusCode + ""; | ||
| })), | ||
| UNKNOWN_SOCKET_EXCEPTION(((exceptionCaptured, statusCode) -> { | ||
| return exceptionCaptured instanceof SocketException; | ||
| }), 1, "SE"), | ||
| UNKNOWN_IO_EXCEPTION(((exceptionCaptured, statusCode) -> { | ||
| return exceptionCaptured instanceof IOException; | ||
| }), 0, "IOE"); | ||
|
|
||
| private RetryReasonCaptureMechanism mechanism = null; | ||
steveloughran marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| private RetryReasonAbbreviationCreator retryReasonAbbreviationCreator = null; | ||
|
|
||
| private int rank = 0; | ||
|
|
||
| private String abbreviation; | ||
|
|
||
| RetryReason(String abbreviation) { | ||
| this.abbreviation = abbreviation; | ||
| } | ||
|
|
||
| RetryReason(RetryReasonCaptureMechanism mechanism, | ||
| int rank, | ||
| String abbreviation) { | ||
| this.mechanism = mechanism; | ||
| this.rank = rank; | ||
| this.abbreviation = abbreviation; | ||
| } | ||
|
|
||
| RetryReason(RetryReasonCaptureMechanism mechanism, | ||
| int rank, | ||
| RetryReasonAbbreviationCreator abbreviationCreator) { | ||
| this.mechanism = mechanism; | ||
| this.rank = rank; | ||
| this.retryReasonAbbreviationCreator = abbreviationCreator; | ||
| } | ||
|
|
||
| public String getAbbreviation(Exception ex, | ||
| Integer statusCode, | ||
| String serverErrorMessage) { | ||
| if (abbreviation != null) { | ||
| return abbreviation; | ||
| } | ||
| if (retryReasonAbbreviationCreator != null) { | ||
| return retryReasonAbbreviationCreator.getAbbreviation(ex, statusCode, | ||
| serverErrorMessage); | ||
| } | ||
| return null; | ||
| } | ||
|
|
||
| private static List<RetryReason> retryReasonLSortedist; | ||
|
|
||
| private synchronized static void sortRetryReason() { | ||
| if (retryReasonLSortedist != null) { | ||
| return; | ||
| } | ||
| List<RetryReason> list = new ArrayList<>(); | ||
| for (RetryReason reason : values()) { | ||
| list.add(reason); | ||
| } | ||
| list.sort((c1, c2) -> { | ||
| return c1.rank - c2.rank; | ||
| }); | ||
| retryReasonLSortedist = list; | ||
| } | ||
|
|
||
| static RetryReason getEnum(Exception ex, Integer statusCode) { | ||
| RetryReason retryReasonResult = null; | ||
| if (retryReasonLSortedist == null) { | ||
| sortRetryReason(); | ||
| } | ||
| for (RetryReason retryReason : retryReasonLSortedist) { | ||
| if (retryReason.mechanism != null) { | ||
| if (retryReason.mechanism.canCapture(ex, statusCode)) { | ||
| retryReasonResult = retryReason; | ||
| } | ||
| } | ||
| } | ||
| return retryReasonResult; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| package org.apache.hadoop.fs.azurebfs.services; | ||
|
|
||
| public interface RetryReasonAbbreviationCreator { | ||
| String getAbbreviation(Exception ex, Integer statusCode, String serverErrorMessage); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| package org.apache.hadoop.fs.azurebfs.services; | ||
|
|
||
| interface RetryReasonCaptureMechanism { | ||
| boolean canCapture(Exception exceptionCaptured, Integer statusCode); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add a javadoc to explain. no need to set as null as that is the default and it actually speeds up object creation to not set it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Made the change.