-
Notifications
You must be signed in to change notification settings - Fork 9.2k
HADOOP-18606. Add reason in in x-ms-client-request-id on a retry API call. #5299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
e70429c
2ddd8c9
cc2deb2
eece65f
40adc3e
77c9d21
b5d176b
c980762
34fe029
7acee90
b7d7121
1c3f2ee
9201ab7
77eb790
32e69cb
7f77ead
90fce12
06af705
62179bc
270545d
de424e3
94810b8
dad7b61
ac5593e
77aaa01
c49ab44
e1d38a0
e37d2e2
8c4ac80
9799dba
75594b9
e3ba294
836a352
a2a9a62
e312898
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,7 @@ | |
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
|
|
||
| import org.apache.hadoop.classification.VisibleForTesting; | ||
| import org.apache.hadoop.fs.azurebfs.AbfsStatistic; | ||
| import org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants; | ||
| import org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException; | ||
|
|
@@ -73,6 +74,8 @@ public class AbfsRestOperation { | |
| private AbfsHttpOperation result; | ||
| private AbfsCounters abfsCounters; | ||
|
|
||
| private String failureReason = null; | ||
|
||
|
|
||
| /** | ||
| * Checks if there is non-null HTTP response. | ||
| * @return true if there is a non-null HTTP response from the ABFS call. | ||
|
|
@@ -208,7 +211,7 @@ public void execute(TracingContext tracingContext) | |
| private void completeExecute(TracingContext tracingContext) | ||
| throws AzureBlobFileSystemException { | ||
| // see if we have latency reports from the previous requests | ||
| String latencyHeader = this.client.getAbfsPerfTracker().getClientLatency(); | ||
| String latencyHeader = getClientLatency(); | ||
| if (latencyHeader != null && !latencyHeader.isEmpty()) { | ||
| AbfsHttpHeader httpHeader = | ||
| new AbfsHttpHeader(HttpHeaderConfigurations.X_MS_ABFS_CLIENT_LATENCY, latencyHeader); | ||
|
|
@@ -237,6 +240,11 @@ private void completeExecute(TracingContext tracingContext) | |
| LOG.trace("{} REST operation complete", operationType); | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| String getClientLatency() { | ||
| return this.client.getAbfsPerfTracker().getClientLatency(); | ||
|
||
| } | ||
|
|
||
| /** | ||
| * Executes a single HTTP operation to complete the REST operation. If it | ||
| * fails, there may be a retry. The retryCount is incremented with each | ||
|
|
@@ -248,9 +256,9 @@ private boolean executeHttpOperation(final int retryCount, | |
|
|
||
| try { | ||
| // initialize the HTTP request and open the connection | ||
| httpOperation = new AbfsHttpOperation(url, method, requestHeaders); | ||
| httpOperation = getHttpOperation(); | ||
| incrementCounter(AbfsStatistic.CONNECTIONS_MADE, 1); | ||
| tracingContext.constructHeader(httpOperation); | ||
| tracingContext.constructHeader(httpOperation, failureReason); | ||
|
|
||
| switch(client.getAuthType()) { | ||
| case Custom: | ||
|
|
@@ -303,6 +311,7 @@ private boolean executeHttpOperation(final int retryCount, | |
| } catch (UnknownHostException ex) { | ||
| String hostname = null; | ||
| hostname = httpOperation.getHost(); | ||
| failureReason = RetryReason.getAbbreviation(ex, null, null); | ||
| LOG.warn("Unknown host name: {}. Retrying to resolve the host name...", | ||
| hostname); | ||
| if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) { | ||
|
|
@@ -314,6 +323,8 @@ private boolean executeHttpOperation(final int retryCount, | |
| LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex); | ||
| } | ||
|
|
||
| failureReason = RetryReason.getAbbreviation(ex, -1, ""); | ||
|
|
||
| if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) { | ||
| throw new InvalidAbfsRestOperationException(ex); | ||
| } | ||
|
|
@@ -326,6 +337,8 @@ private boolean executeHttpOperation(final int retryCount, | |
| LOG.debug("HttpRequest: {}: {}", operationType, httpOperation); | ||
|
|
||
| if (client.getRetryPolicy().shouldRetry(retryCount, httpOperation.getStatusCode())) { | ||
| int status = httpOperation.getStatusCode(); | ||
| failureReason = RetryReason.getAbbreviation(null, status, httpOperation.getStorageErrorMessage()); | ||
| return false; | ||
| } | ||
|
|
||
|
|
@@ -334,6 +347,11 @@ private boolean executeHttpOperation(final int retryCount, | |
| return true; | ||
| } | ||
|
|
||
| @VisibleForTesting | ||
| AbfsHttpOperation getHttpOperation() throws IOException { | ||
|
||
| return new AbfsHttpOperation(url, method, requestHeaders); | ||
| } | ||
|
|
||
| /** | ||
| * Incrementing Abfs counters with a long value. | ||
| * | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,176 @@ | ||
| /** | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * <p> | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * <p> | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.fs.azurebfs.services; | ||
|
|
||
| import java.io.IOException; | ||
| import java.net.SocketException; | ||
| import java.net.UnknownHostException; | ||
| import java.util.ArrayList; | ||
| import java.util.List; | ||
|
|
||
| import static java.net.HttpURLConnection.HTTP_UNAVAILABLE; | ||
| import static org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.HTTP_STATUS_CATEGORY_QUOTIENT; | ||
|
|
||
| /** | ||
| * In case of retry, this enum would give the information on the reason for | ||
| * previous API call. | ||
| * */ | ||
|
||
| public enum RetryReason { | ||
steveloughran marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| CONNECTION_TIMEOUT(2, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (ex != null && "connect timed out".equalsIgnoreCase( | ||
| ex.getMessage())) { | ||
| return "CT"; | ||
| } | ||
| return null; | ||
| })), | ||
| READ_TIMEOUT(2, ((exceptionCaptured, statusCode, serverErrorMessage) -> { | ||
steveloughran marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| if (exceptionCaptured != null && "Read timed out".equalsIgnoreCase( | ||
| exceptionCaptured.getMessage())) { | ||
| return "RT"; | ||
| } | ||
| return null; | ||
| })), | ||
| UNKNOWN_HOST(2, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (ex instanceof UnknownHostException) { | ||
| return "UH"; | ||
| } | ||
| return null; | ||
| })), | ||
| CONNECTION_RESET(2, ((exceptionCaptured, statusCode, serverErrorMessage) -> { | ||
| if (exceptionCaptured != null && exceptionCaptured.getMessage() != null | ||
| && exceptionCaptured.getMessage().contains("Connection reset")) { | ||
|
||
| return "CR"; | ||
| } | ||
| return null; | ||
| })), | ||
| STATUS_5XX(0, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (statusCode == null || statusCode / HTTP_STATUS_CATEGORY_QUOTIENT != 5) { | ||
| return null; | ||
| } | ||
| if (statusCode == HTTP_UNAVAILABLE) { | ||
| //ref: https://github.com/apache/hadoop/pull/4564/files#diff-75a2f54df6618d4015c63812e6a9916ddfb475d246850edfd2a6f57e36805e79 | ||
| serverErrorMessage = serverErrorMessage.split(System.lineSeparator(), | ||
| 2)[0]; | ||
| if ("Ingress is over the account limit.".equalsIgnoreCase( | ||
| serverErrorMessage)) { | ||
| return "ING"; | ||
| } | ||
| if ("Egress is over the account limit.".equalsIgnoreCase( | ||
| serverErrorMessage)) { | ||
| return "EGR"; | ||
| } | ||
| if ("Operations per second is over the account limit.".equalsIgnoreCase( | ||
| serverErrorMessage)) { | ||
| return "OPR"; | ||
| } | ||
| return HTTP_UNAVAILABLE + ""; | ||
| } | ||
| return statusCode + ""; | ||
| })), | ||
| STATUS_4XX(0, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (statusCode == null || statusCode / HTTP_STATUS_CATEGORY_QUOTIENT != 4) { | ||
| return null; | ||
| } | ||
| return statusCode + ""; | ||
| })), | ||
| UNKNOWN_SOCKET_EXCEPTION(1, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (ex instanceof SocketException) { | ||
| return "SE"; | ||
| } | ||
| return null; | ||
| })), | ||
| UNKNOWN_IO_EXCEPTION(0, ((ex, statusCode, serverErrorMessage) -> { | ||
| if (ex instanceof IOException) { | ||
| return "IOE"; | ||
| } | ||
| return null; | ||
| })); | ||
|
|
||
| private RetryReasonAbbreviationCreator retryReasonAbbreviationCreator = null; | ||
|
|
||
| private int rank = 0; | ||
|
|
||
| /** | ||
| * Constructor to have rank and the implementation of {@link RetryReasonAbbreviationCreator}. | ||
| * @param rank rank of a given enum. For example SocketTimeoutException is | ||
| * subclass of IOException. Rank of SocketTimeoutException enum has to be | ||
| * more than that of IOException enum. | ||
| * @param abbreviationCreator The implementation of {@link RetryReasonAbbreviationCreator} | ||
| * which would give the information if a given enum can be mapped to an error or not. | ||
| * */ | ||
| RetryReason(int rank, | ||
| RetryReasonAbbreviationCreator abbreviationCreator) { | ||
| this.rank = rank; | ||
| this.retryReasonAbbreviationCreator = abbreviationCreator; | ||
| } | ||
|
|
||
| private static List<RetryReason> retryReasonSortedList; | ||
|
|
||
| /** | ||
| * Synchronized method to assign sorted list in {@link RetryReason#retryReasonSortedList}. | ||
| * Method would check if list is assigned or not. If yes, method would return. This is required | ||
| * because multiple threads could be waiting to get into this method, and once a thread is done | ||
| * with this method, other thread would get into this method. Since the list would be assigned by | ||
| * first thread, the second thread need not run the whole mechanism of sorting. | ||
| * The enums are sorted on the ascending order of their rank. | ||
| * */ | ||
| private static synchronized void sortRetryReason() { | ||
| if (retryReasonSortedList != null) { | ||
| return; | ||
| } | ||
| List<RetryReason> list = new ArrayList<>(); | ||
| for (RetryReason reason : values()) { | ||
| list.add(reason); | ||
| } | ||
| list.sort((c1, c2) -> { | ||
| return c1.rank - c2.rank; | ||
| }); | ||
| retryReasonSortedList = list; | ||
| } | ||
|
|
||
| /** | ||
| * Method to get correct abbreviation for a given set of exception, statusCode, | ||
| * storageStatusCode. | ||
| * Method would iterate through the {@link RetryReason#retryReasonSortedList}, | ||
| * and would return the abbreviation returned by highest enum to be applicable on the group. | ||
| * For example, if SocketTimeoutException(rank 2) and IOException(rank 0) can be | ||
| * applied on the group, the abbreviation of SocketTimeoutException has to be returned. | ||
| * | ||
| * @param ex exception caught during server communication. | ||
| * @param statusCode statusCode in the server response. | ||
| * @param storageErrorMessage storageErrorMessage in the server response. | ||
| * */ | ||
| static String getAbbreviation(Exception ex, | ||
| Integer statusCode, | ||
| String storageErrorMessage) { | ||
| String result = null; | ||
| if (retryReasonSortedList == null) { | ||
| sortRetryReason(); | ||
| } | ||
| for (RetryReason retryReason : retryReasonSortedList) { | ||
| String enumCapturedAndAbbreviate | ||
| = retryReason.retryReasonAbbreviationCreator.capturableAndGetAbbreviation( | ||
| ex, statusCode, storageErrorMessage); | ||
| if (enumCapturedAndAbbreviate != null) { | ||
| result = enumCapturedAndAbbreviate; | ||
| } | ||
| } | ||
| return result; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| /** | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * <p> | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * <p> | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.hadoop.fs.azurebfs.services; | ||
|
|
||
| /** | ||
| * Interface to be implemented by each enum in {@link RetryReason}. The methods | ||
| * of the interface define if the given enum can be applied on the given server | ||
| * response. | ||
| * */ | ||
| public interface RetryReasonAbbreviationCreator { | ||
|
|
||
| /** | ||
| * Returns an abbreviation if the {@link RetryReason} enum can be applied on | ||
| * the server response. | ||
| * @param ex exception captured in the server API call. | ||
| * @param statusCode statusCode on the server response | ||
| * @param serverErrorMessage serverErrorMessage on the server response. | ||
| * @return <ol><li>null if the enum can not be used on the server response</li> | ||
| * <li>abbreviation corresponding to the server response.</li></ol> | ||
| * */ | ||
| String capturableAndGetAbbreviation(Exception ex, | ||
| Integer statusCode, | ||
| String serverErrorMessage); | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.