Merged. The changes shown below are from 8 of the pull request's 35 commits.

Commits:
e70429c retryReason foundations. (saxenapranav, Jan 12, 2023)
2ddd8c9 send failureReasons for adding in x-ms-client-request-id (saxenapranav, Jan 12, 2023)
cc2deb2 added heuristics in RetryReason (saxenapranav, Jan 13, 2023)
eece65f fixed test for connect timed out (saxenapranav, Jan 13, 2023)
40adc3e testabfsrestOperation: to add 5xx tests (saxenapranav, Jan 13, 2023)
77c9d21 test added (saxenapranav, Jan 13, 2023)
b5d176b test: testClientRequestIdForConnectAndReadTimeoutRetry (saxenapranav, Jan 13, 2023)
c980762 small refactors (saxenapranav, Jan 13, 2023)
34fe029 removed non-required capture interface (saxenapranav, Jan 14, 2023)
7acee90 ASF license; javadocs on the interface. (saxenapranav, Jan 14, 2023)
b7d7121 minimize TestAbfsRestOperation class with added methods for common te… (saxenapranav, Jan 14, 2023)
1c3f2ee added javadocs in RetryReason (saxenapranav, Jan 14, 2023)
9201ab7 removal of non-required new-lines (saxenapranav, Jan 14, 2023)
77eb790 small refactors (saxenapranav, Jan 14, 2023)
32e69cb checkstyle: magic number (saxenapranav, Jan 17, 2023)
7f77ead review refactors (saxenapranav, Jan 18, 2023)
90fce12 review WIP (saxenapranav, Feb 22, 2023)
06af705 removing enums and having classes with implementation of RetryReasonA… (saxenapranav, Feb 22, 2023)
62179bc method change in RetryReasonAbbreviationCreator (saxenapranav, Feb 22, 2023)
270545d asf license (saxenapranav, Feb 22, 2023)
de424e3 javadocs + small refactors (saxenapranav, Feb 22, 2023)
94810b8 fixed access modifier, only public method in abstract class is captur… (saxenapranav, Feb 23, 2023)
dad7b61 javadoc for RetryReason; package name change (saxenapranav, Feb 23, 2023)
ac5593e RetryReasonConstants (saxenapranav, Feb 23, 2023)
77aaa01 asf license in RetryReasonConstants (saxenapranav, Feb 23, 2023)
c49ab44 constants in TestAbfsRestOperationMockFailures (saxenapranav, Feb 23, 2023)
e1d38a0 TestRetryReason (saxenapranav, Feb 23, 2023)
e37d2e2 javadocs wip (saxenapranav, Feb 23, 2023)
8c4ac80 javadocs on retry-categories (saxenapranav, Feb 23, 2023)
9799dba javadoc fix in the RetryReasonCategory (saxenapranav, Feb 27, 2023)
75594b9 Merge pull request #4 from saxenapranav/retry_reason_review2 (saxenapranav, Feb 27, 2023)
e3ba294 checkstyle issue (saxenapranav, Feb 27, 2023)
836a352 canCapture and getAbbreviation to be package-protected and not public (saxenapranav, Mar 1, 2023)
a2a9a62 review comments (saxenapranav, Mar 6, 2023)
e312898 Merge branch 'trunk' into retry_reason (saxenapranav, Mar 6, 2023)
File: AbfsRestOperation.java
@@ -28,6 +28,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.classification.VisibleForTesting;
import org.apache.hadoop.fs.azurebfs.AbfsStatistic;
import org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants;
import org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
@@ -73,6 +74,8 @@ public class AbfsRestOperation {
private AbfsHttpOperation result;
private AbfsCounters abfsCounters;

private String failureReason = null;
Contributor:
add a javadoc to explain. no need to set as null as that is the default and it actually speeds up object creation to not set it

Contributor (author):
Made the change.
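For illustration, the field could end up looking something like this after the comment is addressed (the javadoc wording here is an assumption, not the PR's final text):

```java
/**
 * Abbreviation of the failure that caused the most recent retry of this
 * operation; appended to x-ms-client-request-id on the next attempt.
 */
private String failureReason;
```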

/**
* Checks if there is non-null HTTP response.
* @return true if there is a non-null HTTP response from the ABFS call.
@@ -208,7 +211,7 @@ public void execute(TracingContext tracingContext)
private void completeExecute(TracingContext tracingContext)
throws AzureBlobFileSystemException {
// see if we have latency reports from the previous requests
String latencyHeader = this.client.getAbfsPerfTracker().getClientLatency();
String latencyHeader = getClientLatency();
if (latencyHeader != null && !latencyHeader.isEmpty()) {
AbfsHttpHeader httpHeader =
new AbfsHttpHeader(HttpHeaderConfigurations.X_MS_ABFS_CLIENT_LATENCY, latencyHeader);
@@ -237,6 +240,11 @@ private void completeExecute(TracingContext tracingContext)
LOG.trace("{} REST operation complete", operationType);
}

@VisibleForTesting
String getClientLatency() {
return this.client.getAbfsPerfTracker().getClientLatency();
Contributor:
remove the this.. I know, it was there before, but this is time to clean up.

Contributor (author):
Made the change.
}
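A minimal sketch of the helper after the cleanup suggested above (assuming no other changes to the method):

```java
@VisibleForTesting
String getClientLatency() {
  // Delegate to the client's perf tracker; no explicit "this." needed.
  return client.getAbfsPerfTracker().getClientLatency();
}
```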

/**
* Executes a single HTTP operation to complete the REST operation. If it
* fails, there may be a retry. The retryCount is incremented with each
@@ -248,9 +256,9 @@ private boolean executeHttpOperation(final int retryCount,

try {
// initialize the HTTP request and open the connection
httpOperation = new AbfsHttpOperation(url, method, requestHeaders);
httpOperation = getHttpOperation();
incrementCounter(AbfsStatistic.CONNECTIONS_MADE, 1);
tracingContext.constructHeader(httpOperation);
tracingContext.constructHeader(httpOperation, failureReason);

switch(client.getAuthType()) {
case Custom:
@@ -303,6 +311,7 @@ private boolean executeHttpOperation(final int retryCount,
} catch (UnknownHostException ex) {
String hostname = null;
hostname = httpOperation.getHost();
failureReason = RetryReason.UNKNOWN_HOST.getAbbreviation(ex, null, null);
LOG.warn("Unknown host name: {}. Retrying to resolve the host name...",
hostname);
if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) {
@@ -314,6 +323,8 @@
LOG.debug("HttpRequestFailure: {}, {}", httpOperation, ex);
}

failureReason = RetryReason.getEnum(ex, -1).getAbbreviation(ex, -1, "");

if (!client.getRetryPolicy().shouldRetry(retryCount, -1)) {
throw new InvalidAbfsRestOperationException(ex);
}
@@ -326,6 +337,8 @@
LOG.debug("HttpRequest: {}: {}", operationType, httpOperation);

if (client.getRetryPolicy().shouldRetry(retryCount, httpOperation.getStatusCode())) {
int status = httpOperation.getStatusCode();
failureReason = RetryReason.getEnum(null, status).getAbbreviation(null, status, httpOperation.getStorageErrorMessage());
return false;
}

@@ -334,6 +347,11 @@
return true;
}

@VisibleForTesting
AbfsHttpOperation getHttpOperation() throws IOException {
Contributor:
as this is more than just a getter, can you

  • call it createHttpOperation()
  • add a javadoc description

Contributor (author):
Made the change.
return new AbfsHttpOperation(url, method, requestHeaders);
}
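A sketch of what the renamed factory method might look like after this review comment (the javadoc wording is an assumption):

```java
/**
 * Creates a new AbfsHttpOperation for the operation's URL, HTTP method and
 * request headers. Kept as a separate method so tests can override or mock it.
 * @return a new AbfsHttpOperation wrapping the request to be executed
 * @throws IOException if the underlying HTTP connection cannot be set up
 */
@VisibleForTesting
AbfsHttpOperation createHttpOperation() throws IOException {
  return new AbfsHttpOperation(url, method, requestHeaders);
}
```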

/**
* Incrementing Abfs counters with a long value.
*
File: RetryReason.java (new)
@@ -0,0 +1,128 @@
package org.apache.hadoop.fs.azurebfs.services;

import java.io.IOException;
import java.net.SocketException;
import java.util.ArrayList;
import java.util.List;

public enum RetryReason {
CONNECTION_TIMEOUT(((exceptionCaptured, statusCode) -> {
return exceptionCaptured != null && "connect timed out".equalsIgnoreCase(
exceptionCaptured.getMessage());
}), 2, "CT"),
READ_TIMEOUT(((exceptionCaptured, statusCode) -> {
return exceptionCaptured != null && "Read timed out".equalsIgnoreCase(
exceptionCaptured.getMessage());
}), 2, "RT"),
UNKNOWN_HOST("UH"),
CONNECTION_RESET(((exceptionCaptured, statusCode) -> {
return exceptionCaptured != null && exceptionCaptured.getMessage() != null
&& exceptionCaptured.getMessage().contains("Connection reset");
}), 2, "CR"),
STATUS_5XX(((exceptionCaptured, statusCode) -> {
return statusCode / 100 == 5;
}), 0, ((ex, statusCode, serverErrorMessage) -> {
if (statusCode == 503) {
//ref: https://github.com/apache/hadoop/pull/4564/files#diff-75a2f54df6618d4015c63812e6a9916ddfb475d246850edfd2a6f57e36805e79
serverErrorMessage = serverErrorMessage.split(System.lineSeparator(),
2)[0];
if ("Ingress is over the account limit.".equalsIgnoreCase(
serverErrorMessage)) {
return "ING";
}
if ("Egress is over the account limit.".equalsIgnoreCase(
serverErrorMessage)) {
return "EGR";
}
if ("Operations per second is over the account limit.".equalsIgnoreCase(
serverErrorMessage)) {
return "OPR";
}
return "503";
}
return statusCode + "";
})),
STATUS_4XX(((exceptionCaptured, statusCode) -> {
return statusCode / 100 == 4;
}), 0, ((ex, statusCode, serverErrorMessage) -> {
return statusCode + "";
})),
UNKNOWN_SOCKET_EXCEPTION(((exceptionCaptured, statusCode) -> {
return exceptionCaptured instanceof SocketException;
}), 1, "SE"),
UNKNOWN_IO_EXCEPTION(((exceptionCaptured, statusCode) -> {
return exceptionCaptured instanceof IOException;
}), 0, "IOE");

private RetryReasonCaptureMechanism mechanism = null;

private RetryReasonAbbreviationCreator retryReasonAbbreviationCreator = null;

private int rank = 0;

private String abbreviation;

RetryReason(String abbreviation) {
this.abbreviation = abbreviation;
}

RetryReason(RetryReasonCaptureMechanism mechanism,
int rank,
String abbreviation) {
this.mechanism = mechanism;
this.rank = rank;
this.abbreviation = abbreviation;
}

RetryReason(RetryReasonCaptureMechanism mechanism,
int rank,
RetryReasonAbbreviationCreator abbreviationCreator) {
this.mechanism = mechanism;
this.rank = rank;
this.retryReasonAbbreviationCreator = abbreviationCreator;
}

public String getAbbreviation(Exception ex,
Integer statusCode,
String serverErrorMessage) {
if (abbreviation != null) {
return abbreviation;
}
if (retryReasonAbbreviationCreator != null) {
return retryReasonAbbreviationCreator.getAbbreviation(ex, statusCode,
serverErrorMessage);
}
return null;
}

private static List<RetryReason> retryReasonLSortedist;

private synchronized static void sortRetryReason() {
if (retryReasonLSortedist != null) {
return;
}
List<RetryReason> list = new ArrayList<>();
for (RetryReason reason : values()) {
list.add(reason);
}
list.sort((c1, c2) -> {
return c1.rank - c2.rank;
});
retryReasonLSortedist = list;
}

static RetryReason getEnum(Exception ex, Integer statusCode) {
RetryReason retryReasonResult = null;
if (retryReasonLSortedist == null) {
sortRetryReason();
}
for (RetryReason retryReason : retryReasonLSortedist) {
if (retryReason.mechanism != null) {
if (retryReason.mechanism.canCapture(ex, statusCode)) {
retryReasonResult = retryReason;
}
}
}
return retryReasonResult;
}
}
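A short usage sketch, not part of the PR, showing how a caller in the same package resolves the abbreviation that ends up in the retry header (the class name and the values are illustrative):

```java
package org.apache.hadoop.fs.azurebfs.services;

import java.net.SocketTimeoutException;

public class RetryReasonExample {
  public static void main(String[] args) {
    // A read timeout matches both UNKNOWN_IO_EXCEPTION (rank 0) and
    // READ_TIMEOUT (rank 2); the highest-ranked match wins, so this prints "RT".
    Exception readTimeout = new SocketTimeoutException("Read timed out");
    System.out.println(
        RetryReason.getEnum(readTimeout, -1).getAbbreviation(readTimeout, -1, ""));

    // A throttled 503 is narrowed further using the storage error message,
    // so this prints "ING".
    System.out.println(RetryReason.getEnum(null, 503)
        .getAbbreviation(null, 503, "Ingress is over the account limit."));
  }
}
```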
File: RetryReasonAbbreviationCreator.java (new)
@@ -0,0 +1,5 @@
package org.apache.hadoop.fs.azurebfs.services;

public interface RetryReasonAbbreviationCreator {
String getAbbreviation(Exception ex, Integer statusCode, String serverErrorMessage);
}
File: RetryReasonCaptureMechanism.java (new)
@@ -0,0 +1,5 @@
package org.apache.hadoop.fs.azurebfs.services;

interface RetryReasonCaptureMechanism {
boolean canCapture(Exception exceptionCaptured, Integer statusCode);
}
File: TracingContext.java
@@ -18,6 +18,7 @@

package org.apache.hadoop.fs.azurebfs.utils;

import java.util.List;
import java.util.UUID;

import org.slf4j.Logger;
@@ -67,6 +68,9 @@ public class TracingContext {
public static final int MAX_CLIENT_CORRELATION_ID_LENGTH = 72;
public static final String CLIENT_CORRELATION_ID_PATTERN = "[a-zA-Z0-9-]*";

//x-ms-client-request-id can have maximum 1KB string
private static final Integer MAX_CLIENT_REQUEST_ID = 1024;

/**
* Initialize TracingContext
* @param clientCorrelationID Provided over config by client
@@ -152,15 +156,18 @@ public void setListener(Listener listener) {
* X_MS_CLIENT_REQUEST_ID header of the http operation
* @param httpOperation AbfsHttpOperation instance to set header into
* connection
* @param previousFailure List of failures seen before this API trigger on
* same operation from AbfsClient.
*/
public void constructHeader(AbfsHttpOperation httpOperation) {
public void constructHeader(AbfsHttpOperation httpOperation, String previousFailure) {
clientRequestId = UUID.randomUUID().toString();
switch (format) {
case ALL_ID_FORMAT: // Optional IDs (e.g. streamId) may be empty
header =
clientCorrelationID + ":" + clientRequestId + ":" + fileSystemID + ":"
+ primaryRequestId + ":" + streamID + ":" + opType + ":"
+ retryCount;
header = addFailureReasons(header, previousFailure);
break;
case TWO_ID_FORMAT:
header = clientCorrelationID + ":" + clientRequestId;
@@ -174,6 +181,14 @@ public void constructHeader(AbfsHttpOperation httpOperation) {
httpOperation.setRequestProperty(HttpHeaderConfigurations.X_MS_CLIENT_REQUEST_ID, header);
}

private String addFailureReasons(final String header,
final String previousFailure) {
if(previousFailure == null) {
return header;
}
return String.format("%s_%s", header, previousFailure);
}
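For illustration, a minimal sketch of what addFailureReasons does to the client request id when the previous attempt hit a connect timeout (all identifier values below are made up):

```java
public class ClientRequestIdExample {
  public static void main(String[] args) {
    // Illustrative header in ALL_ID_FORMAT; the real values come from TracingContext fields.
    String header = "corr-id:client-req-uuid:fs-id:primary-req-id:stream-id:OP:1";
    String previousFailure = "CT"; // previous attempt failed with "connect timed out"

    // Mirrors addFailureReasons(header, previousFailure).
    String withReason = (previousFailure == null)
        ? header
        : String.format("%s_%s", header, previousFailure);

    System.out.println(withReason);
    // corr-id:client-req-uuid:fs-id:primary-req-id:stream-id:OP:1_CT
  }
}
```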

/**
* Return header representing the request associated with the tracingContext
* @return Header string set into X_MS_CLIENT_REQUEST_ID