// Patch summary: in non-INTERNAL builds, ClientRetryPolicy gains an addHubRegionProcessingOnlyHeader flag that is set on repeated 404/1002 for single-master accounts and, once set, stamps the ShouldProcessOnlyInHubRegion header in OnBeforeSendRequest; an emulator test and a unit test are added.
diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index c11c6abd7f..3e81c62d0d 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -38,7 +38,10 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private bool isMultiMasterWriteRequest; private Uri locationEndpoint; private RetryContext retryContext; - private DocumentServiceRequest documentServiceRequest; + private DocumentServiceRequest documentServiceRequest; +#if !INTERNAL + private volatile bool addHubRegionProcessingOnlyHeader; +#endif public ClientRetryPolicy( GlobalEndpointManager globalEndpointManager, @@ -222,8 +225,14 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) // set location-based routing directive based on request retry context request.RequestContext.RouteToLocation(this.retryContext.RetryLocationIndex, this.retryContext.RetryRequestOnPreferredLocations); } - } - + } +#if !INTERNAL + // If a previous attempt failed with 404/1002 and ShouldRetryInternalAsync set addHubRegionProcessingOnlyHeader (single-master accounts only), add the hub-region-processing-only header to this and all subsequent attempts; the flag is not cleared anywhere in the code shown in this patch + if (this.addHubRegionProcessingOnlyHeader) + { + request.Headers[HttpConstants.HttpHeaders.ShouldProcessOnlyInHubRegion] = bool.TrueString; + } +#endif // Resolve the endpoint for the request and pin the resolution to the resolved endpoint // This enables marking the endpoint unavailability on endpoint failover/unreachability this.locationEndpoint = this.isThinClientEnabled @@ -318,15 +327,22 @@ private async Task ShouldRetryInternalAsync( markBothReadAndWriteAsUnavailable: false, forceRefresh: false, retryOnPreferredLocations: true); - } - - if (statusCode == HttpStatusCode.NotFound - && subStatusCode == SubStatusCodes.ReadSessionNotAvailable) + } + + if (statusCode == HttpStatusCode.NotFound && subStatusCode == SubStatusCodes.ReadSessionNotAvailable) { +#if !INTERNAL + // Only set the hub region processing header for single master 
accounts + // Set the flag only when sessionTokenRetryCount is already >= 1 at this point, i.e. after the first retry attempt fails with 404/1002 (NOTE(review): the counter is updated outside this hunk - confirm it is still 0 on the first 404/1002) + if (!this.canUseMultipleWriteLocations && this.sessionTokenRetryCount >= 1) + { + this.addHubRegionProcessingOnlyHeader = true; + } +#endif return this.ShouldRetryOnSessionNotAvailable(this.documentServiceRequest); - } - - // Received 503 due to client connect timeout or Gateway + } + + // Received 503 due to client connect timeout or Gateway if (statusCode == HttpStatusCode.ServiceUnavailable) { return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs index 6882b42e8d..e3a0233461 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemTests.cs @@ -20,6 +20,7 @@ namespace Microsoft.Azure.Cosmos.SDK.EmulatorTests using System.Threading.Tasks; using Microsoft.Azure.Cosmos; using Microsoft.Azure.Cosmos.Diagnostics; + using Microsoft.Azure.Cosmos.Handlers; using Microsoft.Azure.Cosmos.Json; using Microsoft.Azure.Cosmos.Query.Core.ExecutionContext; using Microsoft.Azure.Cosmos.Query.Core.QueryClient; @@ -39,7 +40,8 @@ public class CosmosItemTests : BaseCosmosClientHelper { private Container Container = null; private ContainerProperties containerSettings = null; - + + private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only"; private static readonly string nonPartitionItemId = "fixed-Container-Item"; private static readonly string undefinedPartitionItemId = "undefined-partition-Item"; @@ -4315,7 +4317,112 @@ private static async Task GivenItemAsyncWhenMissingMemberHandlingIsErrorThenExpe JsonConvert.DefaultSettings = () => default; } - } + } + + [TestMethod] + [Owner("aavasthy")] + [Description("Forces two consecutive 404/1002 responses from the 
gateway and verifies ClientRetryPolicy sets the hub region header flag after the first retry fails.")] + public async Task ReadItemAsync_ShouldAddHubHeader_OnRetryAfter_404_1002() + { + int requestCount = 0; + int return404Count = 0; + const int maxReturn404 = 2; // Return 404/1002 twice + + // Created HTTP handler to intercept requests + HttpClientHandlerHelper httpHandler = new HttpClientHandlerHelper + { + RequestCallBack = (request, cancellationToken) => + { + // Track all document read requests + if (request.Method == HttpMethod.Get && + request.RequestUri != null && + request.RequestUri.AbsolutePath.Contains("/docs/")) + { + requestCount++; + + // Header should NOT be present on first retry (2nd request) + if (requestCount == 2 && + request.Headers.TryGetValues(HubRegionHeader, out IEnumerable firstRetryValues) && + firstRetryValues.Any()) + { + Assert.Fail("Header should NOT be present on first retry attempt."); + } + + // Return fake 404/1002 for first two requests + if (return404Count < maxReturn404) + { + return404Count++; + + var errorResponse = new + { + code = "NotFound", + message = "Message: {\"Errors\":[\"Resource Not Found. 
Learn more: https://aka.ms/cosmosdb-tsg-not-found\"]}\r\nActivityId: " + Guid.NewGuid() + ", Request URI: " + request.RequestUri, + additionalErrorInfo = "" + }; + + HttpResponseMessage notFoundResponse = new HttpResponseMessage(HttpStatusCode.NotFound) + { + Content = new StringContent( + JsonConvert.SerializeObject(errorResponse), + Encoding.UTF8, + "application/json" + ) + }; + + // Add the substatus header for ReadSessionNotAvailable + notFoundResponse.Headers.Add("x-ms-substatus", "1002"); + notFoundResponse.Headers.Add("x-ms-activity-id", Guid.NewGuid().ToString()); + notFoundResponse.Headers.Add("x-ms-request-charge", "1.0"); + + return Task.FromResult(notFoundResponse); + } + } + + return Task.FromResult(null); + } + }; + + CosmosClientOptions clientOptions = new CosmosClientOptions + { + ConnectionMode = ConnectionMode.Gateway, + ConsistencyLevel = Cosmos.ConsistencyLevel.Session, + HttpClientFactory = () => new HttpClient(httpHandler), + MaxRetryAttemptsOnRateLimitedRequests = 9, + MaxRetryWaitTimeOnRateLimitedRequests = TimeSpan.FromSeconds(30) + }; + + using CosmosClient customClient = TestCommon.CreateCosmosClient(clientOptions); + + Container customContainer = customClient.GetContainer(this.database.Id, this.Container.Id); + + // Create a test item first + ToDoActivity testItem = ToDoActivity.CreateRandomToDoActivity(); + await this.Container.CreateItemAsync(testItem, new Cosmos.PartitionKey(testItem.pk)); + + try + { + // This should trigger 404/1002 twice + // In single-region emulator, after first retry fails with 404/1002, it won't retry again + ItemResponse response = await customContainer.ReadItemAsync( + testItem.id, + new Cosmos.PartitionKey(testItem.pk)); + + Assert.Fail("Expected CosmosException due to consecutive 404/1002 failures."); + } + catch (CosmosException ex) + { + // Expected: After first retry fails with 404/1002, single master won't retry again + Assert.AreEqual(HttpStatusCode.NotFound, ex.StatusCode); + 
Assert.AreEqual((int)SubStatusCodes.ReadSessionNotAvailable, ex.SubStatusCode); + } + + // Verify the expected behavior: + // 1. Initial request (requestCount = 1) fails with 404/1002 + // 2. First retry (requestCount = 2) fails with 404/1002 + // 3. No more retries because single master + no additional regions (NOTE(review): no request in this test is ever asserted to carry the header - only its absence on request 2 is checked - so the header itself is not verified here) + Assert.AreEqual(2, requestCount, $"Expected exactly 2 requests (initial + 1 retry) for single-region emulator, but got {requestCount}"); + Assert.AreEqual(2, return404Count, "Both requests should have returned 404/1002"); + } private async Task AutoGenerateIdPatternTest(Cosmos.PartitionKey pk, T itemWithoutId) { diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index 26ad1e3b88..01beceec22 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -27,7 +27,8 @@ public sealed class ClientRetryPolicyTests { private static Uri Location1Endpoint = new Uri("https://location1.documents.azure.com"); private static Uri Location2Endpoint = new Uri("https://location2.documents.azure.com"); - + + private const string HubRegionHeader = "x-ms-cosmos-hub-region-processing-only"; private ReadOnlyCollection preferredLocations; private AccountProperties databaseAccount; private GlobalPartitionEndpointManager partitionKeyRangeLocationCache; @@ -400,6 +401,134 @@ public async Task ClientRetryPolicy_NoRetry_MultiMaster_Write_NoPreferredLocatio { await this.ValidateConnectTimeoutTriggersClientRetryPolicyAsync(isReadRequest: false, useMultipleWriteLocations: true, usesPreferredLocations: false, true); } + + /// <summary> + /// Test to validate that hub region header is added on 404/1002 for single master accounts only, + /// starting from the second retry (after first retry also fails). 
For multi-master accounts, + /// the header should NOT be added. + /// </summary> + [TestMethod] + [DataRow(true, true, DisplayName = "Read request on single master - Hub region header added after first retry fails")] + [DataRow(false, true, DisplayName = "Write request on single master - Hub region header added after first retry fails")] + [DataRow(true, false, DisplayName = "Read request on multi-master - Hub region header NOT added")] + [DataRow(false, false, DisplayName = "Write request on multi-master - Hub region header NOT added")] + public async Task ClientRetryPolicy_HubRegionHeader_AddedOn404_1002_BasedOnAccountType(bool isReadRequest, bool isSingleMaster) + { + // Arrange + const bool enableEndpointDiscovery = true; + + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: !isSingleMaster, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false, + enforceSingleMasterSingleWriteLocation: isSingleMaster); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery, + isThinClientEnabled: false); + + DocumentServiceRequest request = this.CreateRequest(isReadRequest: isReadRequest, isMasterResourceType: false); + + // First attempt - header should not exist + retryPolicy.OnBeforeSendRequest(request); + Assert.IsNull(request.Headers.GetValues(HubRegionHeader), "Header should not exist on initial request before any 404/1002 error."); + + // Simulate first 404/1002 error + DocumentClientException sessionNotAvailableException = new DocumentClientException( + message: "Simulated 404/1002 ReadSessionNotAvailable", + innerException: null, + statusCode: HttpStatusCode.NotFound, + substatusCode: SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + + ShouldRetryResult shouldRetry = await 
retryPolicy.ShouldRetryAsync(sessionNotAvailableException, CancellationToken.None); + Assert.IsTrue(shouldRetry.ShouldRetry, "Should retry on 404/1002."); + + // First retry attempt - header should NOT be present yet + retryPolicy.OnBeforeSendRequest(request); + string[] headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNull(headerValues, "Header should NOT be present on first retry attempt (before it fails)."); + + // Simulate first retry also failing with 404/1002 + DocumentClientException sessionNotAvailableException2 = new DocumentClientException( + message: "Simulated 404/1002 ReadSessionNotAvailable on first retry", + innerException: null, + statusCode: HttpStatusCode.NotFound, + substatusCode: SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + + shouldRetry = await retryPolicy.ShouldRetryAsync(sessionNotAvailableException2, CancellationToken.None); + + if (isSingleMaster) + { + // For single master, after one retry fails with 404/1002, it won't retry further + // But the header flag should be set for any potential future retries due to other errors + Assert.IsFalse(shouldRetry.ShouldRetry, "Single master should not retry again after first 404/1002 retry fails."); + + // The header flag should be set even though no more 404/1002 retries will happen + // This ensures if the request is retried for a different reason (e.g., 503), it will have the header + } + else + { + // Multi-master can retry across multiple regions + Assert.IsTrue(shouldRetry.ShouldRetry, "Multi-master should continue retrying on 404/1002."); + } + + // For single master: Verify header would be added if request is retried for other reasons (e.g., 503) + // For multi-master: Verify header is NOT added even on subsequent retries + if (isSingleMaster) + { + // Simulate a 503 error to trigger another retry + DocumentClientException serviceUnavailableException = new 
DocumentClientException( + message: "Simulated 503 ServiceUnavailable", + innerException: null, + statusCode: HttpStatusCode.ServiceUnavailable, + substatusCode: SubStatusCodes.Unknown, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + + shouldRetry = await retryPolicy.ShouldRetryAsync(serviceUnavailableException, CancellationToken.None); + + if (shouldRetry.ShouldRetry) + { + // Now verify the header is present on this retry triggered by 503 (NOTE(review): these assertions only run when the 503 is retried; if ShouldRetry is false the single-master case asserts nothing about the header) + retryPolicy.OnBeforeSendRequest(request); + headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNotNull(headerValues, "Header should be present on retry after 404/1002 flag was set."); + Assert.AreEqual(1, headerValues.Length, "Header should have exactly one value."); + Assert.AreEqual(bool.TrueString, headerValues[0], "Header value should be 'True'."); + } + } + else + { + // For multi-master: Verify header is NOT added even on subsequent retries + for (int retryAttempt = 2; retryAttempt <= 3; retryAttempt++) + { + if (shouldRetry.ShouldRetry) + { + retryPolicy.OnBeforeSendRequest(request); + headerValues = request.Headers.GetValues(HubRegionHeader); + Assert.IsNull(headerValues, $"Header should NOT be present on retry attempt {retryAttempt} for multi-master account."); + + // Simulate another 404/1002 or 503 to continue retry loop + DocumentClientException nextException = new DocumentClientException( + message: $"Simulated error on retry {retryAttempt}", + innerException: null, + statusCode: retryAttempt % 2 == 0 ? HttpStatusCode.ServiceUnavailable : HttpStatusCode.NotFound, + substatusCode: retryAttempt % 2 == 0 ? 
SubStatusCodes.Unknown : SubStatusCodes.ReadSessionNotAvailable, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + + shouldRetry = await retryPolicy.ShouldRetryAsync(nextException, CancellationToken.None); + } + } + } + } private async Task ValidateConnectTimeoutTriggersClientRetryPolicyAsync( bool isReadRequest,