Skip to content

Commit 46974e9

Browse files
authored
Fix IP address being labelled "bad" for too long (#718)
**Issue:** A server went offline for a while to do some updates. But when it came back online later, it wasn't getting any traffic. The expectation is that traffic to this server would eventually resume. **Diagnosis:** The default host resolver has a "bad" list of addresses (connect attempt failed). The idea is: don't use these addresses unless there are NO known good addresses. Due to quirks of the code, addresses in the "bad' list would not be removed when their TTL expired. Maybe this was an accident? Maybe the author just wanted to avoid some looping? In any case, this could result in the address being labeled "bad" FOREVER. Even if the server and DNS did the right thing: not reporting the address while the server was offline but reporting it again later when everything was good, the aws_host_resolver cache would still have the address in its "bad" list. The only way to get off the bad list would be if ALL good addresses failed, and the resolver was forced to pull from the bad list. But we need to handle a reasonable world, where an address might randomly go bad for a while and get better later. **Description of changes:** - Remove addresses in the bad list when their TTL expires - TTL is currently [30sec by default](https://github.com/awslabs/aws-c-io/blob/9c7c52cb0d61b6644e6fb1973fcd680d62435ac7/source/host_resolver.c#L23-L24), [5min for aws-c-s3](https://github.com/awslabs/aws-c-s3/blob/169842b7e2f81d71d0719d4a77f9c3e186512f99/source/s3_client.c#L81-L82) (these may change in the future) - If an address is in the "bad" list, but DNS is still reporting it, don't update its TTL. So now, if there's a failed connection, the address will remain on the "bad" list until its TTL expires, even if that address appears in future DNS queries. Then, the address will be removed from all caches. If a future DNS query returns that address again, it will go into the "good" list.
1 parent 2efcac7 commit 46974e9

File tree

3 files changed

+216
-25
lines changed

3 files changed

+216
-25
lines changed

source/host_resolver.c

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -520,16 +520,22 @@ static inline void process_records(
520520
AWS_LOGF_TRACE(AWS_LS_IO_DNS, "static: remaining record count for host %d", (int)record_count);
521521

522522
/* if we don't have any known good addresses, take the least recently used, but not expired address with a history
523-
* of spotty behavior and upgrade it for reuse. If it's expired, leave it and let the resolve fail. Better to fail
524-
* than accidentally give a kids' app an IP address to somebody's adult website when the IP address gets rebound to
525-
* a different endpoint. The moral of the story here is to not disable SSL verification! */
526-
if (!record_count) {
527-
size_t failed_count = aws_cache_get_element_count(failed_records);
528-
for (size_t index = 0; index < failed_count; ++index) {
529-
struct aws_host_address_cache_entry *lru_element_entry = aws_lru_cache_use_lru_element(failed_records);
530-
if (timestamp >= lru_element_entry->address.expiry) {
531-
continue;
532-
}
523+
* of spotty behavior and upgrade it for reuse. */
524+
bool should_promote = (record_count == 0);
525+
size_t failed_count = aws_cache_get_element_count(failed_records);
526+
for (size_t index = 0; index < failed_count; ++index) {
527+
struct aws_host_address_cache_entry *lru_element_entry = aws_lru_cache_use_lru_element(failed_records);
528+
529+
/* remove expired entries */
530+
if (timestamp >= lru_element_entry->address.expiry) {
531+
AWS_LOGF_DEBUG(
532+
AWS_LS_IO_DNS,
533+
"static: purging expired bad record %s for %s",
534+
lru_element_entry->address.address->bytes,
535+
lru_element_entry->address.host->bytes);
536+
aws_cache_remove(failed_records, lru_element_entry->address.address);
537+
538+
} else if (should_promote) {
533539

534540
struct aws_host_address_cache_entry *to_add =
535541
aws_mem_calloc(host_entry->allocator, 1, sizeof(struct aws_host_address_cache_entry));
@@ -558,7 +564,7 @@ static inline void process_records(
558564
aws_cache_remove(failed_records, lru_element_entry->address.address);
559565

560566
/* we only want to promote one per process run.*/
561-
break;
567+
should_promote = false;
562568
}
563569
}
564570
}
@@ -703,24 +709,33 @@ static int resolver_record_connection_failure(
703709
* A bunch of convenience functions for the host resolver background thread function
704710
*/
705711

706-
static struct aws_host_address_cache_entry *s_find_cached_address_entry_aux(
712+
struct aws_host_address_cache_entry_lookup {
713+
struct aws_host_address_cache_entry *entry; /* if lookup succeeds, this is non-NULL */
714+
bool is_fallback;
715+
};
716+
717+
static struct aws_host_address_cache_entry_lookup s_find_cached_address_entry_aux(
707718
struct aws_cache *primary_records,
708719
struct aws_cache *fallback_records,
709720
const struct aws_string *address) {
710721

711722
struct aws_host_address_cache_entry *found = NULL;
723+
bool is_fallback = false;
712724
aws_cache_find(primary_records, address, (void **)&found);
713725
if (found == NULL) {
714726
aws_cache_find(fallback_records, address, (void **)&found);
727+
if (found != NULL) {
728+
is_fallback = true;
729+
}
715730
}
716731

717-
return found;
732+
return (struct aws_host_address_cache_entry_lookup){.entry = found, .is_fallback = is_fallback};
718733
}
719734

720735
/*
721736
* Looks in both the good and failed connection record sets for a given host record
722737
*/
723-
static struct aws_host_address_cache_entry *s_find_cached_address_entry(
738+
static struct aws_host_address_cache_entry_lookup s_find_cached_address_entry(
724739
struct host_entry *entry,
725740
const struct aws_string *address,
726741
enum aws_address_record_type record_type) {
@@ -733,7 +748,7 @@ static struct aws_host_address_cache_entry *s_find_cached_address_entry(
733748
return s_find_cached_address_entry_aux(entry->a_records, entry->failed_connection_a_records, address);
734749

735750
default:
736-
return NULL;
751+
return (struct aws_host_address_cache_entry_lookup){.entry = NULL};
737752
}
738753
}
739754

@@ -779,19 +794,30 @@ static void s_update_address_cache(
779794
struct aws_host_address *fresh_resolved_address = NULL;
780795
aws_array_list_get_at_ptr(address_list, (void **)&fresh_resolved_address, i);
781796

782-
struct aws_host_address_cache_entry *address_to_cache_entry = s_find_cached_address_entry(
797+
struct aws_host_address_cache_entry_lookup cache_lookup = s_find_cached_address_entry(
783798
host_entry, fresh_resolved_address->address, fresh_resolved_address->record_type);
784799

785-
if (address_to_cache_entry) {
786-
address_to_cache_entry->address.expiry = new_expiration;
787-
AWS_LOGF_TRACE(
788-
AWS_LS_IO_DNS,
789-
"static: updating expiry for %s for host %s to %llu",
790-
address_to_cache_entry->address.address->bytes,
791-
host_entry->host_name->bytes,
792-
(unsigned long long)new_expiration);
800+
if (cache_lookup.entry) {
801+
struct aws_host_address_cache_entry *address_to_cache_entry = cache_lookup.entry;
802+
if (cache_lookup.is_fallback) {
803+
AWS_LOGF_TRACE(
804+
AWS_LS_IO_DNS,
805+
"static: NOT updating expiry for %s for host %s because it's in bad list."
806+
" Keeping old expiry of %" PRIu64,
807+
aws_string_c_str(address_to_cache_entry->address.address),
808+
aws_string_c_str(host_entry->host_name),
809+
address_to_cache_entry->address.expiry);
810+
} else {
811+
address_to_cache_entry->address.expiry = new_expiration;
812+
AWS_LOGF_TRACE(
813+
AWS_LS_IO_DNS,
814+
"static: updating expiry for %s for host %s to %" PRIu64,
815+
aws_string_c_str(address_to_cache_entry->address.address),
816+
aws_string_c_str(host_entry->host_name),
817+
new_expiration);
818+
}
793819
} else {
794-
address_to_cache_entry =
820+
struct aws_host_address_cache_entry *address_to_cache_entry =
795821
aws_mem_calloc(host_entry->allocator, 1, sizeof(struct aws_host_address_cache_entry));
796822

797823
aws_host_address_move(fresh_resolved_address, &address_to_cache_entry->address);

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ add_net_test_case(test_default_with_ipv4_only_lookup)
113113
add_test_case(test_resolver_ttls)
114114
add_test_case(test_resolver_connect_failure_recording)
115115
add_test_case(test_resolver_ttl_refreshes_on_resolve)
116+
add_test_case(test_resolver_bad_list_expires_eventually)
116117
add_test_case(test_resolver_low_frequency_starvation)
117118

118119
add_test_case(test_pem_single_cert_parse)

tests/default_host_resolver_test.c

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,170 @@ static int s_test_resolver_ttl_refreshes_on_resolve_fn(struct aws_allocator *all
10601060

10611061
AWS_TEST_CASE(test_resolver_ttl_refreshes_on_resolve, s_test_resolver_ttl_refreshes_on_resolve_fn)
10621062

1063+
/**
1064+
* Test that a failed address isn't stuck on the bad list for eternity.
1065+
*
1066+
* This is a regression test for a real world use case:
1067+
* A server went offline for a while to do some updates.
1068+
* But when it came back online later, it wasn't getting any traffic.
1069+
* The expectation is that traffic to this server would eventually resume.
1070+
*
1071+
* The cause was: the default host resolver didn't purge addresses
1072+
* from the bad list after their TTL expired. Once an address got on the bad list,
1073+
* it was nearly impossible to ever get off it.
1074+
*/
1075+
static int s_test_resolver_bad_list_expires_eventually_fn(struct aws_allocator *allocator, void *ctx) {
1076+
(void)ctx;
1077+
1078+
aws_io_library_init(allocator);
1079+
1080+
struct aws_event_loop_group_options elg_options = {
1081+
.loop_count = 1,
1082+
};
1083+
struct aws_event_loop_group *el_group = aws_event_loop_group_new(allocator, &elg_options);
1084+
1085+
struct aws_host_resolver_default_options resolver_options = {
1086+
.el_group = el_group,
1087+
.max_entries = 10,
1088+
};
1089+
struct aws_host_resolver *resolver = aws_host_resolver_new_default(allocator, &resolver_options);
1090+
1091+
const struct aws_string *host_name = aws_string_new_from_c_str(allocator, "host_address");
1092+
1093+
/*
1094+
* Set up mock DNS with 2 addresses.
1095+
* We'll tell the host resolver that a connection to addr1 failed.
1096+
* (don't report both as failed, or bad addresses will move back to the
1097+
* good list through mechanisms we're testing elsewhere)
1098+
* The mock DNS will continually report those same 2 addresses.
1099+
* Assert that, at some point in the future, the host resolver yields addr1 again.
1100+
*/
1101+
1102+
const struct aws_string *addr1_ipv4 = aws_string_new_from_c_str(allocator, "address1ipv4");
1103+
const struct aws_string *addr2_ipv4 = aws_string_new_from_c_str(allocator, "address2ipv4");
1104+
1105+
struct mock_dns_resolver mock_resolver;
1106+
ASSERT_SUCCESS(mock_dns_resolver_init(&mock_resolver, 1000, allocator));
1107+
1108+
struct aws_host_resolution_config config = {
1109+
/* short TTL so bad entries expire and get removed without this test taking too long */
1110+
.max_ttl = 2 /*sec*/,
1111+
.impl = mock_dns_resolve,
1112+
.impl_data = &mock_resolver,
1113+
};
1114+
1115+
struct aws_host_address host_address_1_ipv4 = {
1116+
.address = addr1_ipv4,
1117+
.allocator = allocator,
1118+
.host = aws_string_new_from_c_str(allocator, "host_address"),
1119+
.record_type = AWS_ADDRESS_RECORD_TYPE_A,
1120+
};
1121+
1122+
struct aws_host_address host_address_2_ipv4 = {
1123+
.address = addr2_ipv4,
1124+
.allocator = allocator,
1125+
.host = aws_string_new_from_c_str(allocator, "host_address"),
1126+
.record_type = AWS_ADDRESS_RECORD_TYPE_A,
1127+
};
1128+
1129+
struct aws_array_list address_list_1;
1130+
ASSERT_SUCCESS(aws_array_list_init_dynamic(&address_list_1, allocator, 2, sizeof(struct aws_host_address)));
1131+
ASSERT_SUCCESS(aws_array_list_push_back(&address_list_1, &host_address_1_ipv4));
1132+
ASSERT_SUCCESS(aws_array_list_push_back(&address_list_1, &host_address_2_ipv4));
1133+
1134+
ASSERT_SUCCESS(mock_dns_resolver_append_address_list(&mock_resolver, &address_list_1));
1135+
1136+
struct aws_mutex mutex = AWS_MUTEX_INIT;
1137+
struct default_host_callback_data callback_data = {
1138+
.condition_variable = AWS_CONDITION_VARIABLE_INIT,
1139+
.mutex = &mutex,
1140+
};
1141+
1142+
/*
1143+
* Do first lookup, we expect to get addr1.
1144+
*/
1145+
1146+
ASSERT_SUCCESS(aws_host_resolver_resolve_host(
1147+
resolver, host_name, s_default_host_resolved_test_callback, &config, &callback_data));
1148+
1149+
/* BEGIN CRITICAL SECTION */
1150+
ASSERT_SUCCESS(aws_mutex_lock(&mutex));
1151+
aws_condition_variable_wait_pred(
1152+
&callback_data.condition_variable, &mutex, s_default_host_resolved_predicate, &callback_data);
1153+
1154+
ASSERT_INT_EQUALS(0, aws_string_compare(addr1_ipv4, callback_data.a_address.address));
1155+
1156+
aws_host_address_clean_up(&callback_data.a_address);
1157+
callback_data.invoked = false;
1158+
aws_mutex_unlock(&mutex);
1159+
/* END CRITICAL SECTION */
1160+
1161+
/*
1162+
* Report addr1 as failed.
1163+
* Then do repeated lookups in a loop.
1164+
* We expect to see addr2 again and again, until we eventually see addr1.
1165+
*/
1166+
1167+
ASSERT_SUCCESS(aws_host_resolver_record_connection_failure(resolver, &host_address_1_ipv4));
1168+
1169+
uint64_t num_times_received_addr1 = 0;
1170+
uint64_t num_times_received_addr2 = 0;
1171+
1172+
uint64_t start_timestamp_ns = 0;
1173+
aws_high_res_clock_get_ticks(&start_timestamp_ns);
1174+
while (num_times_received_addr1 == 0) {
1175+
1176+
/* Check for timeout */
1177+
uint64_t current_timestamp_ns = 0;
1178+
aws_high_res_clock_get_ticks(&current_timestamp_ns);
1179+
1180+
uint64_t looping_secs = aws_timestamp_convert(
1181+
current_timestamp_ns - start_timestamp_ns, AWS_TIMESTAMP_NANOS, AWS_TIMESTAMP_SECS, NULL);
1182+
ASSERT_TRUE(looping_secs < 10, "Timed out waiting for addr1 to get off bad list");
1183+
1184+
/* Sleep a moment, to keep logs at reasonable size */
1185+
uint64_t sleep_ns = aws_timestamp_convert(100, AWS_TIMESTAMP_MILLIS, AWS_TIMESTAMP_NANOS, NULL);
1186+
aws_thread_current_sleep(sleep_ns);
1187+
1188+
/* Do lookup */
1189+
ASSERT_SUCCESS(aws_host_resolver_resolve_host(
1190+
resolver, host_name, s_default_host_resolved_test_callback, &config, &callback_data));
1191+
1192+
/* BEGIN CRITICAL SECTION */
1193+
ASSERT_SUCCESS(aws_mutex_lock(&mutex));
1194+
aws_condition_variable_wait_pred(
1195+
&callback_data.condition_variable, &mutex, s_default_host_resolved_predicate, &callback_data);
1196+
1197+
if (aws_string_compare(addr1_ipv4, callback_data.a_address.address) == 0) {
1198+
++num_times_received_addr1;
1199+
} else if (aws_string_compare(addr2_ipv4, callback_data.a_address.address) == 0) {
1200+
++num_times_received_addr2;
1201+
} else {
1202+
ASSERT_TRUE(false, "Received unexpected address");
1203+
}
1204+
1205+
aws_host_address_clean_up(&callback_data.a_address);
1206+
callback_data.invoked = false;
1207+
aws_mutex_unlock(&mutex);
1208+
/* END CRITICAL SECTION */
1209+
}
1210+
1211+
ASSERT_TRUE(num_times_received_addr2 > 10, "Expected to see a lot of addr2 while addr1 was on bad list");
1212+
1213+
/* clean up */
1214+
aws_host_resolver_release(resolver);
1215+
aws_string_destroy((void *)host_name);
1216+
aws_event_loop_group_release(el_group);
1217+
1218+
aws_io_library_clean_up();
1219+
1220+
mock_dns_resolver_clean_up(&mock_resolver);
1221+
1222+
return 0;
1223+
}
1224+
1225+
AWS_TEST_CASE(test_resolver_bad_list_expires_eventually, s_test_resolver_bad_list_expires_eventually_fn)
1226+
10631227
static int s_test_resolver_ipv4_address_lookup_fn(struct aws_allocator *allocator, void *ctx) {
10641228
(void)ctx;
10651229

0 commit comments

Comments
 (0)