Skip to content

Commit 833e3de

Browse files
committed
Improve keepalive handling.
- Tweak how the upstreams are set up to prevent temporary connection failures from removing the servers from rotation. - Allow connection retries to upstreams in the event of connection failures. - Enable so_keepalive on listening sockets (I don't necessarily think this will help with the upstream keepalive issues, but it is probably a good idea, and could help with keepalive behavior to any front-facing load balancers). This cropped up after introducing an AWS NAT Gateway into our stack, which closes inactive keepalive connections after 5 minutes: https://docs.aws.amazon.com/AmazonVPC/latest/UserGuide/vpc-nat-gateway.html#nat-gateway-troubleshooting-timeout See 18F/api.data.gov#446
1 parent 547c51e commit 833e3de

File tree

2 files changed

+28
-11
lines changed

2 files changed

+28
-11
lines changed

src/api-umbrella/proxy/load_backends.lua

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,23 @@ local function generate_upstream_config(api)
4545
nginx_ip = ip
4646
end
4747

48-
table.insert(servers, "server " .. nginx_ip .. ":" .. server["port"] .. ";")
48+
-- Insert 5 copies of the server, and set max_fails=0. In combination
49+
-- with the global "proxy_next_upstream error" setting, this allows
50+
-- for the API backend requests to retry up to 5 times if a
51+
-- connection was never actually established.
52+
--
53+
-- This is a bit of a hack, but this helps deal with upstream
54+
-- keepalive connections that might get closed (either by the API
55+
-- backend or some other firewall or NAT in between).
56+
--
57+
-- max_fails=0 is important so that single servers don't get
58+
-- completely removed from rotation (for fail_timeout) if a single
59+
-- request fails. By repeating the same server IP multiple times,
60+
-- this also gives proxy_next_upstream a chance to failover and retry
61+
-- the same server.
62+
for i = 1, 5 do
63+
table.insert(servers, "server " .. nginx_ip .. ":" .. server["port"] .. " max_fails=0;")
64+
end
4965
end
5066
end
5167
end

templates/etc/nginx/router.conf.mustache

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ http {
3030
'ca=$connections_active cr=$connections_reading cw=$connections_writing '
3131
'ct=$connections_waiting cq=$connection_requests bs=$bytes_sent '
3232
'rl=$request_length rt=$request_time uct=$upstream_connect_time '
33-
'uht=$upstream_header_time urt=$upstream_response_time';
33+
'uht=$upstream_header_time urt=$upstream_response_time '
34+
'ua="$upstream_addr" us="$upstream_status"';
3435
access_log {{log_dir}}/nginx/{{nginx.access_log_filename}} combined_extended {{nginx.access_log_options}};
3536

3637
client_body_temp_path {{tmp_dir}}/nginx-client_body_temp;
@@ -211,14 +212,14 @@ http {
211212
dyups_trylock on;
212213
dyups_read_msg_timeout 300ms;
213214
server {
214-
listen {{nginx.dyups.host}}:{{nginx.dyups.port}};
215+
listen {{nginx.dyups.host}}:{{nginx.dyups.port}} so_keepalive=on;
215216
location / {
216217
dyups_interface;
217218
}
218219
}
219220

220221
server {
221-
listen {{api_server.host}}:{{api_server.port}};
222+
listen {{api_server.host}}:{{api_server.port}} so_keepalive=on;
222223
set $x_api_umbrella_request_id $http_x_api_umbrella_request_id;
223224

224225
location /api-umbrella/v1/health {
@@ -232,12 +233,12 @@ http {
232233

233234
{{#hosts}}
234235
server {
235-
listen {{http_port}}{{#default}} default_server{{/default}};
236-
listen [::]:{{http_port}}{{#default}} default_server{{/default}};
236+
listen {{http_port}}{{#default}} default_server so_keepalive=on{{/default}};
237+
listen [::]:{{http_port}}{{#default}} default_server so_keepalive=on{{/default}};
237238
server_name {{_nginx_server_name}};
238239

239-
listen {{https_port}} ssl{{#default}} default_server{{/default}};
240-
listen [::]:{{https_port}} ssl{{#default}} default_server{{/default}};
240+
listen {{https_port}} ssl{{#default}} default_server so_keepalive=on{{/default}};
241+
listen [::]:{{https_port}} ssl{{#default}} default_server so_keepalive=on{{/default}};
241242
{{#ssl_cert}}
242243
ssl_certificate {{ssl_cert}};
243244
ssl_certificate_key {{ssl_cert_key}};
@@ -277,7 +278,7 @@ http {
277278
{{/hosts}}
278279

279280
server {
280-
listen {{static_site.host}}:{{static_site.port}};
281+
listen {{static_site.host}}:{{static_site.port}} so_keepalive=on;
281282
server_name _;
282283

283284
root {{static_site.build_dir}};
@@ -296,7 +297,7 @@ http {
296297
}
297298

298299
server {
299-
listen {{router.api_backends.host}}:{{router.api_backends.port}};
300+
listen {{router.api_backends.host}}:{{router.api_backends.port}} so_keepalive=on;
300301
server_name _;
301302

302303
set $x_api_umbrella_request_id $http_x_api_umbrella_request_id;
@@ -339,7 +340,7 @@ http {
339340
}
340341

341342
server {
342-
listen {{web.host}}:{{web.port}};
343+
listen {{web.host}}:{{web.port}} so_keepalive=on;
343344
server_name _;
344345

345346
set $x_api_umbrella_request_id $http_x_api_umbrella_request_id;

0 commit comments

Comments
 (0)