|
| 1 | +commit 046fb6ba0aec8246075b18d787daec43201566fa |
| 2 | +Author: Antti Tiainen <atiainen@forcepoint.com> |
| 3 | +Date: Mon Feb 6 15:41:05 2017 +0200 |
| 4 | + |
| 5 | + libteam: resynchronize ifinfo after lost RTNLGRP_LINK notifications |
| 6 | + |
| 7 | + When there's a large number of interfaces (e.g. vlans), teamd loses |
| 8 | + link notifications as it cannot read them as fast as kernel is |
| 9 | + broadcasting them. This often prevents teamd starting properly if |
| 10 | + started concurrently when other links are being set up. It can also |
| 11 | + fail when it's up and running, especially in the cases where the team |
| 12 | + device itself has a lot of vlans under it. |
| 13 | + |
| 14 | + This can easily be reproduces by simple example (in SMP system) by |
| 15 | + manually adding team device with a bunch of vlans, putting it up, |
| 16 | + and starting teamd with --take-over option: |
| 17 | + |
| 18 | + root@debian:~# ip link add name team0 type team |
| 19 | + root@debian:~# for i in `seq 100 150` ; do |
| 20 | + > ip link add link team0 name team0.$i type vlan id $i ; done |
| 21 | + root@debian:~# ip link set team0 up |
| 22 | + root@debian:~# cat teamd.conf |
| 23 | + { |
| 24 | + "device": "team0", |
| 25 | + "runner": { |
| 26 | + "name": "activebackup" |
| 27 | + }, |
| 28 | + "ports": { |
| 29 | + "eth1": {}, |
| 30 | + "eth2": {} |
| 31 | + } |
| 32 | + } |
| 33 | + root@debian:~# teamd -o -N -f teamd.conf |
| 34 | + |
| 35 | + At this point, teamd will not give any error messages or other |
| 36 | + indication that something is wrong. But state will not look healthy: |
| 37 | + |
| 38 | + root@debian:~# teamdctl team0 state |
| 39 | + setup: |
| 40 | + runner: activebackup |
| 41 | + ports: |
| 42 | + eth1 |
| 43 | + link watches: |
| 44 | + link summary: up |
| 45 | + instance[link_watch_0]: |
| 46 | + name: ethtool |
| 47 | + link: up |
| 48 | + down count: 0 |
| 49 | + Failed to parse JSON port dump. |
| 50 | + command call failed (Invalid argument) |
| 51 | + |
| 52 | + If checking state dump, it will show that port eth2 is missing info. |
| 53 | + Running strace to teamd will reveal that there's one recvmsgs() that |
| 54 | + returned -1 with errno ENOBUFS. What happened in this example was |
| 55 | + that when teamd started, all vlans got carrier up, and kernel flooded |
| 56 | + notifications faster than teamd could read them. It then lost events |
| 57 | + related to port eth2 getting enslaved and up. |
| 58 | + |
| 59 | + The socket that joins RTNLGRP_LINK notifications uses default libnl |
| 60 | + 32k buffer size. Netlink messages are large (over 1k), and this buffer |
| 61 | + gets easily full. Kernel neither knows nor cares were notification |
| 62 | + broadcasts delivered. This cannot be fixed by simply increasing the |
| 63 | + buffer size, as there's no size that is guaranteed to work in every |
| 64 | + use case, and this can require several megabytes of buffer (a way over |
| 65 | + normal rmem_max limit) if there are hunderds of vlans. |
| 66 | + |
| 67 | + Only way to recover from this is to refresh all ifinfo list, as it's |
| 68 | + invalidated at this point. It cannot easily work around of this by |
| 69 | + just refreshing team device and its ports, because library side might |
| 70 | + not have ports linked due to events missed, and it doesn't know about |
| 71 | + teamd configuration. |
| 72 | + |
| 73 | + Checks now return value of nl_recvmsgs_default() for event socket. In |
| 74 | + case of ENOBUFS (which libnl nicely changes to ENOMEM), refreshes |
| 75 | + all ifinfo list. get_ifinfo_list() also checks now for removed interfaces |
| 76 | + in case of missed dellink event. Currently all TEAM_IFINFO_CHANGE |
| 77 | + handlers processed events one by one, so it had to be changed to support |
| 78 | + multiple ifinfo changes. For this, ifinfo changed flags are cleared |
| 79 | + and removed entries destroyed only after all handlers have been called. |
| 80 | + |
| 81 | + Also, increased nl_cli.sock_event receive buffers to 96k like all other |
| 82 | + sockets. Added possibility to change this via environment variable. |
| 83 | + |
| 84 | + Signed-off-by: Antti Tiainen <atiainen@forcepoint.com> |
| 85 | + Signed-off-by: Jiri Pirko <jiri@mellanox.com> |
| 86 | + |
| 87 | +diff --git a/libteam/ifinfo.c b/libteam/ifinfo.c |
| 88 | +index 72155ae..5c32a9c 100644 |
| 89 | +--- a/libteam/ifinfo.c |
| 90 | ++++ b/libteam/ifinfo.c |
| 91 | +@@ -72,6 +72,10 @@ struct team_ifinfo { |
| 92 | + #define CHANGED_PHYS_PORT_ID (1 << 5) |
| 93 | + #define CHANGED_PHYS_PORT_ID_LEN (1 << 6) |
| 94 | + #define CHANGED_ADMIN_STATE (1 << 7) |
| 95 | ++/* This is only used when tagging interfaces for finding |
| 96 | ++ * removed, and thus not included to CHANGED_ANY. |
| 97 | ++ */ |
| 98 | ++#define CHANGED_REFRESHING (1 << 8) |
| 99 | + #define CHANGED_ANY (CHANGED_REMOVED | CHANGED_HWADDR | \ |
| 100 | + CHANGED_HWADDR_LEN | CHANGED_IFNAME | \ |
| 101 | + CHANGED_MASTER_IFINDEX | CHANGED_PHYS_PORT_ID | \ |
| 102 | +@@ -202,7 +206,7 @@ static struct team_ifinfo *ifinfo_find(struct team_handle *th, uint32_t ifindex) |
| 103 | + return NULL; |
| 104 | + } |
| 105 | + |
| 106 | +-static void clear_last_changed(struct team_handle *th) |
| 107 | ++void ifinfo_clear_changed(struct team_handle *th) |
| 108 | + { |
| 109 | + struct team_ifinfo *ifinfo; |
| 110 | + |
| 111 | +@@ -236,7 +240,7 @@ static void ifinfo_destroy(struct team_ifinfo *ifinfo) |
| 112 | + free(ifinfo); |
| 113 | + } |
| 114 | + |
| 115 | +-static void ifinfo_destroy_removed(struct team_handle *th) |
| 116 | ++void ifinfo_destroy_removed(struct team_handle *th) |
| 117 | + { |
| 118 | + struct team_ifinfo *ifinfo, *tmp; |
| 119 | + |
| 120 | +@@ -254,8 +258,6 @@ static void obj_input_newlink(struct nl_object *obj, void *arg, bool event) |
| 121 | + uint32_t ifindex; |
| 122 | + int err; |
| 123 | + |
| 124 | +- ifinfo_destroy_removed(th); |
| 125 | +- |
| 126 | + link = (struct rtnl_link *) obj; |
| 127 | + |
| 128 | + ifindex = rtnl_link_get_ifindex(link); |
| 129 | +@@ -269,7 +271,7 @@ static void obj_input_newlink(struct nl_object *obj, void *arg, bool event) |
| 130 | + return; |
| 131 | + } |
| 132 | + |
| 133 | +- clear_last_changed(th); |
| 134 | ++ clear_changed(ifinfo); |
| 135 | + ifinfo_update(ifinfo, link); |
| 136 | + |
| 137 | + if (event) |
| 138 | +@@ -292,8 +294,6 @@ static void event_handler_obj_input_dellink(struct nl_object *obj, void *arg) |
| 139 | + uint32_t ifindex; |
| 140 | + int err; |
| 141 | + |
| 142 | +- ifinfo_destroy_removed(th); |
| 143 | +- |
| 144 | + link = (struct rtnl_link *) obj; |
| 145 | + |
| 146 | + ifindex = rtnl_link_get_ifindex(link); |
| 147 | +@@ -311,7 +311,7 @@ static void event_handler_obj_input_dellink(struct nl_object *obj, void *arg) |
| 148 | + return; |
| 149 | + } |
| 150 | + |
| 151 | +- clear_last_changed(th); |
| 152 | ++ clear_changed(ifinfo); |
| 153 | + set_changed(ifinfo, CHANGED_REMOVED); |
| 154 | + set_call_change_handlers(th, TEAM_IFINFO_CHANGE); |
| 155 | + } |
| 156 | +@@ -367,6 +367,14 @@ int get_ifinfo_list(struct team_handle *th) |
| 157 | + }; |
| 158 | + int ret; |
| 159 | + int retry = 1; |
| 160 | ++ struct team_ifinfo *ifinfo; |
| 161 | ++ |
| 162 | ++ /* Tag all ifinfo, this is cleared in newlink handler. |
| 163 | ++ * Any interface that has this after dump is processed |
| 164 | ++ * has been removed. |
| 165 | ++ */ |
| 166 | ++ list_for_each_node_entry(ifinfo, &th->ifinfo_list, list) |
| 167 | ++ set_changed(ifinfo, CHANGED_REFRESHING); |
| 168 | + |
| 169 | + while (retry) { |
| 170 | + retry = 0; |
| 171 | +@@ -395,6 +403,15 @@ int get_ifinfo_list(struct team_handle *th) |
| 172 | + retry = 1; |
| 173 | + } |
| 174 | + } |
| 175 | ++ |
| 176 | ++ list_for_each_node_entry(ifinfo, &th->ifinfo_list, list) { |
| 177 | ++ if (is_changed(ifinfo, CHANGED_REFRESHING)) { |
| 178 | ++ clear_changed(ifinfo); |
| 179 | ++ set_changed(ifinfo, CHANGED_REMOVED); |
| 180 | ++ set_call_change_handlers(th, TEAM_IFINFO_CHANGE); |
| 181 | ++ } |
| 182 | ++ } |
| 183 | ++ |
| 184 | + ret = check_call_change_handlers(th, TEAM_IFINFO_CHANGE); |
| 185 | + if (ret < 0) |
| 186 | + err(th, "get_ifinfo_list: check_call_change_handers failed"); |
| 187 | +diff --git a/libteam/libteam.c b/libteam/libteam.c |
| 188 | +index ac187aa..d5f22cd 100644 |
| 189 | +--- a/libteam/libteam.c |
| 190 | ++++ b/libteam/libteam.c |
| 191 | +@@ -236,6 +236,10 @@ int check_call_change_handlers(struct team_handle *th, |
| 192 | + break; |
| 193 | + } |
| 194 | + } |
| 195 | ++ if (call_type_mask & TEAM_IFINFO_CHANGE) { |
| 196 | ++ ifinfo_destroy_removed(th); |
| 197 | ++ ifinfo_clear_changed(th); |
| 198 | ++ } |
| 199 | + th->change_handler.pending_type_mask &= ~call_type_mask; |
| 200 | + return err; |
| 201 | + } |
| 202 | +@@ -546,6 +550,11 @@ int team_destroy(struct team_handle *th) |
| 203 | + #endif |
| 204 | + /* \endcond */ |
| 205 | + |
| 206 | ++/* libnl uses default 32k socket receive buffer size, |
| 207 | ++ * whicn can get too small. Use 96k for all sockets. |
| 208 | ++ */ |
| 209 | ++#define NETLINK_RCVBUF 983040 |
| 210 | ++ |
| 211 | + /** |
| 212 | + * @param th libteam library context |
| 213 | + * @param ifindex team device interface index |
| 214 | +@@ -561,6 +570,8 @@ int team_init(struct team_handle *th, uint32_t ifindex) |
| 215 | + int err; |
| 216 | + int grp_id; |
| 217 | + int val; |
| 218 | ++ int eventbufsize; |
| 219 | ++ const char *env; |
| 220 | + |
| 221 | + if (!ifindex) { |
| 222 | + err(th, "Passed interface index %d is not valid.", ifindex); |
| 223 | +@@ -589,12 +600,12 @@ int team_init(struct team_handle *th, uint32_t ifindex) |
| 224 | + return -errno; |
| 225 | + } |
| 226 | + |
| 227 | +- err = nl_socket_set_buffer_size(th->nl_sock, 98304, 0); |
| 228 | ++ err = nl_socket_set_buffer_size(th->nl_sock, NETLINK_RCVBUF, 0); |
| 229 | + if (err) { |
| 230 | + err(th, "Failed to set buffer size of netlink sock."); |
| 231 | + return -nl2syserr(err); |
| 232 | + } |
| 233 | +- err = nl_socket_set_buffer_size(th->nl_sock_event, 98304, 0); |
| 234 | ++ err = nl_socket_set_buffer_size(th->nl_sock_event, NETLINK_RCVBUF, 0); |
| 235 | + if (err) { |
| 236 | + err(th, "Failed to set buffer size of netlink event sock."); |
| 237 | + return -nl2syserr(err); |
| 238 | +@@ -627,6 +638,25 @@ int team_init(struct team_handle *th, uint32_t ifindex) |
| 239 | + nl_socket_modify_cb(th->nl_cli.sock_event, NL_CB_VALID, |
| 240 | + NL_CB_CUSTOM, cli_event_handler, th); |
| 241 | + nl_cli_connect(th->nl_cli.sock_event, NETLINK_ROUTE); |
| 242 | ++ |
| 243 | ++ env = getenv("TEAM_EVENT_BUFSIZE"); |
| 244 | ++ if (env) { |
| 245 | ++ eventbufsize = strtol(env, NULL, 10); |
| 246 | ++ /* ignore other errors, libnl forces minimum 32k and |
| 247 | ++ * too large values are truncated to system rmem_max |
| 248 | ++ */ |
| 249 | ++ if (eventbufsize < 0) |
| 250 | ++ eventbufsize = 0; |
| 251 | ++ } else { |
| 252 | ++ eventbufsize = NETLINK_RCVBUF; |
| 253 | ++ } |
| 254 | ++ |
| 255 | ++ err = nl_socket_set_buffer_size(th->nl_cli.sock_event, eventbufsize, 0); |
| 256 | ++ if (err) { |
| 257 | ++ err(th, "Failed to set cli event socket buffer size."); |
| 258 | ++ return err; |
| 259 | ++ } |
| 260 | ++ |
| 261 | + err = nl_socket_add_membership(th->nl_cli.sock_event, RTNLGRP_LINK); |
| 262 | + if (err < 0) { |
| 263 | + err(th, "Failed to add netlink membership."); |
| 264 | +@@ -767,7 +797,23 @@ static int get_cli_sock_event_fd(struct team_handle *th) |
| 265 | + |
| 266 | + static int cli_sock_event_handler(struct team_handle *th) |
| 267 | + { |
| 268 | +- nl_recvmsgs_default(th->nl_cli.sock_event); |
| 269 | ++ int err; |
| 270 | ++ |
| 271 | ++ err = nl_recvmsgs_default(th->nl_cli.sock_event); |
| 272 | ++ err = -nl2syserr(err); |
| 273 | ++ |
| 274 | ++ /* libnl thinks ENOBUFS and ENOMEM are same. Hope it was ENOBUFS. */ |
| 275 | ++ if (err == -ENOMEM) { |
| 276 | ++ warn(th, "Lost link notifications from kernel."); |
| 277 | ++ /* There's no way to know what events were lost and no |
| 278 | ++ * way to get them again. Refresh all. |
| 279 | ++ */ |
| 280 | ++ err = get_ifinfo_list(th); |
| 281 | ++ } |
| 282 | ++ |
| 283 | ++ if (err) |
| 284 | ++ return err; |
| 285 | ++ |
| 286 | + return check_call_change_handlers(th, TEAM_IFINFO_CHANGE); |
| 287 | + } |
| 288 | + |
| 289 | +diff --git a/libteam/team_private.h b/libteam/team_private.h |
| 290 | +index a07632f..a5eb0be 100644 |
| 291 | +--- a/libteam/team_private.h |
| 292 | ++++ b/libteam/team_private.h |
| 293 | +@@ -115,6 +115,9 @@ int ifinfo_link_with_port(struct team_handle *th, uint32_t ifindex, |
| 294 | + int ifinfo_link(struct team_handle *th, uint32_t ifindex, |
| 295 | + struct team_ifinfo **p_ifinfo); |
| 296 | + void ifinfo_unlink(struct team_ifinfo *ifinfo); |
| 297 | ++void ifinfo_clear_changed(struct team_handle *th); |
| 298 | ++void ifinfo_destroy_removed(struct team_handle *th); |
| 299 | ++int get_ifinfo_list(struct team_handle *th); |
| 300 | + int get_options_handler(struct nl_msg *msg, void *arg); |
| 301 | + int option_list_alloc(struct team_handle *th); |
| 302 | + int option_list_init(struct team_handle *th); |
0 commit comments