Skip to content

Commit b13d3cb

Browse files
Florian Westphal authored, davem330 committed
inet: frag: move eviction of queues to work queue
When the high_thresh limit is reached we try to toss the 'oldest' incomplete fragment queues until memory limits are below the low_thresh value. This happens in softirq/packet processing context. This has two drawbacks: 1) processors might evict a queue that was about to be completed by another cpu, because they will compete wrt. resource usage and resource reclaim. 2) LRU list maintenance is expensive. But when constantly overloaded, even the 'least recently used' element is recent, so removing 'lru' queue first is not 'fairer' than removing any other fragment queue. This moves eviction out of the fast path: When the low threshold is reached, a work queue is scheduled which then iterates over the table and removes the queues that exceed the memory limits of the namespace. It sets a new flag called INET_FRAG_EVICTED on the evicted queues so the proper counters will get incremented when the queue is forcefully expired. When the high threshold is reached, no more fragment queues are created until we're below the limit again. The LRU list is now unused and will be removed in a followup patch. Joint work with Nikolay Aleksandrov. Suggested-by: Eric Dumazet <[email protected]> Signed-off-by: Florian Westphal <[email protected]> Signed-off-by: Nikolay Aleksandrov <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 86e93e4 commit b13d3cb

File tree

5 files changed

+112
-47
lines changed

5 files changed

+112
-47
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ ipfrag_high_thresh - INTEGER
104104
is reached.
105105

106106
ipfrag_low_thresh - INTEGER
107-
See ipfrag_high_thresh
107+
Maximum memory used to reassemble IP fragments before the kernel
108+
begins to remove incomplete fragment queues to free up resources.
109+
The kernel still accepts new fragments for defragmentation.
108110

109111
ipfrag_time - INTEGER
110112
Time in seconds to keep an IP fragment in memory.

include/net/inet_frag.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ struct inet_frag_queue {
3232
int meat;
3333
__u8 last_in; /* first/last segment arrived? */
3434

35+
#define INET_FRAG_EVICTED 8
3536
#define INET_FRAG_COMPLETE 4
3637
#define INET_FRAG_FIRST_IN 2
3738
#define INET_FRAG_LAST_IN 1
@@ -48,7 +49,7 @@ struct inet_frag_queue {
4849
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
4950
* struct frag_queue))
5051
*/
51-
#define INETFRAGS_MAXDEPTH 128
52+
#define INETFRAGS_MAXDEPTH 128
5253

5354
struct inet_frag_bucket {
5455
struct hlist_head chain;
@@ -65,6 +66,9 @@ struct inet_frags {
6566
int secret_interval;
6667
struct timer_list secret_timer;
6768

69+
struct work_struct frags_work;
70+
unsigned int next_bucket;
71+
6872
/* The first call to hashfn is responsible to initialize
6973
* rnd. This is best done with net_get_random_once.
7074
*/

net/ipv4/inet_fragment.c

Lines changed: 99 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@
2525
#include <net/inet_frag.h>
2626
#include <net/inet_ecn.h>
2727

28+
#define INETFRAGS_EVICT_BUCKETS 128
29+
#define INETFRAGS_EVICT_MAX 512
30+
2831
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
2932
* Value : 0xff if frame should be dropped.
3033
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,8 +49,6 @@ const u8 ip_frag_ecn_table[16] = {
4649
};
4750
EXPORT_SYMBOL(ip_frag_ecn_table);
4851

49-
static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
50-
5152
static unsigned int
5253
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
5354
{
@@ -89,10 +90,92 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
8990
mod_timer(&f->secret_timer, now + f->secret_interval);
9091
}
9192

93+
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
94+
{
95+
return q->net->low_thresh == 0 ||
96+
frag_mem_limit(q->net) >= q->net->low_thresh;
97+
}
98+
99+
/* Evict all queues in hash bucket @hb that inet_fragq_should_evict()
 * selects.  Runs from the eviction work queue (and from netns teardown).
 * Returns the number of queues handed to ->frag_expire().
 *
 * Queues are first unlinked under the chain lock onto a private list,
 * then expired outside the lock so ->frag_expire() may sleep/lock freely.
 */
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		/* del_timer() failing means the expire timer already fired
		 * and may be running concurrently on another CPU.
		 */
		if (!del_timer(&fq->timer)) {
			/* q expiring right now thus increment its refcount so
			 * it won't be freed under us and wait until the timer
			 * has finished executing then destroy it
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			WARN_ON(atomic_read(&fq->refcnt) != 1);
			inet_frag_put(fq, f);
			/* NOTE(review): restarting from the bucket head means a
			 * bucket with constant timer churn is re-scanned from
			 * scratch each time — presumably acceptable since this
			 * runs in process context; confirm under frag floods.
			 */
			goto evict_again;
		}

		/* suppress xmit of (icmp) error packet */
		fq->last_in &= ~INET_FRAG_FIRST_IN;
		/* mark so frag_expire() skips the REASMTIMEOUT counter */
		fq->last_in |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	/* expire the collected queues without holding the chain lock */
	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}
142+
143+
static void inet_frag_worker(struct work_struct *work)
144+
{
145+
unsigned int budget = INETFRAGS_EVICT_BUCKETS;
146+
unsigned int i, evicted = 0;
147+
struct inet_frags *f;
148+
149+
f = container_of(work, struct inet_frags, frags_work);
150+
151+
BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
152+
153+
read_lock_bh(&f->lock);
154+
155+
for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
156+
evicted += inet_evict_bucket(f, &f->hash[i]);
157+
i = (i + 1) & (INETFRAGS_HASHSZ - 1);
158+
if (evicted > INETFRAGS_EVICT_MAX)
159+
break;
160+
}
161+
162+
f->next_bucket = i;
163+
164+
read_unlock_bh(&f->lock);
165+
}
166+
167+
static void inet_frag_schedule_worker(struct inet_frags *f)
168+
{
169+
if (unlikely(!work_pending(&f->frags_work)))
170+
schedule_work(&f->frags_work);
171+
}
172+
92173
void inet_frags_init(struct inet_frags *f)
93174
{
94175
int i;
95176

177+
INIT_WORK(&f->frags_work, inet_frag_worker);
178+
96179
for (i = 0; i < INETFRAGS_HASHSZ; i++) {
97180
struct inet_frag_bucket *hb = &f->hash[i];
98181

@@ -120,16 +203,22 @@ EXPORT_SYMBOL(inet_frags_init_net);
120203
/* Tear down an inet_frags instance: stop the secret (hash-rebuild)
 * timer and wait for any queued/running eviction work to finish so the
 * work item never touches a freed inet_frags.
 */
void inet_frags_fini(struct inet_frags *f)
{
	del_timer(&f->secret_timer);
	cancel_work_sync(&f->frags_work);
}
EXPORT_SYMBOL(inet_frags_fini);
125209

126210
/* Per-namespace teardown: force-evict every remaining fragment queue in
 * @nf and release its memory accounting.
 */
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	int i;

	/* low_thresh == 0 makes inet_fragq_should_evict() select every
	 * queue, so the bucket sweep below removes them all.
	 */
	nf->low_thresh = 0;

	read_lock_bh(&f->lock);

	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	read_unlock_bh(&f->lock);

	percpu_counter_destroy(&nf->mem);
}
@@ -205,41 +294,6 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
205294
}
206295
EXPORT_SYMBOL(inet_frag_destroy);
207296

208-
/* Old LRU-based evictor, removed by this commit in favour of the work
 * queue above.  Kills queues from the head of the per-netns LRU list
 * until memory use drops below low_thresh (or, with @force, until the
 * list is empty).  Returns the number of queues evicted.
 */
static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
{
	struct inet_frag_queue *q;
	int work, evicted = 0;

	/* bytes that must be reclaimed to get under the low watermark */
	work = frag_mem_limit(nf) - nf->low_thresh;
	while (work > 0 || force) {
		spin_lock(&nf->lru_lock);

		if (list_empty(&nf->lru_list)) {
			spin_unlock(&nf->lru_lock);
			break;
		}

		q = list_first_entry(&nf->lru_list,
				struct inet_frag_queue, lru_list);
		/* hold a reference across the unlocked kill/destroy below */
		atomic_inc(&q->refcnt);
		/* Remove q from list to avoid several CPUs grabbing it */
		list_del_init(&q->lru_list);

		spin_unlock(&nf->lru_lock);

		spin_lock(&q->lock);
		if (!(q->last_in & INET_FRAG_COMPLETE))
			inet_frag_kill(q, f);
		spin_unlock(&q->lock);

		/* destroy updates 'work' with the bytes actually freed */
		if (atomic_dec_and_test(&q->refcnt))
			inet_frag_destroy(q, f, &work);
		evicted++;
	}

	return evicted;
}
242-
243297
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
244298
struct inet_frag_queue *qp_in, struct inet_frags *f,
245299
void *arg)
@@ -292,8 +346,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
292346
{
293347
struct inet_frag_queue *q;
294348

295-
if (frag_mem_limit(nf) > nf->high_thresh)
349+
if (frag_mem_limit(nf) > nf->high_thresh) {
350+
inet_frag_schedule_worker(f);
296351
return NULL;
352+
}
297353

298354
q = kzalloc(f->qsize, GFP_ATOMIC);
299355
if (q == NULL)
@@ -331,8 +387,8 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
331387
struct inet_frag_queue *q;
332388
int depth = 0;
333389

334-
if (frag_mem_limit(nf) > nf->high_thresh)
335-
inet_frag_evictor(nf, f, false);
390+
if (frag_mem_limit(nf) > nf->low_thresh)
391+
inet_frag_schedule_worker(f);
336392

337393
hash &= (INETFRAGS_HASHSZ - 1);
338394
hb = &f->hash[hash];

net/ipv4/ip_fragment.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,8 @@ static void ip_expire(unsigned long arg)
195195

196196
ipq_kill(qp);
197197

198-
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
198+
if (!(qp->q.last_in & INET_FRAG_EVICTED))
199+
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
199200
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
200201

201202
if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {

net/ipv6/reassembly.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,9 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
141141
if (!dev)
142142
goto out_rcu_unlock;
143143

144-
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
144+
if (!(fq->q.last_in & INET_FRAG_EVICTED))
145+
IP6_INC_STATS_BH(net, __in6_dev_get(dev),
146+
IPSTATS_MIB_REASMTIMEOUT);
145147
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
146148

147149
/* Don't send error if the first segment did not arrive. */

0 commit comments

Comments
 (0)