Skip to content

Commit 70bbeb2

Browse files
cloehle authored and Mike Snitzer committed
dm delay: for short delays, use kthread instead of timers and wq
DM delay's current design of using timers and wq to realize the delays is insufficient for delays below ~50ms. This commit enhances the design to use a kthread to flush the expired delays, trading some CPU time (in some cases) for better delay accuracy and delays closer to what the user requested for smaller delays. The new design is chosen as long as all the delays are below 50ms. Since bios can't be completed in interrupt context using a kthread is probably the most reasonable way to approach this. Testing with echo "0 2097152 zero" | dmsetup create dm-zeros for i in $(seq 0 20); do echo "0 2097152 delay /dev/mapper/dm-zeros 0 $i" | dmsetup create dm-delay-${i}ms; done Some performance numbers for comparison, on beaglebone black (single core) CONFIG_HZ_1000=y: fio --name=1msread --rw=randread --bs=4k --runtime=60 --time_based \ --filename=/dev/mapper/dm-delay-1ms Theoretical maximum: 1000 IOPS Previous: 250 IOPS Kthread: 500 IOPS fio --name=10msread --rw=randread --bs=4k --runtime=60 --time_based \ --filename=/dev/mapper/dm-delay-10ms Theoretical maximum: 100 IOPS Previous: 45 IOPS Kthread: 50 IOPS fio --name=1mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 \ --time_based --filename=/dev/mapper/dm-delay-1ms Theoretical maximum: 1000 IOPS Previous: 498 IOPS Kthread: 1000 IOPS fio --name=10mswrite --rw=randwrite --direct=1 --bs=4k --runtime=60 \ --time_based --filename=/dev/mapper/dm-delay-10ms Theoretical maximum: 100 IOPS Previous: 90 IOPS Kthread: 100 IOPS (This one is just to prove the new design isn't impacting throughput, not really about delays): fio --name=10mswriteasync --rw=randwrite --direct=1 --bs=4k \ --runtime=60 --time_based --filename=/dev/mapper/dm-delay-10ms \ --numjobs=32 --iodepth=64 --ioengine=libaio --group_reporting Previous: 13.3k IOPS Kthread: 13.3k IOPS Signed-off-by: Christian Loehle <[email protected]> [Harshit: kthread_create error handling fix in delay_ctr] Signed-off-by: Harshit Mogalapalli <[email protected]> Signed-off-by: 
Mike Snitzer <[email protected]>
1 parent 8388cba commit 70bbeb2

File tree

1 file changed

+88
-15
lines changed

1 file changed

+88
-15
lines changed

drivers/md/dm-delay.c

Lines changed: 88 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <linux/blkdev.h>
1414
#include <linux/bio.h>
1515
#include <linux/slab.h>
16+
#include <linux/kthread.h>
1617

1718
#include <linux/device-mapper.h>
1819

@@ -31,6 +32,7 @@ struct delay_c {
3132
struct workqueue_struct *kdelayd_wq;
3233
struct work_struct flush_expired_bios;
3334
struct list_head delayed_bios;
35+
struct task_struct *worker;
3436
atomic_t may_delay;
3537

3638
struct delay_class read;
@@ -66,6 +68,44 @@ static void queue_timeout(struct delay_c *dc, unsigned long expires)
6668
mutex_unlock(&dc->timer_lock);
6769
}
6870

71+
static inline bool delay_is_fast(struct delay_c *dc)
72+
{
73+
return !!dc->worker;
74+
}
75+
76+
/*
 * Submit every delayed bio whose deadline has passed (or every delayed bio
 * when flush_all is set), directly from the caller's context.
 *
 * Runs in the kthread-based "fast" mode; the caller must NOT hold
 * delayed_bios_lock — it is taken here to protect dc->delayed_bios and the
 * per-class ops counters.
 */
static void flush_delayed_bios_fast(struct delay_c *dc, bool flush_all)
77+
{
78+
struct dm_delay_info *delayed, *next;
79+
80+
mutex_lock(&delayed_bios_lock);
81+
/* _safe variant: entries are unlinked while walking the list. */
list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) {
82+
if (flush_all || time_after_eq(jiffies, delayed->expires)) {
83+
/* The dm_delay_info lives in the bio's per-bio data; recover the bio. */
struct bio *bio = dm_bio_from_per_bio_data(delayed,
84+
sizeof(struct dm_delay_info));
85+
list_del(&delayed->list);
86+
/* Resubmit to the remapped destination set up in delay_bio(). */
dm_submit_bio_remap(bio, NULL);
87+
/* One fewer in-flight delayed op for this class (read/write/flush). */
delayed->class->ops--;
88+
}
89+
}
90+
mutex_unlock(&delayed_bios_lock);
91+
}
92+
93+
static int flush_worker_fn(void *data)
94+
{
95+
struct delay_c *dc = data;
96+
97+
while (1) {
98+
flush_delayed_bios_fast(dc, false);
99+
if (unlikely(list_empty(&dc->delayed_bios))) {
100+
set_current_state(TASK_INTERRUPTIBLE);
101+
schedule();
102+
} else
103+
cond_resched();
104+
}
105+
106+
return 0;
107+
}
108+
69109
static void flush_bios(struct bio *bio)
70110
{
71111
struct bio *n;
@@ -78,7 +118,7 @@ static void flush_bios(struct bio *bio)
78118
}
79119
}
80120

81-
static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all)
121+
static struct bio *flush_delayed_bios(struct delay_c *dc, bool flush_all)
82122
{
83123
struct dm_delay_info *delayed, *next;
84124
unsigned long next_expires = 0;
@@ -115,7 +155,10 @@ static void flush_expired_bios(struct work_struct *work)
115155
struct delay_c *dc;
116156

117157
dc = container_of(work, struct delay_c, flush_expired_bios);
118-
flush_bios(flush_delayed_bios(dc, 0));
158+
if (delay_is_fast(dc))
159+
flush_delayed_bios_fast(dc, false);
160+
else
161+
flush_bios(flush_delayed_bios(dc, false));
119162
}
120163

121164
static void delay_dtr(struct dm_target *ti)
@@ -131,8 +174,11 @@ static void delay_dtr(struct dm_target *ti)
131174
dm_put_device(ti, dc->write.dev);
132175
if (dc->flush.dev)
133176
dm_put_device(ti, dc->flush.dev);
177+
if (dc->worker)
178+
kthread_stop(dc->worker);
134179

135-
mutex_destroy(&dc->timer_lock);
180+
if (!delay_is_fast(dc))
181+
mutex_destroy(&dc->timer_lock);
136182

137183
kfree(dc);
138184
}
@@ -175,6 +221,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
175221
{
176222
struct delay_c *dc;
177223
int ret;
224+
unsigned int max_delay;
178225

179226
if (argc != 3 && argc != 6 && argc != 9) {
180227
ti->error = "Requires exactly 3, 6 or 9 arguments";
@@ -188,16 +235,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
188235
}
189236

190237
ti->private = dc;
191-
timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
192-
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
193238
INIT_LIST_HEAD(&dc->delayed_bios);
194-
mutex_init(&dc->timer_lock);
195239
atomic_set(&dc->may_delay, 1);
196240
dc->argc = argc;
197241

198242
ret = delay_class_ctr(ti, &dc->read, argv);
199243
if (ret)
200244
goto bad;
245+
max_delay = dc->read.delay;
201246

202247
if (argc == 3) {
203248
ret = delay_class_ctr(ti, &dc->write, argv);
@@ -206,6 +251,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
206251
ret = delay_class_ctr(ti, &dc->flush, argv);
207252
if (ret)
208253
goto bad;
254+
max_delay = max(max_delay, dc->write.delay);
255+
max_delay = max(max_delay, dc->flush.delay);
209256
goto out;
210257
}
211258

@@ -216,19 +263,37 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
216263
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
217264
if (ret)
218265
goto bad;
266+
max_delay = max(max_delay, dc->flush.delay);
219267
goto out;
220268
}
221269

222270
ret = delay_class_ctr(ti, &dc->flush, argv + 6);
223271
if (ret)
224272
goto bad;
273+
max_delay = max(max_delay, dc->flush.delay);
225274

226275
out:
227-
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
228-
if (!dc->kdelayd_wq) {
229-
ret = -EINVAL;
230-
DMERR("Couldn't start kdelayd");
231-
goto bad;
276+
if (max_delay < 50) {
277+
/*
278+
* In case of small requested delays, use kthread instead of
279+
* timers and workqueue to achieve better latency.
280+
*/
281+
dc->worker = kthread_create(&flush_worker_fn, dc,
282+
"dm-delay-flush-worker");
283+
if (IS_ERR(dc->worker)) {
284+
ret = PTR_ERR(dc->worker);
285+
goto bad;
286+
}
287+
} else {
288+
timer_setup(&dc->delay_timer, handle_delayed_timer, 0);
289+
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
290+
mutex_init(&dc->timer_lock);
291+
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
292+
if (!dc->kdelayd_wq) {
293+
ret = -EINVAL;
294+
DMERR("Couldn't start kdelayd");
295+
goto bad;
296+
}
232297
}
233298

234299
ti->num_flush_bios = 1;
@@ -260,7 +325,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
260325
list_add_tail(&delayed->list, &dc->delayed_bios);
261326
mutex_unlock(&delayed_bios_lock);
262327

263-
queue_timeout(dc, expires);
328+
if (delay_is_fast(dc))
329+
wake_up_process(dc->worker);
330+
else
331+
queue_timeout(dc, expires);
264332

265333
return DM_MAPIO_SUBMITTED;
266334
}
@@ -270,8 +338,13 @@ static void delay_presuspend(struct dm_target *ti)
270338
struct delay_c *dc = ti->private;
271339

272340
atomic_set(&dc->may_delay, 0);
273-
del_timer_sync(&dc->delay_timer);
274-
flush_bios(flush_delayed_bios(dc, 1));
341+
342+
if (delay_is_fast(dc))
343+
flush_delayed_bios_fast(dc, true);
344+
else {
345+
del_timer_sync(&dc->delay_timer);
346+
flush_bios(flush_delayed_bios(dc, true));
347+
}
275348
}
276349

277350
static void delay_resume(struct dm_target *ti)
@@ -356,7 +429,7 @@ static int delay_iterate_devices(struct dm_target *ti,
356429

357430
static struct target_type delay_target = {
358431
.name = "delay",
359-
.version = {1, 3, 0},
432+
.version = {1, 4, 0},
360433
.features = DM_TARGET_PASSES_INTEGRITY,
361434
.module = THIS_MODULE,
362435
.ctr = delay_ctr,

0 commit comments

Comments (0)