Skip to content

Commit d2a189c

Browse files
josefbacik authored and kdave committed
btrfs: do not infinite loop in data reclaim if we aborted
Error injection stressing uncovered a busy loop in our data reclaim loop. There are two cases here: one where we loop creating block groups until space_info->full is set, and one in the main loop where we will skip erroring out any tickets if space_info->full == 0. Unfortunately, if we aborted the transaction then we will never allocate chunks or reclaim any space and thus never get ->full, and you'll see stack traces like this:

  watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [kworker/u4:4:139]
  CPU: 0 PID: 139 Comm: kworker/u4:4 Tainted: G W 5.13.0-rc1+ #328
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
  Workqueue: events_unbound btrfs_async_reclaim_data_space
  RIP: 0010:btrfs_join_transaction+0x12/0x20
  RSP: 0018:ffffb2b780b77de0 EFLAGS: 00000246
  RAX: ffffb2b781863d58 RBX: 0000000000000000 RCX: 0000000000000000
  RDX: 0000000000000801 RSI: ffff987952b57400 RDI: ffff987940aa3000
  RBP: ffff987954d55000 R08: 0000000000000001 R09: ffff98795539e8f0
  R10: 000000000000000f R11: 000000000000000f R12: ffffffffffffffff
  R13: ffff987952b574c8 R14: ffff987952b57400 R15: 0000000000000008
  FS: 0000000000000000(0000) GS:ffff9879bbc00000(0000) knlGS:0000000000000000
  CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f0703da4000 CR3: 0000000113398004 CR4: 0000000000370ef0
  Call Trace:
   flush_space+0x4a8/0x660
   btrfs_async_reclaim_data_space+0x55/0x130
   process_one_work+0x1e9/0x380
   worker_thread+0x53/0x3e0
   ? process_one_work+0x380/0x380
   kthread+0x118/0x140
   ? __kthread_bind_mask+0x60/0x60
   ret_from_fork+0x1f/0x30

Fix this by checking to see if we have a btrfs fs error in either of the reclaim loops, and if so fail the tickets and bail. In addition to this, fix maybe_fail_all_tickets() to not try to grant tickets if we've aborted; simply fail everything.

Signed-off-by: Josef Bacik <[email protected]>
Signed-off-by: David Sterba <[email protected]>
1 parent c76ac8d commit d2a189c

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

fs/btrfs/space-info.c

Lines changed: 24 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -941,6 +941,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
941941
struct reserve_ticket *ticket;
942942
u64 tickets_id = space_info->tickets_id;
943943
u64 first_ticket_bytes = 0;
944+
const bool aborted = btrfs_has_fs_error(fs_info);
944945

945946
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
946947
btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
@@ -952,7 +953,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
952953
ticket = list_first_entry(&space_info->tickets,
953954
struct reserve_ticket, list);
954955

955-
if (ticket->steal &&
956+
if (!aborted && ticket->steal &&
956957
steal_from_global_rsv(fs_info, space_info, ticket))
957958
return true;
958959

@@ -968,15 +969,18 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
968969
*/
969970
if (first_ticket_bytes == 0)
970971
first_ticket_bytes = ticket->bytes;
971-
else if (first_ticket_bytes > ticket->bytes)
972+
else if (!aborted && first_ticket_bytes > ticket->bytes)
972973
return true;
973974

974-
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
975+
if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
975976
btrfs_info(fs_info, "failing ticket with %llu bytes",
976977
ticket->bytes);
977978

978979
remove_ticket(space_info, ticket);
979-
ticket->error = -ENOSPC;
980+
if (aborted)
981+
ticket->error = -EIO;
982+
else
983+
ticket->error = -ENOSPC;
980984
wake_up(&ticket->wait);
981985

982986
/*
@@ -985,7 +989,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
985989
* here to see if we can make progress with the next ticket in
986990
* the list.
987991
*/
988-
btrfs_try_granting_tickets(fs_info, space_info);
992+
if (!aborted)
993+
btrfs_try_granting_tickets(fs_info, space_info);
989994
}
990995
return (tickets_id != space_info->tickets_id);
991996
}
@@ -1253,6 +1258,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
12531258
spin_unlock(&space_info->lock);
12541259
return;
12551260
}
1261+
1262+
/* Something happened, fail everything and bail. */
1263+
if (btrfs_has_fs_error(fs_info))
1264+
goto aborted_fs;
12561265
last_tickets_id = space_info->tickets_id;
12571266
spin_unlock(&space_info->lock);
12581267
}
@@ -1283,9 +1292,19 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work)
12831292
} else {
12841293
flush_state = 0;
12851294
}
1295+
1296+
/* Something happened, fail everything and bail. */
1297+
if (btrfs_has_fs_error(fs_info))
1298+
goto aborted_fs;
1299+
12861300
}
12871301
spin_unlock(&space_info->lock);
12881302
}
1303+
return;
1304+
aborted_fs:
1305+
maybe_fail_all_tickets(fs_info, space_info);
1306+
space_info->flush = 0;
1307+
spin_unlock(&space_info->lock);
12891308
}
12901309

12911310
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)

0 commit comments

Comments
 (0)