From: Eric Sandeen <sandeen@redhat.com> Date: Mon, 14 Dec 2009 19:17:57 -0500 Subject: [fs] ext3: replace lock_super with explicit resize lock Message-id: <4B268F65.6060300@redhat.com> Patchwork-id: 21929 O-Subject: [PATCH RHEL5.5] ext3: Replace lock/unlock_super() with an explicit lock for resizing Bugzilla: 525100 RH-Acked-by: Josef Bacik <josef@redhat.com> This is for bug 525100, "resize2fs online resize hangs" At issue is a deadlock between orphan list management wanting to take the sb_lock, an resize wanting that as well, and getting sideways with journal locking & extension in extend_or_restart_transaction() Upstream has a similar patch to this in ext4, though it was done out of tidiness rather than to address this issue I think. I've just sent this (and related) patches upstream for ext3; this one change should suffice for the bug at hand. Thanks also to Josef for looking at the deadlock with me. :) I've tested this against the testcase mentioned in the bug. Upstream commit message follows: Use a separate lock to protect s_groups_count and the other block group descriptors which get changed via an on-line resize operation, so we can stop overloading the use of lock_super(). Crossport of 32ed5058ce90024efcd811254b4b1de0468099df Thanks, -Eric Signed-off-by: Don Zickus <dzickus@redhat.com> diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index e9a7a54..32b4b66 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -211,7 +211,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -326,7 +326,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -664,11 +664,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -828,7 +829,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); @@ -859,7 +860,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -905,12 +906,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -943,7 +944,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) sb->s_dirt = 1; exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -980,7 +981,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); o_groups_count = EXT3_SB(sb)->s_groups_count; @@ -1052,11 +1053,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); err = -EBUSY; goto exit_put; } @@ -1065,14 +1066,14 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_SB(sb)->s_sbh))) { ext3_warning(sb, __FUNCTION__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); sb->s_dirt = 1; - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2663e45..f78aae9 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1723,6 +1723,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->dq_op = &ext3_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index f61309c..2667379 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -68,6 +68,7 @@ struct ext3_sb_info { struct inode * s_journal_inode; struct journal_s * s_journal; struct list_head s_orphan; + struct mutex s_resize_lock; unsigned long s_commit_interval; struct block_device *journal_bdev; #ifdef CONFIG_JBD_DEBUG