From: Eric Sandeen <sandeen@redhat.com> Date: Thu, 2 Sep 2010 17:43:15 -0400 Subject: [fs] direct-io: move aio_complete into ->end_io Message-id: <4C7FE233.8010401@redhat.com> Patchwork-id: 28024 O-Subject: [RHEL5.6 PATCH 1/3] direct-io: move aio_complete into ->end_io Bugzilla: 617690 RH-Acked-by: Dave Chinner <dchinner@redhat.com> For Bug 617690 - ext4 and xfs wrong data returned on read after write if file size was changed with ftruncate See also kernel.org bz https://bugzilla.kernel.org/show_bug.cgi?id=16165 Backport of this upstream commit: commit 40e2e97316af6e62affab7a392e792494b8d9dde Author: Christoph Hellwig <hch@infradead.org> Date: Sun Jul 18 21:17:09 2010 +0000 direct-io: move aio_complete into ->end_io Filesystems with unwritten extent support must not complete an AIO request until the transaction to convert the extent has been commited. That means the aio_complete calls needs to be moved into the ->end_io callback so that the filesystem can control when to call it exactly. This makes a bit of a mess out of dio_complete and the ->end_io callback prototype even more complicated. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Signed-off-by: Alex Elder <aelder@sgi.com> This required some KABI tricks because it changed dio_iodone_t, which is part of the signature of __blockdev_direct_IO(). To work around this I've used yet another fs_flag so that ext4 & xfs can define an iodone handler of dio_iodone2_t, cast that into dio_iodone_t for calls into blockdev_direct_IO*(), and then dio_complete() looks at the fs_flag to know whether it should cast it back into the new dio_iodone2_t type and call it with more args or just go the old route. Thanks, -Eric Signed-off-by: Jarod Wilson <jarod@redhat.com> diff --git a/fs/direct-io.c b/fs/direct-io.c index 01f0592..44c9378 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -216,7 +216,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -237,13 +237,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - if (dio->lock_type == DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -251,6 +244,23 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + if (dio->inode->i_sb->s_type->fs_flags & FS_HAS_IODONE2) { + dio_iodone2_t *end_io = (dio_iodone2_t *)dio->end_io; + + end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->lock_type == DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +287,7 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } @@ -1089,7 +1098,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5daef8e..b8e48ea 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3732,7 +3732,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct ext4_inode_info *ei; @@ -3741,7 +3742,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ei = EXT4_I(io_end->inode); @@ -3754,7 +3755,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != DIO_AIO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - return; + goto out; } io_end->offset = offset; @@ -3770,6 +3771,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); } /* * For ext4 extent files, ext4 will do direct-io write to holes, @@ -3841,7 +3845,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, inode->i_sb->s_bdev, iov, offset, nr_segs, ext4_get_block_dio_write, - ext4_end_io_dio); + (dio_iodone_t *)ext4_end_io_dio); if (iocb->private) EXT4_I(inode)->cur_aio_dio = NULL; /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e631779..1de676d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4032,7 +4032,8 @@ static struct file_system_type ext4_fs_type = { .get_sb = ext4_get_sb, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_HAS_FALLOCATE|FS_HAS_FIEMAP - |FS_HAS_FREEZE|FS_HAS_TRYTOFREE|FS_HAS_GETRESV, + |FS_HAS_FREEZE|FS_HAS_TRYTOFREE|FS_HAS_GETRESV + |FS_HAS_IODONE2, }; #ifdef CONFIG_EXT4DEV_COMPAT @@ -4052,7 +4053,8 @@ static struct file_system_type ext4dev_fs_type = { .get_sb = ext4dev_get_sb, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_HAS_FALLOCATE|FS_HAS_FIEMAP - |FS_HAS_FREEZE|FS_HAS_TRYTOFREE|FS_HAS_GETRESV, + |FS_HAS_FREEZE|FS_HAS_TRYTOFREE|FS_HAS_GETRESV + |FS_HAS_IODONE2, }; MODULE_ALIAS("ext4dev"); #endif diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f1d1c34..4795179 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -608,7 +608,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_dentry->d_inode; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b2062f3..0f334b5 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1433,7 +1433,9 @@ xfs_end_io_direct( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private) + void *private, + int ret, + bool is_async) { xfs_ioend_t *ioend = iocb->private; @@ -1479,6 +1481,9 @@ xfs_end_io_direct( * against double-freeing. */ iocb->private = NULL; + + if (is_async) + aio_complete(iocb, ret, 0); } STATIC ssize_t @@ -1501,13 +1506,13 @@ xfs_vm_direct_IO( ret = blockdev_direct_IO_own_locking(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, - xfs_end_io_direct); + (dio_iodone_t *)xfs_end_io_direct); } else { iocb->private = xfs_alloc_ioend(inode, IOMAP_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, offset, nr_segs, xfs_get_blocks_direct, - xfs_end_io_direct); + (dio_iodone_t *)xfs_end_io_direct); } if (unlikely(ret != -EIOCBQUEUED && iocb->private)) diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 67d9810..14ee645 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -38,6 +38,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations_ext xfs_address_space_operations; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index ee26f44..6c0745e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1890,7 +1890,8 @@ static struct file_system_type xfs_fs_type = { .get_sb = xfs_fs_get_sb, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_HAS_FALLOCATE| - FS_HAS_FIEMAP|FS_HAS_FREEZE, + FS_HAS_FIEMAP|FS_HAS_FREEZE| + FS_HAS_IODONE2, }; STATIC int __init diff --git a/include/linux/fs.h b/include/linux/fs.h index 670b760..8f43fcc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -97,6 +97,7 @@ extern int dir_notify_enable; #define FS_HAS_FREEZE 16 /* Safe to check for ->freeze_fs etc */ #define FS_HAS_TRYTOFREE 32 /* Safe to check for ->bdev_try_to_free... */ #define FS_HAS_GETRESV 64 /* Safe to check for ->get_reserved_space */ +#define FS_HAS_IODONE2 128 /* dio->io_done is type dio_iodone2_t */ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() * during rename() internally. @@ -324,6 +325,9 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); +typedef void (dio_iodone2_t)(struct kiocb *iocb, loff_t offset, + ssize_t bytes, void *private, int ret, + bool is_async); /* * Attribute flags. These should be or-ed together to figure out what