From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 9 Nov 2009 23:18:50 -0500
Subject: [aio] implement request batching
Message-id: <x49bpjbffs5.fsf@segfault.boston.devel.redhat.com>
Patchwork-id: 21340
O-Subject: [RHEL 5 PATCH] aio: Implement request batching
Bugzilla: 532769
RH-Acked-by: Josef Bacik <josef@redhat.com>

Hi,

This is a backport of my upstream work to batch together AIO requests
in order to achieve better request merging and hence better disk
throughput:

commit cfb1e33eed48165763edc7a4a067cf5f74898d0b
Author: Jeff Moyer <jmoyer@redhat.com>
Date:   Fri Oct 2 18:57:36 2009 -0400

    aio: implement request batching

    Hi,

    Some workloads issue batches of small I/O, and the performance is
    poor due to the call to blk_run_address_space for every single
    iocb.  Nathan Roberts pointed this out, and suggested that by
    deferring this call until all I/Os in the iocb array are submitted
    to the block layer, we can realize some impressive performance
    gains (up to 30% for sequential 4k reads in batches of 16).

(Note: this is queued in Jens' block tree for 2.6.33.)

I tested this code on RHEL 5 against some internal SATA disks.  The
test was conducted using the veritable aio-stress utility, with an I/O
depth of 128 and a batch count of 16; record sizes were 4k.  There was
no noticeable difference in throughput, since the internal disk
subsystem is so slow.  Unfortunately, I don't have a RHEL 5 box hooked
up to decent storage to give this a good beating right now.  I posted
results for upstream kernels, and was also able to get some nice
results from the RHEL 4 version of this patch:

  vanilla: 79.34MB/s
  patched: 137.87MB/s

This fixes bug 532769.  Brew builds are available here for testing:

  http://download.devel.redhat.com/brewroot/scratch/jmoyer/task_2068499/

Comments, as always, are appreciated.

Cheers,
Jeff

Signed-off-by: Don Zickus <dzickus@redhat.com>
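For reference, the submission pattern that benefits looks roughly like
this from userspace.  This is a minimal sketch (not part of the patch)
mirroring the aio-stress workload described above: 16 sequential 4k
reads handed to a single io_submit() call against a depth-128 context.
The file argument, alignment choice, and error handling are
illustrative only:

#define _GNU_SOURCE		/* for O_DIRECT */
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH	16
#define RECSZ	4096

int main(int argc, char **argv)
{
	struct iocb cbs[BATCH], *cbp[BATCH];
	struct io_event events[BATCH];
	io_context_t ctx = 0;
	void *buf[BATCH];
	int fd, i;

	if (argc < 2)
		exit(1);
	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0 || io_setup(128, &ctx) != 0) {	/* I/O depth of 128 */
		perror("setup");
		exit(1);
	}

	for (i = 0; i < BATCH; i++) {
		/* O_DIRECT wants aligned buffers; 4k works for most devices */
		if (posix_memalign(&buf[i], RECSZ, RECSZ))
			exit(1);
		/* sequential 4k reads at consecutive offsets */
		io_prep_pread(&cbs[i], fd, buf[i], RECSZ, (long long)i * RECSZ);
		cbp[i] = &cbs[i];
	}

	/* one system call submits the whole batch; this is the window in
	 * which the kernel can defer blk_run_address_space() */
	if (io_submit(ctx, BATCH, cbp) != BATCH)
		exit(1);

	if (io_getevents(ctx, BATCH, BATCH, events, NULL) != BATCH)
		exit(1);

	io_destroy(ctx);
	return 0;
}

Before the patch, each of those 16 iocbs triggered its own call to
blk_run_address_space(); after it, the queue is run once per unique
mapping when the whole array has been submitted.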
diff --git a/fs/aio.c b/fs/aio.c
index 7c07b3e..0a3efb7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,11 @@
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#ifndef __GENKSYMS__
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
+#endif
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -58,6 +63,21 @@ static DECLARE_WORK(fput_work, aio_fput_routine, NULL);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+/*
+ * TODO: If we want to be clever, we could determine the size of the
+ * hash based on the nr of iocbs passed to io_submit.  This would mean
+ * a switch from the on-stack allocation, and a method to determine
+ * optimal hash size based on number of elements.
+ *	-JEM
+ */
+#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+	struct hlist_node list;
+	struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(void *);
 static void aio_queue_work(struct kioctx *);
 
@@ -73,6 +93,8 @@ static int __init aio_setup(void)
 				0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	aio_wq = create_workqueue("aio");
+	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+	BUG_ON(!abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -1484,8 +1506,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode,
 	return 1;
 }
 
+static void aio_batch_add(struct address_space *mapping,
+			  struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos;
+	unsigned bucket;
+
+	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+		if (abe->mapping == mapping)
+			return;
+	}
+
+	abe = mempool_alloc(abe_pool, GFP_KERNEL);
+	BUG_ON(!igrab(mapping->host));
+	abe->mapping = mapping;
+	hlist_add_head(&abe->list, &batch_hash[bucket]);
+	return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos, *n;
+	int i;
+
+	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+			blk_run_address_space(abe->mapping);
+			iput(abe->mapping->host);
+			hlist_del(&abe->list);
+			mempool_free(abe, abe_pool);
+		}
+	}
+}
+
 int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			   struct iocb *iocb)
+			   struct iocb *iocb, struct hlist_head *batch_hash)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1549,6 +1607,8 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
+	aio_batch_add(file->f_mapping, batch_hash);
+
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
 
@@ -1576,6 +1636,7 @@ asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
+	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1607,10 +1668,11 @@ asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
 		if (ret)
 			break;
 	}
+	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ? i : ret;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9f53c68..01f0592 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1041,9 +1041,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->bio)
 		dio_bio_submit(dio);
 
-	/* All IO is now issued, send it on its way */
-	blk_run_address_space(inode->i_mapping);
-
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
@@ -1070,8 +1067,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	    ((rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
-	if (ret != -EIOCBQUEUED)
+	if (ret != -EIOCBQUEUED) {
+		/* All IO is now issued, send it on its way */
+		blk_run_address_space(inode->i_mapping);
 		dio_await_completion(dio);
+	}
 
 	/*
 	 * Sync will always be dropping the final ref and completing the
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 00c8efa..653758a 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -205,12 +205,13 @@ struct mm_struct;
 extern void FASTCALL(exit_aio(struct mm_struct *mm));
 extern struct kioctx *lookup_ioctx(unsigned long ctx_id);
 extern int FASTCALL(io_submit_one(struct kioctx *ctx,
-			struct iocb __user *user_iocb, struct iocb *iocb));
+			struct iocb __user *user_iocb, struct iocb *iocb,
+			struct hlist_head *batch_hash));
 
 /* semi private, but used by the 32bit emulations: */
 struct kioctx *lookup_ioctx(unsigned long ctx_id);
 int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			  struct iocb *iocb));
+			  struct iocb *iocb, struct hlist_head *batch_hash));
 
 #define get_ioctx(kioctx) do {						\
 	BUG_ON(unlikely(atomic_read(&(kioctx)->users) <= 0));		\
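For anyone reviewing the hashing logic above, here is a rough userspace
analogue of the on-stack hash that aio_batch_add()/aio_batch_free() use
to run each unique address_space exactly once per batch.  All names here
are hypothetical, and plain malloc/singly-linked buckets stand in for
the kernel's mempool and hlist primitives; the hash function is a crude
stand-in for the kernel's hash_ptr():

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define BATCH_HASH_BITS	3	/* small, lives on the stack */
#define BATCH_HASH_SIZE	(1 << BATCH_HASH_BITS)

struct batch_entry {
	struct batch_entry *next;
	void *key;		/* plays the role of the address_space pointer */
};

/* crude stand-in for the kernel's hash_ptr() */
static unsigned bucket_of(void *p)
{
	return (unsigned)(((uintptr_t)p >> 4) & (BATCH_HASH_SIZE - 1));
}

/* record key unless its bucket already holds it (cf. aio_batch_add) */
static void batch_add(struct batch_entry **hash, void *key)
{
	unsigned b = bucket_of(key);
	struct batch_entry *e;

	for (e = hash[b]; e; e = e->next)
		if (e->key == key)
			return;		/* mapping already batched */

	e = malloc(sizeof(*e));
	if (!e)
		abort();
	e->key = key;
	e->next = hash[b];
	hash[b] = e;
}

/* visit every unique key once, then tear down (cf. aio_batch_free) */
static void batch_free(struct batch_entry **hash)
{
	int i;

	for (i = 0; i < BATCH_HASH_SIZE; i++) {
		while (hash[i]) {
			struct batch_entry *e = hash[i];
			hash[i] = e->next;
			/* blk_run_address_space() analogue */
			printf("flush %p\n", e->key);
			free(e);
		}
	}
}

int main(void)
{
	struct batch_entry *hash[BATCH_HASH_SIZE] = { 0 };
	int a, b;

	/* two iocbs against the same file collapse to one flush */
	batch_add(hash, &a);
	batch_add(hash, &a);
	batch_add(hash, &b);
	batch_free(hash);	/* prints exactly two unique keys */
	return 0;
}

The dedup-on-insert step is what keeps the cost of the final flush pass
proportional to the number of distinct files in the batch rather than
the number of iocbs.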