From: Doug Ledford <dledford@redhat.com> Date: Fri, 24 Apr 2009 16:16:39 -0400 Subject: [md] fix lockup on read error Message-id: C3D8F332-FEEC-4F72-ADE8-6FCF51FE2AF1@redhat.com O-Subject: [Patch RHEL5.4] md: fix lockup on read error Bugzilla: 465781 RH-Acked-by: David Milburn <dmilburn@redhat.com> RH-Acked-by: John Feeney <jfeeney@redhat.com> This patch resolves bugzilla 465781 and has been partner verified. I've reviewed and brew built the patch, but testing will have to wait until my md raid box is up and running again (CPU failure with an old enough machine that I couldn't get a CPU for that motherboard any more). I expect personal test results back sometime next week. commit f8221c8c23fdcc8fdbf4be9c23397e309e4d89ae Author: Doug Ledford <dledford@redhat.com> Date: Fri Apr 24 10:40:49 2009 -0400 [md raid1] Fix deadlock on read error Original patch from Bryn Reeves. Tested by customer and fix confirmed. Reviewed by me and passes review. Bugzilla 465781 Signed-off-by: Doug Ledford <dledford@redhat.com> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dc45b3f..ac323a1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -605,6 +605,38 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, return ret; } + +static int flush_pending_writes(conf_t *conf) +{ + /* Any writes that have been queued but are awaiting + * bitmap updates get flushed here. + * We return 1 if any requests were actually submitted. + */ + int rv = 0; + + spin_lock_irq(&conf->device_lock); + + if (conf->pending_bio_list.head) { + struct bio *bio; + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(conf->mddev->queue); + spin_unlock_irq(&conf->device_lock); + /* flush any pending bitmap writes to + * disk before proceeding w/ I/O */ + bitmap_unplug(conf->mddev->bitmap); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + rv = 1; + } else + spin_unlock_irq(&conf->device_lock); + return rv; +} + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -694,7 +726,8 @@ static void freeze_array(conf_t *conf) wait_event_lock_irq(conf->wait_barrier, conf->barrier+conf->nr_pending == conf->nr_queued+2, conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + ({ flush_pending_writes(conf); + raid1_unplug(conf->mddev->queue); })); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(conf_t *conf) @@ -919,6 +952,9 @@ static int make_request(request_queue_t *q, struct bio * bio) blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); + /* In case raid1d snuck into freeze_array */ + wake_up(&conf->wait_barrier); + #if 0 while ((bio = bio_list_pop(&bl)) != NULL) generic_make_request(bio); @@ -1402,29 +1438,14 @@ static void raid1d(mddev_t *mddev) for (;;) { char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - if (conf->pending_bio_list.head) { - bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - if (bitmap_unplug(mddev->bitmap) != 0) - printk("%s: bitmap file write failed!\n", mdname(mddev)); + unplug += flush_pending_writes(conf); - while (bio) { /* submit pending writes */ - struct bio *next = bio->bi_next; - bio->bi_next = NULL; - generic_make_request(bio); - bio = next; - } - unplug = 1; - - continue; - } - - if (list_empty(head)) + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&conf->device_lock, flags); break; + } r1_bio = list_entry(head->prev, r1bio_t, retry_list); list_del(head->prev); conf->nr_queued--; @@ -1590,7 +1611,6 @@ static void raid1d(mddev_t *mddev) } } } - spin_unlock_irqrestore(&conf->device_lock, flags); if (unplug) unplug_slaves(mddev); }