From: Doug Ledford <dledford@redhat.com> Date: Mon, 25 Aug 2008 12:58:16 -0400 Subject: [md] fix error propogation in raid arrays Message-id: 1219683496.16580.2.camel@firewall.xsintricity.com O-Subject: [RHEL5.3 patch] Fix error propogation in md raid arrays Bugzilla: 430984 RH-Acked-by: Pete Zaitcev <zaitcev@redhat.com> Given certain circumstances, the md raid stack would incorrectly propogate an error condition up above the block layer (usually resulting in ext3 remounting the fs read-only) even when there was still a good device to use (specifically, this could happen when a read error was recognized after a write error had already kicked the device that gave the read error out of the array). This resolves bugzilla 430984. I tested this over the weekend on a stress test and it never once failed to work as it should. Patch attached, or it can also be retrieved via git as such: git pull git://git.engineering.redhat.com/users/dledford/rhel5-kernel.git bz430984 Commit from that branch is also attached here. -- Doug Ledford <dledford@redhat.com> GPG KeyID: CFBFF194 http://people.redhat.com/dledford Infiniband specific RPMs available at http://people.redhat.com/dledford/Infiniband commit 9e2e54ae8b8be03cc896391cefed3342d8220623 Author: Doug Ledford <dledford@redhat.com> Date: Mon Aug 25 12:45:03 2008 -0400 Fix bogus error propogation In the raid stack, due to some racy conditions, it was possible that an error on one device would result in the error being propogated all the way up to the filesystem level when it shouldn't. This resolves that. Signed-off-by: Doug Ledford <dledford@redhat.com> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ccd24bf..dc45b3f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -271,21 +271,25 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int */ update_head_pos(mirror, r1_bio); - if (uptodate || conf->working_disks <= 1) { - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. + if (uptodate) + set_bit(R1BIO_Uptodate, &r1_bio->state); + else { + /* If all other devices have failed, we want to return + * the error upwards rather than fail the last device. + * Here we redefine "uptodate" to mean "Don't want to retry" */ - if (uptodate) - set_bit(R1BIO_Uptodate, &r1_bio->state); + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + if (r1_bio->mddev->degraded == conf->raid_disks || + (r1_bio->mddev->degraded == conf->raid_disks-1 && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + uptodate = 1; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + if (uptodate) raid_end_bio_io(r1_bio); - } else { + else { /* * oops, read error: */ @@ -929,7 +933,7 @@ static void status(struct seq_file *seq, mddev_t *mddev) int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); @@ -953,26 +957,28 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && conf->working_disks == 1) + && (conf->raid_disks - mddev->degraded) == 1) /* * Don't fail the drive, act as though we were just a * normal single drive */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - conf->working_disks--; + set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); - } - clear_bit(In_sync, &rdev->flags); - set_bit(Faulty, &rdev->flags); + } else + set_bit(Faulty, &rdev->flags); mddev->sb_dirty = 1; printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" " Operation continuing on %d devices\n", - bdevname(rdev->bdev,b), conf->working_disks); + bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); } static void print_conf(conf_t *conf) @@ -984,7 +990,7 @@ static void print_conf(conf_t *conf) printk("(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); @@ -1023,10 +1029,11 @@ static int raid1_spare_active(mddev_t *mddev) mdk_rdev_t *rdev = conf->mirrors[i].rdev; if (rdev && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } @@ -1897,15 +1904,11 @@ static int run(mddev_t *mddev) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; - if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) - conf->working_disks++; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (conf->working_disks == 1) - mddev->recovery_cp = MaxSector; spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -1913,11 +1916,6 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - if (!conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; - } mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -1930,6 +1928,13 @@ static int run(mddev_t *mddev) mddev->degraded++; } } + if (mddev->degraded == conf->raid_disks) { + printk(KERN_ERR "raid1: no operational mirrors for %s\n", + mdname(mddev)); + goto out_free_conf; + } + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; /* * find the first working one and use it as a starting point @@ -2048,7 +2053,7 @@ static int raid1_reshape(mddev_t *mddev) mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); int cnt, raid_disks; - + unsigned long flags; int d, d2; /* Cannot change chunk_size, layout, or level */ @@ -2107,7 +2112,9 @@ static int raid1_reshape(mddev_t *mddev) kfree(conf->poolinfo); conf->poolinfo = newpoolinfo; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); + spin_unlock_irqrestore(&conf->device_lock, flags); conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 115a6f8..5276ddf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -950,15 +950,17 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * really dead" tests... */ return; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; conf->working_disks--; + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_ERR, &mddev->recovery); } - clear_bit(In_sync, &rdev->flags); set_bit(Faulty, &rdev->flags); mddev->sb_dirty = 1; printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" @@ -1034,10 +1036,12 @@ static int raid10_spare_active(mddev_t *mddev) tmp = conf->mirrors + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { - conf->working_disks++; + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; - set_bit(In_sync, &tmp->rdev->flags); + conf->working_disks++; + spin_unlock_irqrestore(&conf->device_lock, flags); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4500660..c544680 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -636,7 +636,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, struct stripe_head *sh = bi->bi_private; raid5_conf_t *conf = sh->raid_conf; int disks = sh->disks, i; - unsigned long flags; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); if (bi->bi_size) @@ -654,7 +653,6 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, return 0; } - spin_lock_irqsave(&conf->device_lock, flags); if (!uptodate) md_error(conf->mddev, conf->disks[i].rdev); @@ -662,8 +660,7 @@ static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done, clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + release_stripe(sh); return 0; } @@ -697,11 +694,13 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) if (!test_bit(Faulty, &rdev->flags)) { mddev->sb_dirty = 1; - if (test_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); conf->working_disks--; mddev->degraded++; conf->failed_disks++; - clear_bit(In_sync, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery was running, make sure it aborts. */ @@ -3420,11 +3419,13 @@ static int raid5_spare_active(mddev_t *mddev) tmp = conf->disks + i; if (tmp->rdev && !test_bit(Faulty, &tmp->rdev->flags) - && !test_bit(In_sync, &tmp->rdev->flags)) { + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded--; conf->failed_disks--; conf->working_disks++; - set_bit(In_sync, &tmp->rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); } } print_raid5_conf(conf); @@ -3560,6 +3561,7 @@ static int raid5_start_reshape(mddev_t *mddev) struct list_head *rtmp; int spares = 0; int added_devices = 0; + unsigned long flags; if (mddev->degraded || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -3602,7 +3604,9 @@ static int raid5_start_reshape(mddev_t *mddev) break; } + spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; + spin_unlock_irqrestore(&conf->device_lock, flags); mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; mddev->sb_dirty = 1; diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index 3009c81..0a9ba7c 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h @@ -30,7 +30,6 @@ struct r1_private_data_s { mddev_t *mddev; mirror_info_t *mirrors; int raid_disks; - int working_disks; int last_used; sector_t next_seq_sect; spinlock_t device_lock;