--- diff/drivers/md/dm-raid1.c	2004-04-21 10:57:33.000000000 +0100
+++ source/drivers/md/dm-raid1.c	2004-04-21 17:14:21.000000000 +0100
@@ -543,10 +543,24 @@ static void rh_start_recovery(struct reg
 /*-----------------------------------------------------------------
  * Mirror set structures.
  *---------------------------------------------------------------*/
+struct region_list {
+	region_t reg;
+	struct region_list *next;
+};
+
 struct mirror {
 	atomic_t error_count;
 	struct dm_dev *dev;
 	sector_t offset;
+
+	/*
+	 * FIXME: this list is very inefficient, we search it for
+	 * every write.  However this performance drop will only
+	 * be noticeable if you have serious problems with one of
+	 * your mirrors.  Optimise with a binary tree later.
+	 */
+	spinlock_t failed_lock;
+	struct region_list *failed_regions; /* FIXME: initialise this (and failed_lock) */
 };
 
 struct mirror_set {
@@ -727,10 +741,46 @@ static void do_reads(struct mirror_set *
  * RECOVERING:	delay the io until recovery completes
  * NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
+/*
+ * Note that 'reg' has failed on the given mirror.  May be called
+ * from irq context (the dm-io write callback), so we must not
+ * sleep: the allocation uses GFP_ATOMIC and the lock is taken
+ * without irqsave (is_region_failed() uses irqsave instead).
+ */
+static void fail_region(struct mirror_set *ms, int mirror, region_t reg)
+{
+	int found = 0;
+	struct region_list **rl;
+	struct mirror *m = &ms->mirrors[mirror];
+
+	/* we're in irq context */
+	spin_lock(&m->failed_lock);
+
+	/* is this region already on the failed list ? */
+	for (rl = &m->failed_regions; *rl; rl = &(*rl)->next)
+		if ((*rl)->reg == reg) {
+			found = 1;
+			break;
+		}
+
+	if (!found) {
+		/* GFP_ATOMIC: we cannot sleep here */
+		*rl = kmalloc(sizeof(**rl), GFP_ATOMIC);
+		if (*rl) {
+			(*rl)->reg = reg;
+			(*rl)->next = NULL;
+
+			/* trigger an event */
+			dm_table_event(ms->ti->table);
+		}
+	}
+
+	spin_unlock(&m->failed_lock);
+}
+
 static void write_callback(unsigned long error, void *context)
 {
 	unsigned int i;
-	int uptodate = 1;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
@@ -744,24 +794,48 @@ static void write_callback(unsigned long
 	 * regions with the same code.
 	 */
-	if (error) {
-		/*
-		 * only error the io if all mirrors failed.
-		 * FIXME: bogus
-		 */
-		uptodate = 0;
+	if (!error)
+		bio_endio(bio, bio->bi_size, 0);
+
+	else {
+		int succeeded = 0;
+
+		/* fail regions that have errored */
 		for (i = 0; i < ms->nr_mirrors; i++)
-			if (!test_bit(i, &error)) {
-				uptodate = 1;
-				break;
-			}
+			if (!test_bit(i, &error))
+				succeeded++;
+			else
+				fail_region(ms, i,
					    sector_to_region(bio->bi_sector));
+
+		/* only error the io if _all_ the regions are dead */
+		bio_endio(bio, bio->bi_size, succeeded ? 0 : -EIO);
 	}
-
-	bio_endio(bio, bio->bi_size, 0);
+}
+
+/*
+ * Returns non-zero if 'reg' is on the mirror's failed list.
+ * FIXME: see comment next to mirror.failed_regions.
+ */
+static int is_region_failed(struct mirror *m, region_t reg)
+{
+	int r = 0;
+	struct region_list *rl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->failed_lock, flags);
+	for (rl = m->failed_regions; rl; rl = rl->next)
+		if (rl->reg == reg) {
+			r = 1;
+			break;
+		}
+	spin_unlock_irqrestore(&m->failed_lock, flags);
+	return r;
 }
 
 static void do_write(struct mirror_set *ms, struct bio *bio)
 {
-	unsigned int i;
+	unsigned int i, dests = 0;
 	struct io_region io[ms->nr_mirrors];
 	struct mirror *m;
@@ -770,13 +844,33 @@ static void do_write(struct mirror_set *
 		io[i].bdev = m->dev->bdev;
 		io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
-		io[i].count = bio->bi_size >> 9;
+
+		/*
+		 * If this region has failed for a mirror we skip
+		 * writing to it by setting count to zero.
+		 */
+		if (is_region_failed(m, sector_to_region(bio->bi_sector)))
+			io[i].count = 0;
+
+		else {
+			io[i].count = bio->bi_size >> 9;
+			dests++;
+		}
 	}
 
 	bio_set_ms(bio, ms);
-	dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
-			 bio->bi_io_vec + bio->bi_idx,
-			 write_callback, bio);
+
+	/*
+	 * Issue the io as long as we have at least 1 destination.
+	 */
+	if (dests)
+		/* yes, I do mean 'ms->nr_mirrors' rather than 'dests' */
+		dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
+				 bio->bi_io_vec + bio->bi_idx,
+				 write_callback, bio);
+	else
+		/* we need to error this io */
+		bio_endio(bio, bio->bi_size, -EIO);
 }
 
 static void do_writes(struct mirror_set *ms, struct bio_list *writes)