--- drivers/md/dm-raid1.c | 247 +++++++++++++++++----------- drivers/md/dm-region-hash.c | 380 ++++++++++++++++++++++---------------------- 2 files changed, 344 insertions(+), 283 deletions(-) Index: linux/drivers/md/dm-raid1.c =================================================================== --- linux.orig/drivers/md/dm-raid1.c 2008-10-21 14:55:44.000000000 +0100 +++ linux/drivers/md/dm-raid1.c 2008-10-21 14:59:06.000000000 +0100 @@ -83,14 +76,14 @@ struct mirror_set { struct work_struct trigger_event; - unsigned nr_mirrors; + unsigned nr_mirrors; struct mirror mirror[0]; }; -static void wakeup_mirrord(void *context) +static void wake(void *context) { struct mirror_set *ms = context; queue_work(ms->kmirrord_wq, &ms->kmirrord_work); } @@ -98,7 +92,7 @@ static void delayed_wake_fn(unsigned lon struct mirror_set *ms = (struct mirror_set *) data; clear_bit(0, &ms->timer_pending); - wakeup_mirrord(ms); + wake(ms); } static void delayed_wake(struct mirror_set *ms) @@ -112,11 +106,35 @@ static void delayed_wake(struct mirror_s add_timer(&ms->timer); } -static void wakeup_all_recovery_waiters(void *context) +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { - wake_up_all(&_kmirrord_recovery_stopped); + unsigned long flags; + int should_wake = 0; + struct bio_list *bl; + + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock_irqsave(&ms->lock, flags); + should_wake = !bl->head; + bio_list_add(bl, bio); + spin_unlock_irqrestore(&ms->lock, flags); + + if (should_wake) + wake(ms); } +static void dispatch_bios(void *context, struct bio_list *bio_list, int error) +{ + struct mirror_set *ms = context; + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + queue_bio(ms, bio, WRITE); +} + +static region_t bio_to_region(struct mirror_set *ms, struct bio *bio) +{ + return dm_rh_sector_to_region(ms->rh, bio->bi_sector - ms->ti->begin); +} #define MIN_READ_RECORDS 20 struct dm_raid1_read_record { @@ -146,7 +164,7 @@ static void bio_set_m(struct bio *bio, s static struct mirror *get_default_mirror(struct mirror_set *ms) { - return &ms->mirror[atomic_read(&ms->default_mirror)]; + return ms->mirror + atomic_read(&ms->default_mirror); } static void set_default_mirror(struct mirror *m) @@ -209,7 +227,7 @@ static void fail_mirror(struct mirror *m } if (unlikely(new == ms->mirror + ms->nr_mirrors)) - DMWARN("All sides of mirror have failed."); + DMWARN("All mirrors have failed."); out: schedule_work(&ms->trigger_event); @@ -226,8 +244,7 @@ static void recovery_complete(int read_e void *context) { struct dm_region *reg = context; - struct mirror_set *ms = dm_rh_region_context(reg); - int m, bit = 0; + struct mirror_set *ms = dm_rh_get_region_context(reg); if (read_err) { /* Read error means the failure of default mirror. */ @@ -236,15 +253,18 @@ static void recovery_complete(int read_e } if (write_err) { + int bit, m; + DMERR_LIMIT("Write error during recovery (error = 0x%lx)", write_err); /* * Bits correspond to devices (excluding default mirror). * The default mirror cannot change during recovery. 
*/ - for (m = 0; m < ms->nr_mirrors; m++) { + for (bit = m = 0; m < ms->nr_mirrors; m++) { if (&ms->mirror[m] == get_default_mirror(ms)) continue; + if (test_bit(bit, &write_err)) fail_mirror(ms->mirror + m, DM_RAID1_SYNC_ERROR); @@ -252,27 +272,27 @@ static void recovery_complete(int read_e } } - dm_rh_recovery_end(reg, !(read_err || write_err)); + dm_rh_recovery_end(ms->rh, reg, read_err || write_err); } static int recover(struct mirror_set *ms, struct dm_region *reg) { int r; unsigned i; - struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; - struct mirror *m; unsigned long flags = 0; region_t key = dm_rh_get_region_key(reg); - sector_t region_size = dm_rh_get_region_size(ms->rh); + struct dm_region_hash *rh = ms->rh; + sector_t region_size = dm_rh_get_region_size(rh); + struct mirror *m; + struct dm_io_region from, to[ms->nr_mirrors - 1], *dest; /* fill in the source */ m = get_default_mirror(ms); from.bdev = m->dev->bdev; - from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); - if (key == (ms->nr_regions - 1)) { + from.sector = m->offset + dm_rh_region_to_sector(rh, key); + if (key == ms->nr_regions - 1) { /* - * The final region may be smaller than - * region_size. + * The final region may be smaller than region_size. */ from.count = ms->ti->len & (region_size - 1); if (!from.count) @@ -282,28 +302,31 @@ static int recover(struct mirror_set *ms /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (&ms->mirror[i] == get_default_mirror(ms)) + m = ms->mirror + i; + if (m == get_default_mirror(ms)) continue; - m = ms->mirror + i; dest->bdev = m->dev->bdev; - dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + dest->sector = m->offset + dm_rh_region_to_sector(rh, key); dest->count = from.count; dest++; } - /* hand to kcopyd */ + /* Keep mirror set reference in region context for callback function. */ + dm_rh_set_region_context(reg, ms); + + /* Hand to kcopyd. */ if (!errors_handled(ms)) set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); - return r; } static void do_recovery(struct mirror_set *ms) { + struct dm_region_hash *rh = ms->rh; struct dm_region *reg; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); int r; @@ -311,15 +334,17 @@ static void do_recovery(struct mirror_se /* * Start quiescing some regions. */ - dm_rh_recovery_prepare(ms->rh); + r = dm_rh_recovery_prepare(rh); + if (r == -ESRCH) + wake_up_all(&_kmirrord_recovery_stopped); /* * Copy any already quiesced regions. */ - while ((reg = dm_rh_recovery_start(ms->rh))) { + while ((reg = dm_rh_recovery_start(rh))) { r = recover(ms, reg); if (r) - dm_rh_recovery_end(reg, 0); + dm_rh_recovery_end(rh, reg, r); } /* @@ -361,7 +386,7 @@ static int default_ok(struct mirror *m) static int mirror_available(struct mirror_set *ms, struct bio *bio) { struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - region_t region = dm_rh_bio_to_region(ms->rh, bio); + region_t region = dm_rh_bio_to_region(ms->rh, bio); if (log->type->in_sync(log, region, 0)) return choose_mirror(ms, bio->bi_sector) ? 
1 : 0; @@ -391,9 +416,6 @@ static void map_region(struct dm_io_regi io->count = bio->bi_size >> 9; } -/* FIXME move this */ -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); - /*----------------------------------------------------------------- * Reads *---------------------------------------------------------------*/ @@ -440,16 +462,17 @@ static void read_async_bio(struct mirror map_region(&io, m, bio); bio_set_m(bio, m); - BUG_ON(dm_io(&io_req, 1, &io, NULL)); + BUG_ON(dm_io(&io_req, 1, &io, NULL)); } -static inline int region_in_sync(struct mirror_set *ms, region_t region, - int may_block) +static inline int region_in_sync(struct dm_region_hash *rh, + region_t region, int may_block) { - int state = dm_rh_get_state(ms->rh, region, may_block); + int state = dm_rh_get_state(rh, region, may_block); return state == DM_RH_CLEAN || state == DM_RH_DIRTY; } + static void do_reads(struct mirror_set *ms, struct bio_list *reads) { region_t region; @@ -457,13 +480,13 @@ static void do_reads(struct mirror_set * struct mirror *m; while ((bio = bio_list_pop(reads))) { - region = dm_rh_bio_to_region(ms->rh, bio); + region = dm_rh_bio_to_region(ms->rh, bio); m = get_default_mirror(ms); /* * We can only read balance if the region is in sync. */ - if (likely(region_in_sync(ms, region, 1))) + if (likely(region_in_sync(ms->rh, region, 1))) m = choose_mirror(ms, bio->bi_sector); else if (m && atomic_read(&m->error_count)) m = NULL; @@ -486,6 +509,56 @@ static void do_reads(struct mirror_set * * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ +/* __bio_mark_nosync + * @ms + * @bio + * @done + * @error + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +static void __bio_mark_nosync(struct mirror_set *ms, struct bio *bio) +{ + struct dm_region_hash *rh = ms->rh; + region_t region = bio_to_region(rh, bio); + int recovering = dm_rh_get_state(rh, region, 0) == DM_RH_RECOVERING; + + ms->in_sync = 0; + + /* + * Region hash entry should exist because write was in-flight. + * + * The log'll be informed about the state change via the region hash. + */ + dm_rh_set_state(rh, region, DM_RH_NOSYNC, 0); + + /* + * Possible cases: + * 1) DM_RH_DIRTY + * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed + * 3) DM_RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + bio_endio(bio, 0); + if (recovering) { + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. 
+ */ + dm_rh_dispatch_bios(rh, region, 0); + if (!dm_rh_recovery_in_flight(rh)) + wake_up_all(&_kmirrord_recovery_stopped); + } +} static void write_callback(unsigned long error, void *context) { @@ -527,10 +600,11 @@ static void write_callback(unsigned long spin_lock_irqsave(&ms->lock, flags); if (!ms->failures.head) should_wake = 1; + bio_list_add(&ms->failures, bio); spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) - wakeup_mirrord(ms); + wake(ms); return; } out: @@ -559,13 +633,13 @@ static void do_write(struct mirror_set * * to the mirror set in write_callback(). */ bio_set_m(bio, get_default_mirror(ms)); - - BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); + BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) { - int state; + enum dm_rh_region_states state; + struct dm_region_hash *rh = ms->rh; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; @@ -580,8 +654,7 @@ static void do_writes(struct mirror_set bio_list_init(&recover); while ((bio = bio_list_pop(writes))) { - state = dm_rh_get_state(ms->rh, - dm_rh_bio_to_region(ms->rh, bio), 1); + state = dm_rh_get_state(rh, bio_to_region(ms, bio), 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY: @@ -605,9 +678,9 @@ static void do_writes(struct mirror_set * be written to (writes to recover regions are going to * be delayed). */ - dm_rh_inc_pending(ms->rh, &sync); - dm_rh_inc_pending(ms->rh, &nosync); - ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0; + dm_rh_inc_pending(rh, &sync); + dm_rh_inc_pending(rh, &nosync); + ms->log_failure = dm_rh_flush(rh) ? 1 : 0; /* * Dispatch io. @@ -616,13 +689,13 @@ static void do_writes(struct mirror_set spin_lock_irq(&ms->lock); bio_list_merge(&ms->failures, &sync); spin_unlock_irq(&ms->lock); - wakeup_mirrord(ms); + wake(ms); } else while ((bio = bio_list_pop(&sync))) do_write(ms, bio); while ((bio = bio_list_pop(&recover))) - dm_rh_delay(ms->rh, bio); + dm_rh_delay(rh, bio); while ((bio = bio_list_pop(&nosync))) { map_bio(get_default_mirror(ms), bio); @@ -639,8 +712,7 @@ static void do_failures(struct mirror_se if (!ms->log_failure) { while ((bio = bio_list_pop(failures))) - ms->in_sync = 0; - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); + __bio_mark_nosync(ms, bio); return; } @@ -717,7 +789,6 @@ static void do_mirror(struct work_struct } -static void dispatch_bios(void *context, struct bio_list *bio_list); /*----------------------------------------------------------------- * Target functions *---------------------------------------------------------------*/ @@ -764,10 +835,8 @@ static struct mirror_set *alloc_context( return NULL; } - ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, - wakeup_all_recovery_waiters, - ms->ti->begin, MAX_RECOVERY, - dl, region_size, ms->nr_regions); + ms->rh = dm_region_hash_create(MAX_RECOVERY, dispatch_bios, ms, wake, + ms, dl, region_size, ms->nr_regions); if (IS_ERR(ms->rh)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); @@ -826,8 +895,8 @@ static int get_mirror(struct mirror_set * Create dirty log: log_type #log_params */ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, - unsigned argc, char **argv, - unsigned *args_used) + unsigned argc, char **argv, + unsigned *args_used) { unsigned param_count; struct dm_dirty_log *dl; @@ -995,11 +1064,11 @@ static int mirror_ctr(struct dm_target * goto err_destroy_wq; } - r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); + r = 
dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); if (r) goto err_destroy_wq; - wakeup_mirrord(ms); + wake(ms); return 0; err_destroy_wq: @@ -1020,31 +1089,6 @@ static void mirror_dtr(struct dm_target free_context(ms, ti, ms->nr_mirrors); } -static void dispatch_bios(void *context, struct bio_list *bio_list) -{ - struct mirror_set *ms = context; - struct bio *bio; - - while ((bio = bio_list_pop(bio_list))) - queue_bio(ms, bio, WRITE); -} - -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) -{ - unsigned long flags; - int should_wake = 0; - struct bio_list *bl; - - bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock_irqsave(&ms->lock, flags); - should_wake = !(bl->head); - bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - - if (should_wake) - wakeup_mirrord(ms); -} - /* * Mirror mapping function */ @@ -1059,12 +1103,12 @@ static int mirror_map(struct dm_target * if (rw == WRITE) { /* Save region for mirror_end_io() handler */ - map_context->ll = dm_rh_bio_to_region(ms->rh, bio); + map_context->ll = bio_to_region(ms->rh, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } - r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); + r = log->type->in_sync(log, bio_to_region(ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) return r; @@ -1112,7 +1156,11 @@ static int mirror_end_io(struct dm_targe * We need to dec pending if this was a write. */ if (rw == WRITE) { - dm_rh_dec(ms->rh, map_context->ll); + int r = dm_rh_dec(ms->rh, map_context->ll); + + if (r) + wake(ms); + return error; } @@ -1213,6 +1261,7 @@ static void mirror_resume(struct dm_targ if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); + dm_rh_start_recovery(ms->rh); } Index: linux/drivers/md/dm-region-hash.c =================================================================== --- linux.orig/drivers/md/dm-region-hash.c 2008-10-21 14:55:44.000000000 +0100 +++ linux/drivers/md/dm-region-hash.c 2008-10-21 14:59:06.000000000 +0100 @@ -21,9 +21,15 @@ /*----------------------------------------------------------------- * Region hash * - * The mirror splits itself up into discrete regions. Each - * region can be in one of three states: clean, dirty, - * nosync. There is no need to put clean regions in the hash. + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions. + * Each region can be in one of three states: + * + * o clean + * o dirty, + * o nosync. + * + * There is no need to put clean regions in the hash. + * * * In addition to being present in the hash table a region _may_ * be present on one of three lists. @@ -34,14 +40,13 @@ * hash table. * * quiesced_regions: These regions have been spun down, ready - * for recovery. rh_recovery_start() will remove regions from - * this list and hand them to kmirrord, which will schedule the - * recovery io with kcopyd. + * for recovery. dm_rh_recovery_start() will remove regions from + * this list and hand them to the caller, which will schedule the + * recovery io. * - * recovered_regions: Regions that kcopyd has successfully + * recovered_regions: Regions that the caller has successfully * recovered. dm_rh_update_states() will now schedule any delayed - * io, up the recovery_count, and remove the region from the - * hash. + * io, up the recovery_count, and remove the region from the hash. 
* * There are 2 locks: * A rw spin lock 'hash_lock' protects just the hash table, @@ -55,6 +60,14 @@ * context, so all other uses will have to suspend local irqs. *---------------------------------------------------------------*/ struct dm_region_hash { + /* Callback function to dispatch queued writes on recovered regions. */ + void (*dispatch)(void *context, struct bio_list *bios, int error); + void *dispatch_context; + + /* Callback function to wakeup callers worker thread. */ + void (*wake)(void *context); + void *wake_context; + uint32_t region_size; unsigned region_shift; @@ -79,24 +92,12 @@ struct dm_region_hash { struct list_head quiesced_regions; struct list_head recovered_regions; struct list_head failed_recovered_regions; - - void *context; - sector_t target_begin; - - /* Callback function to schedule bios writes */ - void (*dispatch_bios)(void *context, struct bio_list *bios); - - /* Callback function to wakeup callers worker thread. */ - void (*wakeup_workers)(void *context); - - /* Callback function to wakeup callers recovery waiters. */ - void (*wakeup_all_recovery_waiters)(void *context); }; struct dm_region { - struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ region_t key; - int state; + enum dm_rh_region_states state; + void *context; /* Caller context. */ struct list_head hash_list; struct list_head list; @@ -114,24 +115,21 @@ region_t dm_rh_sector_to_region(struct d } EXPORT_SYMBOL_GPL(dm_rh_sector_to_region); -sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) -{ - return region << rh->region_shift; -} -EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); - region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) { - return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); + return dm_rh_sector_to_region(rh, bio->bi_sector); } EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); -void *dm_rh_region_context(struct dm_region *reg) +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) { - return reg->rh->context; + return region << rh->region_shift; } -EXPORT_SYMBOL_GPL(dm_rh_region_context); +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); +/* + * Retrival fns. + */ region_t dm_rh_get_region_key(struct dm_region *reg) { return reg->key; @@ -144,35 +142,43 @@ sector_t dm_rh_get_region_size(struct dm } EXPORT_SYMBOL_GPL(dm_rh_get_region_size); +/* Squirrel a context with a region. */ +void *dm_rh_get_region_context(struct dm_region *region) +{ + return region->context; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_context); + +void dm_rh_set_region_context(struct dm_region *region, void *context) +{ + region->context = context; +} +EXPORT_SYMBOL_GPL(dm_rh_set_region_context); + /* - * FIXME: shall we pass in a structure instead of all these args to - * dm_region_hash_create()???? + * Create region hash client. 
*/ -#define RH_HASH_MULT 2654435387U -#define RH_HASH_SHIFT 12 - #define MIN_REGIONS 64 -struct dm_region_hash *dm_region_hash_create( - void *context, void (*dispatch_bios)(void *context, - struct bio_list *bios), - void (*wakeup_workers)(void *context), - void (*wakeup_all_recovery_waiters)(void *context), - sector_t target_begin, unsigned max_recovery, - struct dm_dirty_log *log, uint32_t region_size, - region_t nr_regions) +struct dm_region_hash *dm_region_hash_create(unsigned max_recovery, + void (*dispatch)(void *dispatch_context, + struct bio_list *bios, int error), + void *dispatch_context, + void (*wake)(void *wake_context), void *wake_context, + struct dm_dirty_log *log, uint32_t region_size, region_t nr_regions) { - struct dm_region_hash *rh; + unsigned i; unsigned nr_buckets, max_buckets; - size_t i; + unsigned hash_primes[] = { + /* Table of primes for rh_hash/table size optimization. */ + 3, 7, 13, 27, 53, 97, 193, 389, 769, + 1543, 3079, 6151, 12289, 24593, + }; + struct dm_region_hash *rh; - /* - * Calculate a suitable number of buckets for our hash - * table. - */ - max_buckets = nr_regions >> 6; - for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) - ; - nr_buckets >>= 1; + if (region_size & (region_size - 1)) { + DMERR("region size must be 2^^n"); + return ERR_PTR(-EINVAL); + } rh = kmalloc(sizeof(*rh), GFP_KERNEL); if (!rh) { @@ -180,21 +186,29 @@ struct dm_region_hash *dm_region_hash_cr return ERR_PTR(-ENOMEM); } - rh->context = context; - rh->dispatch_bios = dispatch_bios; - rh->wakeup_workers = wakeup_workers; - rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; - rh->target_begin = target_begin; rh->max_recovery = max_recovery; + rh->dispatch = dispatch; + rh->dispatch_context = dispatch_context; + rh->wake = wake; + rh->wake_context = wake_context; rh->log = log; rh->region_size = region_size; rh->region_shift = ffs(region_size) - 1; rwlock_init(&rh->hash_lock); - rh->mask = nr_buckets - 1; - rh->nr_buckets = nr_buckets; - rh->shift = RH_HASH_SHIFT; - rh->prime = RH_HASH_MULT; + /* Calculate a suitable number of buckets for our hash table. */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + rh->mask = rh->nr_buckets = nr_buckets; + rh->mask--; + rh->shift = ffs(nr_buckets); + + /* Check prime array limits. */ + i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ? + ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2; + rh->prime = hash_primes[i]; rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); if (!rh->buckets) { @@ -232,9 +246,9 @@ void dm_region_hash_destroy(struct dm_re struct dm_region *reg, *nreg; BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { - list_for_each_entry_safe(reg, nreg, rh->buckets + h, - hash_list) { + list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { BUG_ON(atomic_read(®->pending)); mempool_free(reg, rh->region_pool); } @@ -289,7 +303,6 @@ static struct dm_region *__rh_alloc(stru nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; - nreg->rh = rh; nreg->key = region; INIT_LIST_HEAD(&nreg->list); atomic_set(&nreg->pending, 0); @@ -355,88 +368,35 @@ int dm_rh_get_state(struct dm_region_has } EXPORT_SYMBOL_GPL(dm_rh_get_state); -static void complete_resync_work(struct dm_region *reg, int success) -{ - struct dm_region_hash *rh = reg->rh; - - rh->log->type->set_region_sync(rh->log, reg->key, success); - - /* - * Dispatch the bios before we call 'wake_up_all'. 
- * This is important because if we are suspending, - * we want to know that recovery is complete and - * the work queue is flushed. If we wake_up_all - * before we dispatch_bios (queue bios and call wake()), - * then we risk suspending before the work queue - * has been properly flushed. - */ - rh->dispatch_bios(rh->context, ®->delayed_bios); - if (atomic_dec_and_test(&rh->recovery_in_flight)) - rh->wakeup_all_recovery_waiters(rh->context); - up(&rh->recovery_count); -} - -/* dm_rh_mark_nosync - * @ms - * @bio - * @done - * @error - * - * The bio was written on some mirror(s) but failed on other mirror(s). - * We can successfully endio the bio but should avoid the region being - * marked clean by setting the state DM_RH_NOSYNC. - * - * This function is _not_ safe in interrupt context! - */ -void dm_rh_mark_nosync(struct dm_region_hash *rh, - struct bio *bio, unsigned done, int error) +void dm_rh_set_state(struct dm_region_hash *rh, region_t region, + enum dm_rh_region_states state, int may_block) { - unsigned long flags; - struct dm_dirty_log *log = rh->log; struct dm_region *reg; - region_t region = dm_rh_bio_to_region(rh, bio); - int recovering = 0; + struct dm_dirty_log *log = rh->log; - /* We must inform the log that the sync count has changed. */ - log->type->set_region_sync(log, region, 0); + if (state == DM_RH_NOSYNC) + log->type->set_region_sync(log, region, 0); + else if (state == DM_RH_CLEAN) + log->type->clear_region(log, region); + else if (state == DM_RH_DIRTY) + log->type->mark_region(log, region); read_lock(&rh->hash_lock); reg = __rh_find(rh, region); + reg->state = state; read_unlock(&rh->hash_lock); - - /* region hash entry should exist because write was in-flight */ - BUG_ON(!reg); - BUG_ON(!list_empty(®->list)); - - spin_lock_irqsave(&rh->region_lock, flags); - /* - * Possible cases: - * 1) DM_RH_DIRTY - * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed - * 3) DM_RH_RECOVERING: flushing pending writes - * Either case, the region should have not been connected to list. - */ - recovering = (reg->state == DM_RH_RECOVERING); - reg->state = DM_RH_NOSYNC; - BUG_ON(!list_empty(®->list)); - spin_unlock_irqrestore(&rh->region_lock, flags); - - bio_endio(bio, error); - if (recovering) - complete_resync_work(reg, 0); } -EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); +EXPORT_SYMBOL_GPL(dm_rh_set_state); void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) { struct dm_region *reg, *next; - LIST_HEAD(clean); LIST_HEAD(recovered); LIST_HEAD(failed_recovered); /* - * Quickly grab the lists. + * Quickly grab the lists and remove any regions from hash. */ write_lock_irq(&rh->hash_lock); spin_lock(&rh->region_lock); @@ -458,7 +418,7 @@ void dm_rh_update_states(struct dm_regio list_splice_init(&rh->failed_recovered_regions, &failed_recovered); - list_for_each_entry(reg, &failed_recovered, list) + list_for_each_entry(reg, &recovered, list) list_del(®->hash_list); } @@ -472,12 +432,24 @@ void dm_rh_update_states(struct dm_regio */ list_for_each_entry_safe(reg, next, &recovered, list) { rh->log->type->clear_region(rh->log, reg->key); - complete_resync_work(reg, 1); + rh->log->type->set_region_sync(rh->log, reg->key, 1); + + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, + ®->delayed_bios, 0); + + up(&rh->recovery_count); mempool_free(reg, rh->region_pool); } list_for_each_entry_safe(reg, next, &failed_recovered, list) { - complete_resync_work(reg, errors_handled ? 0 : 1); + rh->log->type->set_region_sync(rh->log, reg->key, + errors_handled ? 
0 : 1); + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, + ®->delayed_bios, -EIO); + + up(&rh->recovery_count); mempool_free(reg, rh->region_pool); } @@ -486,54 +458,53 @@ void dm_rh_update_states(struct dm_regio mempool_free(reg, rh->region_pool); } - rh->log->type->flush(rh->log); + dm_rh_flush(rh); } EXPORT_SYMBOL_GPL(dm_rh_update_states); -static void rh_inc(struct dm_region_hash *rh, region_t region) +void dm_rh_inc(struct dm_region_hash *rh, region_t region) { struct dm_region *reg; read_lock(&rh->hash_lock); reg = __rh_find(rh, region); - - spin_lock_irq(&rh->region_lock); - atomic_inc(®->pending); - if (reg->state == DM_RH_CLEAN) { - reg->state = DM_RH_DIRTY; - list_del_init(®->list); /* take off the clean list */ - spin_unlock_irq(&rh->region_lock); - rh->log->type->mark_region(rh->log, reg->key); - } else - spin_unlock_irq(&rh->region_lock); + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_DIRTY; + list_del_init(®->list); /* Take off the clean list. */ + spin_unlock_irq(&rh->region_lock); + } + atomic_inc(®->pending); read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_inc); void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) { struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) - rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } EXPORT_SYMBOL_GPL(dm_rh_inc_pending); -void dm_rh_dec(struct dm_region_hash *rh, region_t region) +int dm_rh_dec(struct dm_region_hash *rh, region_t region) { - unsigned long flags; + int r = 0; struct dm_region *reg; - int should_wake = 0; read_lock(&rh->hash_lock); reg = __rh_lookup(rh, region); read_unlock(&rh->hash_lock); - spin_lock_irqsave(&rh->region_lock, flags); + BUG_ON(!reg); + if (atomic_dec_and_test(®->pending)) { + unsigned long flags; + /* * There is no pending I/O for this region. * We can move the region to corresponding list for next action. @@ -545,19 +516,19 @@ void dm_rh_dec(struct dm_region_hash *rh * until the region is recovered or the map is reloaded. */ - /* do nothing for DM_RH_NOSYNC */ - if (reg->state == DM_RH_RECOVERING) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == DM_RH_RECOVERING) list_add_tail(®->list, &rh->quiesced_regions); - } else if (reg->state == DM_RH_DIRTY) { + else { reg->state = DM_RH_CLEAN; list_add(®->list, &rh->clean_regions); } - should_wake = 1; + spin_unlock_irqrestore(&rh->region_lock, flags); + + r = 1; } - spin_unlock_irqrestore(&rh->region_lock, flags); - if (should_wake) - rh->wakeup_workers(rh->context); + return r; } EXPORT_SYMBOL_GPL(dm_rh_dec); @@ -578,44 +549,49 @@ static int __rh_recovery_prepare(struct return r; /* - * Get this region, and start it quiescing by setting the - * recovering flag. + * Get this region, and start it quiescing by setting + * the recovering flag. */ read_lock(&rh->hash_lock); reg = __rh_find(rh, region); read_unlock(&rh->hash_lock); spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; /* Already quiesced ? 
*/ - if (atomic_read(®->pending)) - list_del_init(®->list); - else - list_move(®->list, &rh->quiesced_regions); + list_del_init(®->list); + if (!atomic_read(®->pending)) + list_add(®->list, &rh->quiesced_regions); spin_unlock_irq(&rh->region_lock); - return 1; } -void dm_rh_recovery_prepare(struct dm_region_hash *rh) +int dm_rh_recovery_prepare(struct dm_region_hash *rh) { - /* Extra reference to avoid race with dm_rh_stop_recovery */ + int r = 0; + + /* Extra reference to avoid race with rh_stop_recovery */ atomic_inc(&rh->recovery_in_flight); while (!down_trylock(&rh->recovery_count)) { atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { atomic_dec(&rh->recovery_in_flight); up(&rh->recovery_count); + r = -ENOENT; break; } } /* Drop the extra reference */ if (atomic_dec_and_test(&rh->recovery_in_flight)) - rh->wakeup_all_recovery_waiters(rh->context); + r = -ESRCH; + + return r; } EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); @@ -630,28 +606,32 @@ struct dm_region *dm_rh_recovery_start(s if (!list_empty(&rh->quiesced_regions)) { reg = list_entry(rh->quiesced_regions.next, struct dm_region, list); - list_del_init(®->list); /* remove from the quiesced list */ + list_del_init(®->list); /* Remove from the quiesced list. */ } - spin_unlock_irq(&rh->region_lock); + spin_unlock_irq(&rh->region_lock); return reg; } EXPORT_SYMBOL_GPL(dm_rh_recovery_start); -void dm_rh_recovery_end(struct dm_region *reg, int success) +/* + * Put region on list of recovered ones. + */ +void dm_rh_recovery_end(struct dm_region_hash *rh, struct dm_region *reg, + int error) { - struct dm_region_hash *rh = reg->rh; - spin_lock_irq(&rh->region_lock); - if (success) - list_add(®->list, ®->rh->recovered_regions); - else { + if (error) { reg->state = DM_RH_NOSYNC; - list_add(®->list, ®->rh->failed_recovered_regions); - } + list_add(®->list, &rh->failed_recovered_regions); + } else + list_add(®->list, &rh->recovered_regions); + + atomic_dec(&rh->recovery_in_flight); spin_unlock_irq(&rh->region_lock); - rh->wakeup_workers(rh->context); + rh->wake(rh->wake_context); + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0); } EXPORT_SYMBOL_GPL(dm_rh_recovery_end); @@ -668,21 +648,53 @@ int dm_rh_flush(struct dm_region_hash *r } EXPORT_SYMBOL_GPL(dm_rh_flush); -void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +void dm_rh_delay_by_region(struct dm_region_hash *rh, + struct bio *bio, region_t region) { struct dm_region *reg; + /* FIXME: locking. */ read_lock(&rh->hash_lock); - reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + reg = __rh_find(rh, region); bio_list_add(®->delayed_bios, bio); read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + return dm_rh_delay_by_region(rh, bio, + dm_rh_bio_to_region(rh, bio)); +} EXPORT_SYMBOL_GPL(dm_rh_delay); +void dm_rh_dispatch_bios(struct dm_region_hash *rh, + region_t region, int error) +{ + struct dm_region *reg; + struct bio_list delayed_bios; + + /* FIXME: locking. 
*/ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + BUG_ON(!reg); + delayed_bios = reg->delayed_bios; + bio_list_init(®->delayed_bios); + read_unlock(&rh->hash_lock); + + if (delayed_bios.head) + rh->dispatch(rh->dispatch_context, &delayed_bios, error); + + up(&rh->recovery_count); +} +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios); + void dm_rh_stop_recovery(struct dm_region_hash *rh) { int i; + rh->wake(rh->wake_context); + /* wait for any recovering regions */ for (i = 0; i < rh->max_recovery; i++) down(&rh->recovery_count); @@ -696,7 +708,7 @@ void dm_rh_start_recovery(struct dm_regi for (i = 0; i < rh->max_recovery; i++) up(&rh->recovery_count); - rh->wakeup_workers(rh->context); + rh->wake(rh->wake_context); } EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
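
A minimal sketch of how a client wires up the reworked region hash interface, modelled on the dm-raid1 hunks above. All "my_*" names are illustrative, MY_MAX_RECOVERY stands in for dm-raid1's MAX_RECOVERY, and the include names assume the dm-region-hash.h header added elsewhere in this series.

#include <linux/bio.h>
#include <linux/err.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/dm-dirty-log.h>
#include "dm-bio-list.h"	/* bio_list helpers (drivers/md local header) */
#include "dm-region-hash.h"	/* assumed name of the header from this series */

#define MY_MAX_RECOVERY 1	/* dm-raid1 uses MAX_RECOVERY == 1 */

struct my_set {
	struct dm_region_hash *rh;
	struct workqueue_struct *wq;	/* ctr/worker setup omitted in this sketch */
	struct work_struct work;
	struct bio_list writes;
	spinlock_t lock;
};

/* Wake callback: kick the owning worker thread (cf. wake() in dm-raid1). */
static void my_wake(void *context)
{
	struct my_set *s = context;

	queue_work(s->wq, &s->work);
}

/*
 * Dispatch callback: writes that were delayed on recovering regions come
 * back here.  dm-raid1 ignores the error argument and simply requeues
 * them as writes; this sketch does the same.
 */
static void my_dispatch(void *context, struct bio_list *bios, int error)
{
	struct my_set *s = context;
	struct bio *bio;
	unsigned long flags;

	while ((bio = bio_list_pop(bios))) {
		spin_lock_irqsave(&s->lock, flags);
		bio_list_add(&s->writes, bio);
		spin_unlock_irqrestore(&s->lock, flags);
	}

	my_wake(s);
}

static int my_create_region_hash(struct my_set *s, struct dm_dirty_log *log,
				 uint32_t region_size, region_t nr_regions)
{
	s->rh = dm_region_hash_create(MY_MAX_RECOVERY,
				      my_dispatch, s,	/* dispatch + its context */
				      my_wake, s,	/* wake + its context */
				      log, region_size, nr_regions);
	if (IS_ERR(s->rh))
		return PTR_ERR(s->rh);

	return 0;
}

Note that the single rh->context pointer is gone: dispatch and wake each carry their own context pointer, and per-region caller state now travels via dm_rh_set_region_context()/dm_rh_get_region_context(), as recover() and recovery_complete() do above.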