Unreviewed part of the patch - eventually this shrinks to nothing. Pure refactoring parts move into the previous patch. Functional changes like the hash function changes need to be removed and placed in separate later patches. - AGK --- drivers/md/dm-raid1.c | 256 +++++++++++++++++++------------ drivers/md/dm-region-hash.c | 353 ++++++++++++++++++++++++++------------------ 2 files changed, 372 insertions(+), 237 deletions(-) Index: linux-2.6.27/drivers/md/dm-raid1.c =================================================================== --- linux-2.6.27.orig/drivers/md/dm-raid1.c 2008-10-15 12:23:41.000000000 +0100 +++ linux-2.6.27/drivers/md/dm-raid1.c 2008-10-15 12:24:02.000000000 +0100 @@ -1,5 +1,6 @@ /* * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -9,26 +10,18 @@ #include "dm-bio-list.h" #include "dm-bio-record.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include #include #include +#define DM_MSG_PREFIX "raid1" + #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. 
*/ -#define DM_MSG_PREFIX "raid1" #define DM_IO_PAGES 64 +#define DM_KCOPYD_PAGES 64 #define DM_RAID1_HANDLE_ERRORS 0x01 #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) @@ -83,10 +76,66 @@ struct mirror_set { struct work_struct trigger_event; - unsigned nr_mirrors; + unsigned int nr_mirrors; struct mirror mirror[0]; }; +static void wake(void *context) +{ + struct mirror_set *ms = context; + + queue_work(ms->kmirrord_wq, &ms->kmirrord_work); +} + +static void delayed_wake_fn(unsigned long data) +{ + struct mirror_set *ms = (struct mirror_set *) data; + + clear_bit(0, &ms->timer_pending); + wake(ms); +} + +static void delayed_wake(struct mirror_set *ms) +{ + if (test_and_set_bit(0, &ms->timer_pending)) + return; + + ms->timer.expires = jiffies + HZ / 5; + ms->timer.data = (unsigned long) ms; + ms->timer.function = delayed_wake_fn; + add_timer(&ms->timer); +} + +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) +{ + unsigned long flags; + int should_wake = 0; + struct bio_list *bl; + + bl = (rw == WRITE) ? 
&ms->writes : &ms->reads; + spin_lock_irqsave(&ms->lock, flags); + should_wake = !bl->head; + bio_list_add(bl, bio); + spin_unlock_irqrestore(&ms->lock, flags); + + if (should_wake) + wake(ms); +} + +static void dispatch_bios(void *context, struct bio_list *bio_list, int error) +{ + struct mirror_set *ms = context; + struct bio *bio; + + while ((bio = bio_list_pop(bio_list))) + queue_bio(ms, bio, WRITE); +} + +static region_t bio_to_region(struct mirror_set *ms, struct bio *bio) +{ + return dm_rh_sector_to_region(ms->rh, bio->bi_sector - ms->ti->begin); +} + #define MIN_READ_RECORDS 20 struct dm_raid1_read_record { struct mirror *m; @@ -115,7 +164,7 @@ static void bio_set_m(struct bio *bio, s static struct mirror *get_default_mirror(struct mirror_set *ms) { - return &ms->mirror[atomic_read(&ms->default_mirror)]; + return ms->mirror + atomic_read(&ms->default_mirror); } static void set_default_mirror(struct mirror *m) @@ -178,7 +227,7 @@ static void fail_mirror(struct mirror *m } if (unlikely(new == ms->mirror + ms->nr_mirrors)) - DMWARN("All sides of mirror have failed."); + DMWARN("All mirrors have failed."); out: schedule_work(&ms->trigger_event); @@ -195,8 +244,7 @@ static void recovery_complete(int read_e void *context) { struct dm_region *reg = context; - struct mirror_set *ms = reg->rh->ms; - int m, bit = 0; + struct mirror_set *ms = dm_rh_get_region_context(reg); if (read_err) { /* Read error means the failure of default mirror. */ @@ -205,15 +253,18 @@ static void recovery_complete(int read_e } if (write_err) { + int bit, m; + DMERR_LIMIT("Write error during recovery (error = 0x%lx)", write_err); /* * Bits correspond to devices (excluding default mirror). * The default mirror cannot change during recovery. 
*/ - for (m = 0; m < ms->nr_mirrors; m++) { + for (bit = m = 0; m < ms->nr_mirrors; m++) { if (&ms->mirror[m] == get_default_mirror(ms)) continue; + if (test_bit(bit, &write_err)) fail_mirror(ms->mirror + m, DM_RAID1_SYNC_ERROR); @@ -221,56 +272,61 @@ static void recovery_complete(int read_e } } - dm_rh_recovery_end(reg, !(read_err || write_err)); + dm_rh_recovery_end(ms->rh, reg, read_err || write_err); } static int recover(struct mirror_set *ms, struct dm_region *reg) { int r; unsigned i; - struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; - struct mirror *m; unsigned long flags = 0; + region_t key = dm_rh_get_region_key(reg); + struct dm_region_hash *rh = ms->rh; + sector_t region_size = dm_rh_get_region_size(rh); + struct mirror *m; + struct dm_io_region from, to[ms->nr_mirrors - 1], *dest; /* fill in the source */ m = get_default_mirror(ms); from.bdev = m->dev->bdev; - from.sector = m->offset + dm_rh_region_to_sector(reg->rh, reg->key); - if (reg->key == (ms->nr_regions - 1)) { + from.sector = m->offset + dm_rh_region_to_sector(rh, key); + if (key == ms->nr_regions - 1) { /* - * The final region may be smaller than - * region_size. + * The final region may be smaller than region_size. */ - from.count = ms->ti->len & (reg->rh->region_size - 1); + from.count = ms->ti->len & (region_size - 1); if (!from.count) - from.count = reg->rh->region_size; + from.count = region_size; } else - from.count = reg->rh->region_size; + from.count = region_size; /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (&ms->mirror[i] == get_default_mirror(ms)) + m = ms->mirror + i; + if (m == get_default_mirror(ms)) continue; - m = ms->mirror + i; dest->bdev = m->dev->bdev; - dest->sector = m->offset + dm_rh_region_to_sector(reg->rh, reg->key); + dest->sector = m->offset + dm_rh_region_to_sector(rh, key); dest->count = from.count; dest++; } - /* hand to kcopyd */ + /* Keep mirror set reference in region context for callback function. 
*/ + dm_rh_set_region_context(reg, ms); + + /* Hand to kcopyd. */ if (!errors_handled(ms)) set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); - return r; } static void do_recovery(struct mirror_set *ms) { + struct dm_region_hash *rh = ms->rh; struct dm_region *reg; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); int r; @@ -278,15 +334,17 @@ static void do_recovery(struct mirror_se /* * Start quiescing some regions. */ - dm_rh_recovery_prepare(ms->rh); + r = dm_rh_recovery_prepare(rh); + if (r == -ESRCH) + wake_up_all(&_kmirrord_recovery_stopped); /* * Copy any already quiesced regions. */ - while ((reg = dm_rh_recovery_start(ms->rh))) { + while ((reg = dm_rh_recovery_start(rh))) { r = recover(ms, reg); if (r) - rh_recovery_end(reg, 0); + dm_rh_recovery_end(rh, reg, r); } /* @@ -328,7 +386,7 @@ static int default_ok(struct mirror *m) static int mirror_available(struct mirror_set *ms, struct bio *bio) { struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - region_t region = dm_rh_bio_to_region(ms->rh, bio); + region_t region = bio_to_region(ms->rh, bio); if (log->type->in_sync(log, region, 0)) return choose_mirror(ms, bio->bi_sector) ? 
1 : 0; @@ -404,9 +462,17 @@ static void read_async_bio(struct mirror map_region(&io, m, bio); bio_set_m(bio, m); - (void) dm_io(&io_req, 1, &io, NULL); + BUG_ON(dm_io(&io_req, 1, &io, NULL)); } +static inline int region_in_sync(struct dm_region_hash *rh, + region_t region, int may_block) +{ + int state = dm_rh_get_state(rh, region, may_block); + return state == DM_RH_CLEAN || state == DM_RH_DIRTY; +} + + static void do_reads(struct mirror_set *ms, struct bio_list *reads) { region_t region; @@ -414,13 +480,13 @@ static void do_reads(struct mirror_set * struct mirror *m; while ((bio = bio_list_pop(reads))) { - region = dm_rh_bio_to_region(ms->rh, bio); + region = bio_to_region(ms, bio); m = get_default_mirror(ms); /* * We can only read balance if the region is in sync. */ - if (likely(rh_in_sync(ms->rh, region, 1))) + if (likely(region_in_sync(ms->rh, region, 1))) m = choose_mirror(ms, bio->bi_sector); else if (m && atomic_read(&m->error_count)) m = NULL; @@ -451,33 +517,25 @@ static void do_reads(struct mirror_set * * * The bio was written on some mirror(s) but failed on other mirror(s). * We can successfully endio the bio but should avoid the region being - * marked clean by setting the state DM_RH_NOSYNC. + * marked clean by setting the state RH_NOSYNC. * * This function is _not_ safe in interrupt context! */ -static void __bio_mark_nosync(struct mirror_set *ms, - struct bio *bio, unsigned done, int error) +static void __bio_mark_nosync(struct mirror_set *ms, struct bio *bio) { - unsigned long flags; struct dm_region_hash *rh = ms->rh; - struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); - struct dm_region *reg; - region_t region = dm_rh_bio_to_region(rh, bio); - int recovering = 0; + region_t region = bio_to_region(rh, bio); + int recovering = dm_rh_get_state(rh, region, 0) == DM_RH_RECOVERING; - /* We must inform the log that the sync count has changed. 
*/ - log->type->set_region_sync(log, region, 0); ms->in_sync = 0; - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - read_unlock(&rh->hash_lock); - - /* region hash entry should exist because write was in-flight */ - BUG_ON(!reg); - BUG_ON(!list_empty(&reg->list)); + /* + * Region hash entry should exist because write was in-flight. + * + * The log'll be informed about the state change via the region hash. + */ + dm_rh_set_state(rh, region, DM_RH_NOSYNC, 0); - spin_lock_irqsave(&rh->region_lock, flags); /* * Possible cases: * 1) DM_RH_DIRTY @@ -485,14 +543,21 @@ static void __bio_mark_nosync(struct mir * 3) DM_RH_RECOVERING: flushing pending writes * Either case, the region should have not been connected to list. */ - recovering = (reg->state == DM_RH_RECOVERING); - reg->state = DM_RH_NOSYNC; - BUG_ON(!list_empty(&reg->list)); - spin_unlock_irqrestore(&rh->region_lock, flags); - - bio_endio(bio, error); - if (recovering) - complete_resync_work(reg, 0); + bio_endio(bio, 0); + if (recovering) { + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. + */ + dm_rh_dispatch_bios(rh, region, 0); + if (!dm_rh_recovery_in_flight(rh)) + wake_up_all(&_kmirrord_recovery_stopped); + } } static void write_callback(unsigned long error, void *context) @@ -535,6 +600,7 @@ static void write_callback(unsigned long spin_lock_irqsave(&ms->lock, flags); if (!ms->failures.head) should_wake = 1; + bio_list_add(&ms->failures, bio); spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) @@ -567,13 +633,13 @@ static void do_write(struct mirror_set * * to the mirror set in write_callback(). 
*/ bio_set_m(bio, get_default_mirror(ms)); - - (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); + BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) { - int state; + enum dm_rh_region_states state; + struct dm_region_hash *rh = ms->rh; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; @@ -588,7 +654,7 @@ static void do_writes(struct mirror_set bio_list_init(&recover); while ((bio = bio_list_pop(writes))) { - state = dm_rh_get_state(ms->rh, dm_rh_bio_to_region(ms->rh, bio), 1); + state = dm_rh_get_state(rh, bio_to_region(ms, bio), 1); switch (state) { case DM_RH_CLEAN: case DM_RH_DIRTY: @@ -612,9 +678,9 @@ static void do_writes(struct mirror_set * be written to (writes to recover regions are going to * be delayed). */ - dm_rh_inc_pending(ms->rh, &sync); - dm_rh_inc_pending(ms->rh, &nosync); - ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0; + dm_rh_inc_pending(rh, &sync); + dm_rh_inc_pending(rh, &nosync); + ms->log_failure = dm_rh_flush(rh) ? 1 : 0; /* * Dispatch io. 
@@ -629,7 +695,7 @@ static void do_writes(struct mirror_set do_write(ms, bio); while ((bio = bio_list_pop(&recover))) - dm_rh_delay(ms->rh, bio); + dm_rh_delay(rh, bio); while ((bio = bio_list_pop(&nosync))) { map_bio(get_default_mirror(ms), bio); @@ -646,7 +712,7 @@ static void do_failures(struct mirror_se if (!ms->log_failure) { while ((bio = bio_list_pop(failures))) - __bio_mark_nosync(ms, bio, bio->bi_size, 0); + __bio_mark_nosync(ms, bio); return; } @@ -713,7 +779,7 @@ static void do_mirror(struct work_struct bio_list_init(&ms->failures); spin_unlock_irqrestore(&ms->lock, flags); - dm_rh_update_states(ms->rh); + dm_rh_update_states(ms->rh, errors_handled(ms)); do_recovery(ms); do_reads(ms, &reads); do_writes(ms, &writes); @@ -769,7 +835,8 @@ static struct mirror_set *alloc_context( return NULL; } - ms->rh = dm_region_hash_create(dl, region_size, ms->nr_regions)); + ms->rh = dm_region_hash_create(MAX_RECOVERY, dispatch_bios, ms, wake, + ms, dl, region_size, ms->nr_regions); if (IS_ERR(ms->rh)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); @@ -828,8 +895,8 @@ static int get_mirror(struct mirror_set * Create dirty log: log_type #log_params */ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, - unsigned argc, char **argv, - unsigned *args_used) + unsigned argc, char **argv, + unsigned *args_used) { unsigned param_count; struct dm_dirty_log *dl; @@ -962,7 +1029,7 @@ static int mirror_ctr(struct dm_target * } ti->private = ms; - ti->split_io = ms->rh->region_size; + ti->split_io = dm_rh_get_region_size(ms->rh); ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); if (!ms->kmirrord_wq) { @@ -1022,22 +1089,6 @@ static void mirror_dtr(struct dm_target free_context(ms, ti, ms->nr_mirrors); } -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) -{ - unsigned long flags; - int should_wake = 0; - struct bio_list *bl; - - bl = (rw == WRITE) ? 
&ms->writes : &ms->reads; - spin_lock_irqsave(&ms->lock, flags); - should_wake = !(bl->head); - bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - - if (should_wake) - wake(ms); -} - /* * Mirror mapping function */ @@ -1052,12 +1103,12 @@ static int mirror_map(struct dm_target * if (rw == WRITE) { /* Save region for mirror_end_io() handler */ - map_context->ll = dm_rh_bio_to_region(ms->rh, bio); + map_context->ll = bio_to_region(ms->rh, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } - r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); + r = log->type->in_sync(log, bio_to_region(ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) return r; @@ -1105,7 +1156,11 @@ static int mirror_end_io(struct dm_targe * We need to dec pending if this was a write. */ if (rw == WRITE) { - dm_rh_dec(ms->rh, map_context->ll); + int r = dm_rh_dec(ms->rh, map_context->ll); + + if (r) + wake(ms); + return error; } @@ -1172,7 +1227,7 @@ static void mirror_presuspend(struct dm_ dm_rh_stop_recovery(ms->rh); wait_event(_kmirrord_recovery_stopped, - !atomic_read(ms->rh->recovery_in_flight)); + !dm_rh_recovery_in_flight(ms->rh)); if (log->type->presuspend && log->type->presuspend(log)) /* FIXME: need better error handling */ @@ -1206,6 +1261,7 @@ static void mirror_resume(struct dm_targ if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); + dm_rh_start_recovery(ms->rh); } Index: linux-2.6.27/drivers/md/dm-region-hash.c =================================================================== --- linux-2.6.27.orig/drivers/md/dm-region-hash.c 2008-10-15 12:23:41.000000000 +0100 +++ linux-2.6.27/drivers/md/dm-region-hash.c 2008-10-15 12:24:02.000000000 +0100 @@ -21,9 +21,15 @@ /*----------------------------------------------------------------- * Region hash * - * The mirror splits itself up into discrete regions. Each - * region can be in one of three states: clean, dirty, - * nosync. 
There is no need to put clean regions in the hash. + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions. + * Each region can be in one of three states: + * + * o clean + * o dirty, + * o nosync. + * + * There is no need to put clean regions in the hash. + * * * In addition to being present in the hash table a region _may_ * be present on one of three lists. @@ -34,14 +40,13 @@ * hash table. * * quiesced_regions: These regions have been spun down, ready - * for recovery. rh_recovery_start() will remove regions from - * this list and hand them to kmirrord, which will schedule the - * recovery io with kcopyd. + * for recovery. dm_rh_recovery_start() will remove regions from + * this list and hand them to the caller, which will schedule the + * recovery io. * - * recovered_regions: Regions that kcopyd has successfully + * recovered_regions: Regions that the caller has successfully * recovered. dm_rh_update_states() will now schedule any delayed - * io, up the recovery_count, and remove the region from the - * hash. + * io, up the recovery_count, and remove the region from the hash. * * There are 2 locks: * A rw spin lock 'hash_lock' protects just the hash table, @@ -55,6 +60,14 @@ * context, so all other uses will have to suspend local irqs. *---------------------------------------------------------------*/ struct dm_region_hash { + /* Callback function to dispatch queued writes on recovered regions. */ + void (*dispatch)(void *context, struct bio_list *bios, int error); + void *dispatch_context; + + /* Callback function to wakeup callers worker thread. */ + void (*wake)(void *context); + void *wake_context; + uint32_t region_size; unsigned region_shift; @@ -82,9 +95,9 @@ struct dm_region_hash { }; struct dm_region { - struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ region_t key; - int state; + enum dm_rh_region_states state; + void *context; /* Caller context. 
*/ struct list_head hash_list; struct list_head list; @@ -104,7 +117,7 @@ EXPORT_SYMBOL_GPL(dm_rh_sector_to_region region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) { - return dm_rh_sector_to_region(rh, bio->bi_sector - rh->ms->ti->begin); + return dm_rh_sector_to_region(rh, bio->bi_sector); } EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); @@ -114,54 +127,58 @@ sector_t dm_rh_region_to_sector(struct d } EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); -static void wake(struct mirror_set *ms) +/* + * Retrieval fns. + */ +region_t dm_rh_get_region_key(struct dm_region *reg) { - queue_work(ms->kmirrord_wq, &ms->kmirrord_work); + return reg->key; } +EXPORT_SYMBOL_GPL(dm_rh_get_region_key); -static void delayed_wake_fn(unsigned long data) +sector_t dm_rh_get_region_size(struct dm_region_hash *rh) { - struct mirror_set *ms = (struct mirror_set *) data; - - clear_bit(0, &ms->timer_pending); - wake(ms); + return rh->region_size; } +EXPORT_SYMBOL_GPL(dm_rh_get_region_size); -static void delayed_wake(struct mirror_set *ms) +/* Squirrel a context with a region. */ +void *dm_rh_get_region_context(struct dm_region *region) { - if (test_and_set_bit(0, &ms->timer_pending)) - return; - - ms->timer.expires = jiffies + HZ / 5; - ms->timer.data = (unsigned long) ms; - ms->timer.function = delayed_wake_fn; - add_timer(&ms->timer); + return region->context; } +EXPORT_SYMBOL_GPL(dm_rh_get_region_context); -/* FIXME move this */ -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); - -#define RH_HASH_MULT 2654435387U -#define RH_HASH_SHIFT 12 +void dm_rh_set_region_context(struct dm_region *region, void *context) +{ + region->context = context; +} +EXPORT_SYMBOL_GPL(dm_rh_set_region_context); +/* + * Create region hash client. 
+ */ #define MIN_REGIONS 64 struct dm_region_hash *dm_region_hash_create(unsigned max_recovery, - struct dm_dirty_log *log, - uint32_t region_size, - region_t nr_regions) + void (*dispatch)(void *dispatch_context, + struct bio_list *bios, int error), + void *dispatch_context, + void (*wake)(void *wake_context), void *wake_context, + struct dm_dirty_log *log, uint32_t region_size, region_t nr_regions) { - struct dm_region_hash *rh; + unsigned i; unsigned nr_buckets, max_buckets; - size_t i; + unsigned hash_primes[] = { + /* Table of primes for rh_hash/table size optimization. */ + 3, 7, 13, 27, 53, 97, 193, 389, 769, + 1543, 3079, 6151, 12289, 24593, + }; + struct dm_region_hash *rh; - /* - * Calculate a suitable number of buckets for our hash - * table. - */ - max_buckets = nr_regions >> 6; - for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) - ; - nr_buckets >>= 1; + if (region_size & (region_size - 1)) { + DMERR("region size must be 2^^n"); + return ERR_PTR(-EINVAL); + } rh = kmalloc(sizeof(*rh), GFP_KERNEL); if (!rh) { @@ -170,15 +187,28 @@ struct dm_region_hash *dm_region_hash_cr } rh->max_recovery = max_recovery; + rh->dispatch = dispatch; + rh->dispatch_context = dispatch_context; + rh->wake = wake; + rh->wake_context = wake_context; rh->log = log; rh->region_size = region_size; rh->region_shift = ffs(region_size) - 1; rwlock_init(&rh->hash_lock); - rh->mask = nr_buckets - 1; - rh->nr_buckets = nr_buckets; - rh->shift = RH_HASH_SHIFT; - rh->prime = RH_HASH_MULT; + /* Calculate a suitable number of buckets for our hash table. */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + rh->mask = rh->nr_buckets = nr_buckets; + rh->mask--; + rh->shift = ffs(nr_buckets); + + /* Check prime array limits. */ + i = rh->shift - 1 > ARRAY_SIZE(hash_primes) ? 
+ ARRAY_SIZE(hash_primes) - 1 : rh->shift - 2; + rh->prime = hash_primes[i]; rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); if (!rh->buckets) { @@ -216,6 +246,7 @@ void dm_region_hash_destroy(struct dm_re struct dm_region *reg, *nreg; BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { BUG_ON(atomic_read(&reg->pending)); @@ -234,6 +265,12 @@ void dm_region_hash_destroy(struct dm_re } EXPORT_SYMBOL_GPL(dm_region_hash_destroy); +struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) +{ + return rh->log; +} +EXPORT_SYMBOL_GPL(dm_rh_dirty_log); + static inline unsigned rh_hash(struct dm_region_hash *rh, region_t region) { return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; @@ -268,7 +305,6 @@ static struct dm_region *__rh_alloc(stru nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; - nreg->rh = rh; nreg->key = region; INIT_LIST_HEAD(&nreg->list); atomic_set(&nreg->pending, 0); @@ -333,53 +369,35 @@ int dm_rh_get_state(struct dm_region_has } EXPORT_SYMBOL_GPL(dm_rh_get_state); -static inline int rh_in_sync(struct dm_region_hash *rh, - region_t region, int may_block) +void dm_rh_set_state(struct dm_region_hash *rh, region_t region, + enum dm_rh_region_states state, int may_block) { - int state = dm_rh_get_state(rh, region, may_block); - return state == DM_RH_CLEAN || state == DM_RH_DIRTY; -} - -static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) -{ - struct bio *bio; - - while ((bio = bio_list_pop(bio_list))) { - queue_bio(ms, bio, WRITE); - } -} - -static void complete_resync_work(struct dm_region *reg, int success) -{ - struct dm_region_hash *rh = reg->rh; + struct dm_region *reg; + struct dm_dirty_log *log = rh->log; - rh->log->type->set_region_sync(rh->log, reg->key, success); + if (state == DM_RH_NOSYNC) + log->type->set_region_sync(log, region, 0); + else if (state == DM_RH_CLEAN) + log->type->clear_region(log, region); + else if (state == 
DM_RH_CLEAN) + log->type->clear_region(log, region); + else if (state == DM_RH_DIRTY) + log->type->mark_region(log, region); - /* - * Dispatch the bios before we call 'wake_up_all'. - * This is important because if we are suspending, - * we want to know that recovery is complete and - * the work queue is flushed. If we wake_up_all - * before we dispatch_bios (queue bios and call wake()), - * then we risk suspending before the work queue - * has been properly flushed. - */ - dispatch_bios(rh->ms, ®->delayed_bios); - if (atomic_dec_and_test(&rh->recovery_in_flight)) - wake_up_all(&_kmirrord_recovery_stopped); - up(&rh->recovery_count); + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + reg->state = state; + read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_set_state); -static void dm_rh_update_states(struct dm_region_hash *rh) +void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) { struct dm_region *reg, *next; - LIST_HEAD(clean); LIST_HEAD(recovered); LIST_HEAD(failed_recovered); /* - * Quickly grab the lists. + * Quickly grab the lists and remove any regions from hash. */ write_lock_irq(&rh->hash_lock); spin_lock(&rh->region_lock); @@ -401,7 +419,7 @@ static void dm_rh_update_states(struct d list_splice_init(&rh->failed_recovered_regions, &failed_recovered); - list_for_each_entry(reg, &failed_recovered, list) + list_for_each_entry(reg, &recovered, list) list_del(®->hash_list); } @@ -415,12 +433,24 @@ static void dm_rh_update_states(struct d */ list_for_each_entry_safe(reg, next, &recovered, list) { rh->log->type->clear_region(rh->log, reg->key); - complete_resync_work(reg, 1); + rh->log->type->set_region_sync(rh->log, reg->key, 1); + + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, + ®->delayed_bios, 0); + + up(&rh->recovery_count); mempool_free(reg, rh->region_pool); } list_for_each_entry_safe(reg, next, &failed_recovered, list) { - complete_resync_work(reg, errors_handled(rh->ms) ? 
0 : 1); + rh->log->type->set_region_sync(rh->log, reg->key, + errors_handled ? 0 : 1); + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, + &reg->delayed_bios, -EIO); + + up(&rh->recovery_count); mempool_free(reg, rh->region_pool); } @@ -429,53 +459,53 @@ static void dm_rh_update_states(struct d mempool_free(reg, rh->region_pool); } - rh->log->type->flush(rh->log); + dm_rh_flush(rh); } +EXPORT_SYMBOL_GPL(dm_rh_update_states); -static void rh_inc(struct dm_region_hash *rh, region_t region) +void dm_rh_inc(struct dm_region_hash *rh, region_t region) { struct dm_region *reg; read_lock(&rh->hash_lock); reg = __rh_find(rh, region); - - spin_lock_irq(&rh->region_lock); - atomic_inc(&reg->pending); - if (reg->state == DM_RH_CLEAN) { - reg->state = DM_RH_DIRTY; - list_del_init(&reg->list); /* take off the clean list */ - spin_unlock_irq(&rh->region_lock); - rh->log->type->mark_region(rh->log, reg->key); - } else - spin_unlock_irq(&rh->region_lock); + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_DIRTY; + list_del_init(&reg->list); /* Take off the clean list. 
*/ + spin_unlock_irq(&rh->region_lock); + } + atomic_inc(&reg->pending); read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_inc); -static void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) +void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) { struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) - rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } EXPORT_SYMBOL_GPL(dm_rh_inc_pending); -static void dm_rh_dec(struct dm_region_hash *rh, region_t region) +int dm_rh_dec(struct dm_region_hash *rh, region_t region) { - unsigned long flags; + int r = 0; struct dm_region *reg; - int should_wake = 0; read_lock(&rh->hash_lock); reg = __rh_lookup(rh, region); read_unlock(&rh->hash_lock); - spin_lock_irqsave(&rh->region_lock, flags); + BUG_ON(!reg); + if (atomic_dec_and_test(&reg->pending)) { + unsigned long flags; + /* * There is no pending I/O for this region. * We can move the region to corresponding list for next action. @@ -487,20 +517,21 @@ static void dm_rh_dec(struct dm_region_h * until the region is recovered or the map is reloaded. */ - /* do nothing for DM_RH_NOSYNC */ - if (reg->state == DM_RH_RECOVERING) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == DM_RH_RECOVERING) list_add_tail(&reg->list, &rh->quiesced_regions); - } else if (reg->state == DM_RH_DIRTY) { + else { reg->state = DM_RH_CLEAN; list_add(&reg->list, &rh->clean_regions); } - should_wake = 1; + spin_unlock_irqrestore(&rh->region_lock, flags); + + r = 1; } - spin_unlock_irqrestore(&rh->region_lock, flags); - if (should_wake) - wake(rh->ms); + return r; } +EXPORT_SYMBOL_GPL(dm_rh_dec); /* * Starts quiescing a region in preparation for recovery. @@ -519,44 +550,49 @@ static int __rh_recovery_prepare(struct return r; /* - * Get this region, and start it quiescing by setting the - * recovering flag. + * Get this region, and start it quiescing by setting + * the recovering flag. 
*/ read_lock(&rh->hash_lock); reg = __rh_find(rh, region); read_unlock(&rh->hash_lock); spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; /* Already quiesced ? */ - if (atomic_read(&reg->pending)) - list_del_init(&reg->list); - else - list_move(&reg->list, &rh->quiesced_regions); + list_del_init(&reg->list); + if (!atomic_read(&reg->pending)) + list_add(&reg->list, &rh->quiesced_regions); spin_unlock_irq(&rh->region_lock); - return 1; } -static void dm_rh_recovery_prepare(struct dm_region_hash *rh) +int dm_rh_recovery_prepare(struct dm_region_hash *rh) { - /* Extra reference to avoid race with dm_rh_stop_recovery */ + int r = 0; + + /* Extra reference to avoid race with rh_stop_recovery */ atomic_inc(&rh->recovery_in_flight); while (!down_trylock(&rh->recovery_count)) { atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { atomic_dec(&rh->recovery_in_flight); up(&rh->recovery_count); + r = -ENOENT; break; } } /* Drop the extra reference */ if (atomic_dec_and_test(&rh->recovery_in_flight)) - wake_up_all(&_kmirrord_recovery_stopped); + r = -ESRCH; + + return r; } EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); @@ -571,52 +607,95 @@ struct dm_region *dm_rh_recovery_start(s if (!list_empty(&rh->quiesced_regions)) { reg = list_entry(rh->quiesced_regions.next, struct dm_region, list); - list_del_init(&reg->list); /* remove from the quiesced list */ + list_del_init(&reg->list); /* Remove from the quiesced list. */ } - spin_unlock_irq(&rh->region_lock); + spin_unlock_irq(&rh->region_lock); return reg; } EXPORT_SYMBOL_GPL(dm_rh_recovery_start); -void dm_rh_recovery_end(struct dm_region *reg, int success) +/* + * Put region on list of recovered ones. 
+ */ +void dm_rh_recovery_end(struct dm_region_hash *rh, struct dm_region *reg, + int error) { - struct dm_region_hash *rh = reg->rh; - spin_lock_irq(&rh->region_lock); - if (success) - list_add(&reg->list, &reg->rh->recovered_regions); - else { + if (error) { reg->state = DM_RH_NOSYNC; - list_add(&reg->list, &reg->rh->failed_recovered_regions); - } + list_add(&reg->list, &rh->failed_recovered_regions); + } else + list_add(&reg->list, &rh->recovered_regions); + + atomic_dec(&rh->recovery_in_flight); spin_unlock_irq(&rh->region_lock); - wake(rh->ms); + rh->wake(rh->wake_context); + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0); } EXPORT_SYMBOL_GPL(dm_rh_recovery_end); +/* Return recovery in flight count. */ +int dm_rh_recovery_in_flight(struct dm_region_hash *rh) +{ + return atomic_read(&rh->recovery_in_flight); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); + int dm_rh_flush(struct dm_region_hash *rh) { return rh->log->type->flush(rh->log); } EXPORT_SYMBOL_GPL(dm_rh_flush); -void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +void dm_rh_delay_by_region(struct dm_region_hash *rh, + struct bio *bio, region_t region) { struct dm_region *reg; + /* FIXME: locking. */ read_lock(&rh->hash_lock); - reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + reg = __rh_find(rh, region); bio_list_add(&reg->delayed_bios, bio); read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + return dm_rh_delay_by_region(rh, bio, + dm_rh_bio_to_region(rh, bio)); +} EXPORT_SYMBOL_GPL(dm_rh_delay); +void dm_rh_dispatch_bios(struct dm_region_hash *rh, + region_t region, int error) +{ + struct dm_region *reg; + struct bio_list delayed_bios; + + /* FIXME: locking. 
*/ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + BUG_ON(!reg); + delayed_bios = reg->delayed_bios; + bio_list_init(&reg->delayed_bios); + read_unlock(&rh->hash_lock); + + if (delayed_bios.head) + rh->dispatch(rh->dispatch_context, &delayed_bios, error); + + up(&rh->recovery_count); +} +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios); + void dm_rh_stop_recovery(struct dm_region_hash *rh) { int i; + rh->wake(rh->wake_context); + /* wait for any recovering regions */ for (i = 0; i < rh->max_recovery; i++) down(&rh->recovery_count); @@ -630,7 +709,7 @@ void dm_rh_start_recovery(struct dm_regi for (i = 0; i < rh->max_recovery; i++) up(&rh->recovery_count); - wake(rh->ms); + rh->wake(rh->wake_context); } EXPORT_SYMBOL_GPL(dm_rh_start_recovery);