---
 drivers/md/dm-regions.c |  634 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-regions.h |   75 +++++
 2 files changed, 709 insertions(+)

Index: linux/drivers/md/dm-regions.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/md/dm-regions.c	2007-06-06 20:40:09.000000000 +0100
@@ -0,0 +1,634 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2007 Red Hat Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-log.h"
+#include "dm-regions.h"
+
+#include
+#include
+#include
+#include
+
+#define DM_MSG_PREFIX "region hash"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The set splits itself up into discrete regions.
+ * Each region can be in one of three states:
+ *
+ * o clean
+ * o dirty
+ * o nosync
+ *
+ * There is no need to put clean regions in the hash.
+ *
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ * clean_regions: Regions on this list have no io pending to
+ * them, they are in sync, we are no longer interested in them,
+ * they are dull. rh_update_states() will remove them from the
+ * hash table.
+ *
+ * quiesced_regions: These regions have been spun down, ready
+ * for recovery. rh_recovery_start() will remove regions from
+ * this list and hand them to kmirrord, which will schedule the
+ * recovery io with kcopyd.
+ *
+ * recovered_regions: Regions that kcopyd has successfully
+ * recovered. rh_update_states() will now schedule any delayed
+ * io, up the recovery_count, and remove the region from the hash.
+ *
+ * There are 2 locks:
+ * A rw spin lock 'hash_lock' protects just the hash table,
+ * this is never held in write mode from interrupt context,
+ * which I believe means that we only have to disable irqs when
+ * doing a write lock.
+ *
+ * An ordinary spin lock 'region_lock' that protects the three
+ * lists in the region_hash, with the 'state', 'list' and
+ * 'delayed_bios' fields of the regions. This is used from irq
+ * context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
+enum region_hash_flags {
+	RECOVERY,
+};
+
+struct region_hash {
+	unsigned max_recovery;	/* Max # of regions to recover in parallel */
+	unsigned long flags;
+
+	/* Callback function to dispatch queued writes on recovered regions. */
+	void (*dispatch)(void *context, struct bio_list *bios);
+	void *dispatch_context;
+
+	/* Callback function to wake up the caller's worker thread. */
+	void (*wake)(void *context);
+	void *wake_context;
+
+	uint32_t region_size;
+	unsigned region_shift;
+
+	/* holds persistent region state */
+	struct dirty_log *log;
+
+	/* hash table */
+	rwlock_t hash_lock;
+	mempool_t *region_pool;
+	unsigned mask;
+	unsigned nr_buckets;
+	struct list_head *buckets;
+
+	spinlock_t region_lock;
+	struct semaphore recovery_count;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+};
+
+struct region {
+	struct region_hash *rh;	/* FIXME: can we get rid of this ? */
+	region_t key;
+	int state;
+	void *context;	/* Caller context. */
+
+	struct list_head hash_list;
+	struct list_head list;
+
+	atomic_t pending;
+	struct bio_list delayed_bios;
+};
+
+/*
+ * Conversion fns
+ */
+region_t rh_sector_to_region(void *rh, sector_t sector)
+{
+	return sector >> ((struct region_hash*) rh)->region_shift;
+}
+EXPORT_SYMBOL(rh_sector_to_region);
+
+region_t rh_bio_to_region(void *rh, struct bio *bio)
+{
+	return rh_sector_to_region(rh, bio->bi_sector);
+}
+EXPORT_SYMBOL(rh_bio_to_region);
+
+sector_t rh_region_to_sector(void *rh, region_t region)
+{
+	return region << ((struct region_hash*) rh)->region_shift;
+}
+EXPORT_SYMBOL(rh_region_to_sector);
+
+/*
+ * Retrieval fns.
+ */
+region_t rh_get_region_key(void *reg)
+{
+	return ((struct region *)reg)->key;
+}
+EXPORT_SYMBOL(rh_get_region_key);
+
+sector_t rh_get_region_size(void *rh)
+{
+	return ((struct region_hash *)rh)->region_size;
+}
+EXPORT_SYMBOL(rh_get_region_size);
+
+/* Squirrel a context with a region. */
+void *rh_reg_get_context(void *reg)
+{
+	return ((struct region*) reg)->context;
+}
+EXPORT_SYMBOL(rh_reg_get_context);
+
+void rh_reg_set_context(void *reg, void *context)
+{
+	((struct region*) reg)->context = context;
+}
+EXPORT_SYMBOL(rh_reg_set_context);
+
+/*
+ * Region struct allocation/free.
+ */
+static void *region_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct region), gfp_mask);
+}
+
+static void region_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+#define MIN_REGIONS 64
+int rh_init(void **region_hash,
+	    unsigned max_recovery,
+	    void (*dispatch)(void *dispatch_context, struct bio_list *bios),
+	    void *dispatch_context,
+	    void (*wake)(void *wake_context),
+	    void *wake_context,
+	    struct dirty_log *log, uint32_t region_size, region_t nr_regions)
+{
+	unsigned nr_buckets, max_buckets;
+	size_t i;
+	struct region_hash *rh;
+
+	if (region_size & (region_size - 1)) {
+		DMERR("region size must be a power of 2");
+		return -EINVAL;
+	}
+
+	rh = kmalloc(sizeof(*rh), GFP_KERNEL);
+	if (!rh) {
+		DMERR("unable to allocate region hash memory");
+		return -ENOMEM;
+	}
+
+	/*
+	 * Calculate a suitable number of buckets for our hash
+	 * table.
+	 */
+	max_buckets = nr_regions >> 6;
+	for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1);
+	nr_buckets >>= 1;
+
+	rh->max_recovery = max_recovery;
+	rh->dispatch = dispatch;
+	rh->dispatch_context = dispatch_context;
+	rh->wake = wake;
+	rh->wake_context = wake_context;
+	rh->log = log;
+	rh->region_size = region_size;
+	rh->region_shift = ffs(region_size) - 1;
+	rwlock_init(&rh->hash_lock);
+	rh->mask = nr_buckets - 1;
+	rh->nr_buckets = nr_buckets;
+
+	rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
+	if (!rh->buckets) {
+		DMERR("unable to allocate region hash bucket memory");
+		kfree(rh);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_buckets; i++)
+		INIT_LIST_HEAD(rh->buckets + i);
+
+	spin_lock_init(&rh->region_lock);
+	sema_init(&rh->recovery_count, 0);
+	INIT_LIST_HEAD(&rh->clean_regions);
+	INIT_LIST_HEAD(&rh->quiesced_regions);
+	INIT_LIST_HEAD(&rh->recovered_regions);
+
+	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
+					 region_free, NULL);
+	if (!rh->region_pool) {
+		vfree(rh->buckets);
+		kfree(rh);
+		return -ENOMEM;
+	}
+
+	*region_hash = rh;
+
+	return 0;
+}
+EXPORT_SYMBOL(rh_init);
+
+void rh_exit(void *v)
+{
+	unsigned h;
+	struct region *reg, *tmp;
+	struct region_hash *rh = v;
+
+	BUG_ON(!list_empty(&rh->quiesced_regions));
+
+	for (h = 0; h < rh->nr_buckets; h++) {
+		list_for_each_entry_safe(reg, tmp, rh->buckets + h, hash_list) {
+			BUG_ON(atomic_read(&reg->pending));
+			mempool_free(reg, rh->region_pool);
+		}
+	}
+
+	dm_destroy_dirty_log(rh->log);
+
+	if (rh->region_pool)
+		mempool_destroy(rh->region_pool);
+
+	vfree(rh->buckets);
+	kfree(rh);
+}
+EXPORT_SYMBOL(rh_exit);
+
+#define RH_HASH_MULT 2654435387U
+
+static inline unsigned rh_hash(struct region_hash *rh, region_t region)
+{
+	return (unsigned) ((region * RH_HASH_MULT) >> 12) & rh->mask;
+}
+
+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	list_for_each_entry(reg, rh->buckets + rh_hash(rh, region), hash_list) {
+		if (reg->key == region)
+			return reg;
+	}
+
+	return NULL;
+}
+
+static void __rh_insert(struct region_hash *rh, struct region *reg)
+{
+	unsigned h = rh_hash(rh, reg->key);
+	list_add(&reg->hash_list, rh->buckets + h);
+}
+
+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
+{
+	struct region *reg, *nreg;
+
+	read_unlock(&rh->hash_lock);
+	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
+		      RH_CLEAN : RH_NOSYNC;
+	nreg->rh = rh;
+	nreg->key = region;
+
+	INIT_LIST_HEAD(&nreg->list);
+
+	atomic_set(&nreg->pending, 0);
+	bio_list_init(&nreg->delayed_bios);
+
+	write_lock_irq(&rh->hash_lock);
+
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* we lost the race */
+		mempool_free(nreg, rh->region_pool);
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg->state == RH_CLEAN) {
+			spin_lock(&rh->region_lock);
+			list_add(&nreg->list, &rh->clean_regions);
+			spin_unlock(&rh->region_lock);
+		}
+		reg = nreg;
+	}
+
+	write_unlock_irq(&rh->hash_lock);
+	read_lock(&rh->hash_lock);
+
+	return reg;
+}
+
+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	reg = __rh_lookup(rh, region);
+	if (!reg)
+		reg = __rh_alloc(rh, region);
+
+	return reg;
+}
+
+int rh_state(void *v, region_t region, int may_block)
+{
+	int r = 0;
+	struct region *reg;
+	struct region_hash *rh = v;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		r = reg->state;
+	read_unlock(&rh->hash_lock);
+
+	if (r)
+		return r;
+
+	/*
+	 * The region wasn't in the hash, so we fall back to the dirty log.
+	 */
+	r = rh->log->type->in_sync(rh->log, region, may_block);
+
+	/*
+	 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
+	 * taken as a RH_NOSYNC
+	 */
+	return r == 1 ? RH_CLEAN : RH_NOSYNC;
+}
+EXPORT_SYMBOL(rh_state);
+
+void rh_update_states(void *v)
+{
+	struct region *reg, *next;
+	struct region_hash *rh = v;
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 */
+	write_lock_irq(&rh->hash_lock);
+	spin_lock(&rh->region_lock);
+	if (!list_empty(&rh->clean_regions)) {
+		list_splice(&rh->clean_regions, &clean);
+		INIT_LIST_HEAD(&rh->clean_regions);
+
+		list_for_each_entry(reg, &clean, list) {
+			rh->log->type->clear_region(rh->log, reg->key);
+			list_del(&reg->hash_list);
+		}
+	}
+
+	if (!list_empty(&rh->recovered_regions)) {
+		list_splice(&rh->recovered_regions, &recovered);
+		INIT_LIST_HEAD(&rh->recovered_regions);
+
+		list_for_each_entry(reg, &recovered, list)
+			list_del(&reg->hash_list);
+	}
+
+	spin_unlock(&rh->region_lock);
+	write_unlock_irq(&rh->hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_entry_safe (reg, next, &recovered, list) {
+		if (reg->state != RH_ERROR)
+			rh->log->type->clear_region(rh->log, reg->key);
+
+		rh->log->type->set_region_sync(rh->log, reg->key,
+					       reg->state != RH_ERROR);
+		up(&rh->recovery_count);
+		if (reg->delayed_bios.head)
+			rh->dispatch(rh->dispatch_context, &reg->delayed_bios);
+
+		mempool_free(reg, rh->region_pool);
+	}
+
+	if (!list_empty(&recovered))
+		rh_flush(rh);
+
+	list_for_each_entry_safe(reg, next, &clean, list)
+		mempool_free(reg, rh->region_pool);
+}
+EXPORT_SYMBOL(rh_update_states);
+
+void rh_inc(void *v, region_t region)
+{
+	struct region *reg;
+	struct region_hash *rh = v;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	if (reg->state == RH_CLEAN) {
+		rh->log->type->mark_region(rh->log, reg->key);
+
+		spin_lock_irq(&rh->region_lock);
+		reg->state = RH_DIRTY;
+		list_del_init(&reg->list);	/* Take off the clean list. */
+		spin_unlock_irq(&rh->region_lock);
+	}
+
+	atomic_inc(&reg->pending);
+	read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL(rh_inc);
+
+void rh_inc_pending(void *v, struct bio_list *bios)
+{
+	struct bio *bio;
+	struct region_hash *rh = v;
+
+	for (bio = bios->head; bio; bio = bio->bi_next)
+		rh_inc(rh, rh_bio_to_region(rh, bio));
+}
+EXPORT_SYMBOL(rh_inc_pending);
+
+void rh_dec(void *v, region_t region)
+{
+	unsigned long flags;
+	struct region *reg;
+	struct region_hash *rh = v;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	BUG_ON(!reg);
+
+	if (atomic_dec_and_test(&reg->pending)) {
+		spin_lock_irqsave(&rh->region_lock, flags);
+		if (reg->state == RH_RECOVERING) {
+			list_add_tail(&reg->list, &rh->quiesced_regions);
+		} else {
+			reg->state = RH_CLEAN;
+			list_add(&reg->list, &rh->clean_regions);
+		}
+		spin_unlock_irqrestore(&rh->region_lock, flags);
+	}
+}
+EXPORT_SYMBOL(rh_dec);
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct region_hash *rh)
+{
+	int r;
+	struct region *reg;
+	region_t region;
+
+	/*
+	 * Ask the dirty log what's next.
+	 */
+	r = rh->log->type->get_resync_work(rh->log, &region);
+	if (r <= 0)
+		return r;
+
+	/*
+	 * Get this region, and start it quiescing
+	 * by setting the recovering flag.
+	 */
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&rh->hash_lock);
+
+	spin_lock_irq(&rh->region_lock);
+
+	reg->state = RH_RECOVERING;
+
+	/* Already quiesced ? */
+	list_del_init(&reg->list);
+	if (!atomic_read(&reg->pending))
+		list_add(&reg->list, &rh->quiesced_regions);
+
+	spin_unlock_irq(&rh->region_lock);
+
+	return 1;
+}
+
+int rh_recovery_prepare(void *v)
+{
+	struct region_hash *rh = v;
+
+	if (test_bit(RECOVERY, &rh->flags)) {
+		while (!down_trylock(&rh->recovery_count)) {
+			if (__rh_recovery_prepare(rh) <= 0) {
+				up(&rh->recovery_count);
+				return -ENOENT;
+			}
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(rh_recovery_prepare);
+
+/*
+ * Returns any quiesced regions.
+ */
+void *rh_recovery_start(void *v)
+{
+	struct region *reg = NULL;
+	struct region_hash *rh = v;
+
+	spin_lock_irq(&rh->region_lock);
+	if (!list_empty(&rh->quiesced_regions)) {
+		reg = list_entry(rh->quiesced_regions.next,
+				 struct region, list);
+		list_del_init(&reg->list);	/* Remove from the quiesced list. */
+	}
+	spin_unlock_irq(&rh->region_lock);
+
+	return (void*) reg;
+}
+EXPORT_SYMBOL(rh_recovery_start);
+
+/*
+ * Put region on list of recovered ones.
+ */
+void rh_recovery_end(void *v, int error)
+{
+	struct region *reg = v;
+	struct region_hash *rh = reg->rh;
+
+	if (error)
+		reg->state = RH_ERROR;
+
+	spin_lock_irq(&rh->region_lock);
+	list_add(&reg->list, &rh->recovered_regions);
+	spin_unlock_irq(&rh->region_lock);
+}
+EXPORT_SYMBOL(rh_recovery_end);
+
+void rh_flush(void *v)
+{
+	struct region_hash *rh = v;
+
+	rh->log->type->flush(rh->log);
+}
+EXPORT_SYMBOL(rh_flush);
+
+void rh_delay_by_region(void *v, struct bio *bio, region_t region)
+{
+	struct region_hash *rh = v;
+	struct region *reg;
+
+	read_lock(&rh->hash_lock);
+	reg = __rh_find(rh, region);
+	bio_list_add(&reg->delayed_bios, bio);
+	read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL(rh_delay_by_region);
+
+void rh_delay(void *v, struct bio *bio)
+{
+	return rh_delay_by_region(v, bio, rh_bio_to_region(v, bio));
+}
+EXPORT_SYMBOL(rh_delay);
+
+void rh_stop_recovery(void *v)
+{
+	int i;
+	struct region_hash *rh = v;
+
+	clear_bit(RECOVERY, &rh->flags);
+	rh->wake(rh->wake_context);
+
+	/* wait for any recovering regions */
+	for (i = 0; i < rh->max_recovery; i++)
+		down(&rh->recovery_count);
+}
+EXPORT_SYMBOL(rh_stop_recovery);
+
+void rh_start_recovery(void *v)
+{
+	int i;
+	struct region_hash *rh = v;
+
+	set_bit(RECOVERY, &rh->flags);
+	for (i = 0; i < rh->max_recovery; i++)
+		up(&rh->recovery_count);
+
+	rh->wake(rh->wake_context);
+}
+EXPORT_SYMBOL(rh_start_recovery);
+
+MODULE_DESCRIPTION(DM_NAME " region hash");
+MODULE_AUTHOR("Heinz Mauelshagen");
+MODULE_LICENSE("GPL");
Index: linux/drivers/md/dm-regions.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/drivers/md/dm-regions.h	2007-06-06 20:40:09.000000000 +0100
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ * Copyright (C) 2004-2007 Red Hat Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_REGIONS_H
+#define DM_REGIONS_H
+
+#include "dm.h"
+#include "dm-log.h"
+#include "dm-bio-list.h"
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *----------------------------------------------------------------*/
+
+/*
+ * States a region can have.
+ */
+enum {
+	RH_CLEAN	= 0x01,	/* No writes in flight. */
+	RH_DIRTY	= 0x02,	/* Writes in flight. */
+	RH_NOSYNC	= 0x04,	/* Out of sync. */
+	RH_RECOVERING	= 0x08,	/* Under resynchronization. */
+	RH_ERROR	= 0x10,	/* Error recovering region */
+};
+
+/*
+ * Conversion fns
+ */
+region_t rh_bio_to_region(void *rh, struct bio *bio);
+region_t rh_sector_to_region(void *rh, sector_t sector);
+sector_t rh_region_to_sector(void *rh, region_t region);
+
+
+/*
+ * Functions to set a caller context in a region.
+ */
+void *rh_reg_get_context(void *reg);
+void rh_reg_set_context(void *reg, void *context);
+
+/*
+ * Region hash and region parameters.
+ */
+sector_t rh_get_region_size(void *rh);
+region_t rh_get_region_key(void *reg);
+
+int rh_init(void **rh,
+	    unsigned max_recovery,
+	    void (*dispatch)(void *dispatch_context, struct bio_list *bios),
+	    void *dispatch_context,
+	    void (*wake)(void *wake_context),
+	    void *wake_context,
+	    struct dirty_log *log, uint32_t region_size, region_t nr_regions);
+void rh_exit(void *rh);
+
+int rh_state(void *rh, region_t region, int may_block);
+void rh_update_states(void *rh);
+void rh_flush(void *rh);
+
+void rh_inc(void *rh, region_t region);
+void rh_inc_pending(void *rh, struct bio_list *bios);
+void rh_dec(void *rh, region_t region);
+void rh_delay(void *rh, struct bio *bio);
+void rh_delay_by_region(void *rh, struct bio *bio, region_t region);
+
+int rh_recovery_prepare(void *rh);
+void *rh_recovery_start(void *rh);
+void rh_recovery_end(void *reg, int error);
+void rh_stop_recovery(void *rh);
+void rh_start_recovery(void *rh);
+
+#endif
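
For readers wiring this interface into a target, here is a minimal usage sketch of the
constructor side: creating the region hash with rh_init() and supplying the dispatch/wake
callbacks it expects.  This is illustrative only and not part of the patch; every name
beginning with example_ is invented, the dirty log is assumed to have been created
beforehand through the existing dm-log interface, and error handling is omitted.

#include <linux/blkdev.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#include "dm.h"
#include "dm-log.h"
#include "dm-regions.h"

struct example_set {
	void *rh;			/* opaque handle returned by rh_init() */
	struct workqueue_struct *wq;	/* worker that runs example_do_work() */
	struct work_struct work;
	spinlock_t lock;
	struct bio_list writes;		/* writes queued by the map function */
};

/* 'dispatch' callback: writes delayed on a now-recovered region come back here. */
static void example_dispatch(void *context, struct bio_list *bios)
{
	struct example_set *es = context;

	spin_lock_irq(&es->lock);
	bio_list_merge(&es->writes, bios);
	bio_list_init(bios);
	spin_unlock_irq(&es->lock);
}

/* 'wake' callback: rh_start_recovery()/rh_stop_recovery() poke the worker. */
static void example_wake(void *context)
{
	struct example_set *es = context;

	queue_work(es->wq, &es->work);
}

static int example_ctr_regions(struct example_set *es, struct dirty_log *log,
			       uint32_t region_size, region_t nr_regions)
{
	/* Recover at most one region at a time in this example. */
	return rh_init(&es->rh, 1, example_dispatch, es, example_wake, es,
		       log, region_size, nr_regions);
}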
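
Continuing the same hypothetical example, a worker function could drive the write path and
the resynchronization machinery in the order the interface expects, much as dm-raid1 does
today with its private copy of this code: retire state with rh_update_states(), mark and
issue writes with rh_inc_pending() and rh_flush(), then feed recovery with
rh_recovery_prepare() and rh_recovery_start(), leaving rh_recovery_end() to the copy
completion.  Again a sketch under the same assumptions, not code from the patch; the
kcopyd submission is elided.

/* Called from the target's write end_io path: drop the region's pending count. */
static void example_write_endio(struct example_set *es, struct bio *bio)
{
	rh_dec(es->rh, rh_bio_to_region(es->rh, bio));
}

static void example_do_work(struct work_struct *work)
{
	struct example_set *es = container_of(work, struct example_set, work);
	struct bio_list writes;
	struct bio *bio;
	void *reg;

	/* Retire clean regions, finish recovered ones, dispatch delayed bios. */
	rh_update_states(es->rh);

	/* Take the writes queued so far. */
	spin_lock_irq(&es->lock);
	writes = es->writes;
	bio_list_init(&es->writes);
	spin_unlock_irq(&es->lock);

	/*
	 * Mark every written region dirty (one rh_inc() per bio), commit the
	 * log, then issue the writes.  A write to a region that is currently
	 * RH_RECOVERING would instead be held back with rh_delay(es->rh, bio).
	 */
	rh_inc_pending(es->rh, &writes);
	rh_flush(es->rh);
	while ((bio = bio_list_pop(&writes)))
		generic_make_request(bio);	/* end_io calls example_write_endio() */

	/* Quiesce the regions the dirty log wants resynchronized ... */
	rh_recovery_prepare(es->rh);

	/* ... and recover each quiesced region. */
	while ((reg = rh_recovery_start(es->rh))) {
		/*
		 * Hand 'reg' to kcopyd here; the copy completion callback
		 * must finish with rh_recovery_end(reg, error).
		 */
	}
}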