Experimental version of the snapshot and mirror targets for 2.6. --- diff/drivers/md/Kconfig 2003-12-29 10:09:48.000000000 +0000 +++ source/drivers/md/Kconfig 2003-12-29 10:12:47.000000000 +0000 @@ -143,5 +143,18 @@ Recent tools use a new version of the ioctl interface, only select this option if you intend using such tools. +config DM_SNAPSHOT + tristate "Snapshot target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Allow volume managers to take writeable snapshots of a device. + +config DM_MIRROR + tristate "Mirror target (EXPERIMENTAL)" + depends on BLK_DEV_DM && EXPERIMENTAL + ---help--- + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. + endmenu --- diff/drivers/md/Makefile 2003-02-13 11:46:52.000000000 +0000 +++ source/drivers/md/Makefile 2003-12-29 10:12:47.000000000 +0000 @@ -3,7 +3,9 @@ # dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o + dm-ioctl.o dm-io.o kcopyd.o dm-daemon.o + +dm-mirror-objs := dm-log.o dm-raid1.o # Note: link order is important. All raid personalities # and xor.o must come before md.o, as they each initialise @@ -17,3 +19,5 @@ obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_BLK_DEV_MD) += md.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o dm-exception-store.o +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o --- diff/drivers/md/dm-linear.c 2003-09-30 15:46:14.000000000 +0100 +++ source/drivers/md/dm-linear.c 2003-12-29 10:12:47.000000000 +0000 @@ -65,7 +65,8 @@ kfree(lc); } -static int linear_map(struct dm_target *ti, struct bio *bio) +static int linear_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) { struct linear_c *lc = (struct linear_c *) ti->private; --- diff/drivers/md/dm-stripe.c 2003-09-30 15:46:14.000000000 +0100 +++ source/drivers/md/dm-stripe.c 2003-12-29 10:12:47.000000000 +0000 @@ -166,7 +166,8 @@ kfree(sc); } -static int stripe_map(struct dm_target *ti, struct bio *bio) +static int stripe_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) { struct stripe_c *sc = (struct stripe_c *) ti->private; --- diff/drivers/md/dm-table.c 2003-12-29 10:09:48.000000000 +0000 +++ source/drivers/md/dm-table.c 2003-12-29 10:12:47.000000000 +0000 @@ -149,7 +149,7 @@ return 0; } -static void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) { unsigned long size; void *addr; --- diff/drivers/md/dm-target.c 2003-06-30 10:07:21.000000000 +0100 +++ source/drivers/md/dm-target.c 2003-12-29 10:12:47.000000000 +0000 @@ -157,7 +157,8 @@ /* empty */ } -static int io_err_map(struct dm_target *ti, struct bio *bio) +static int io_err_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) { return -EIO; } --- diff/drivers/md/dm.c 2003-12-29 10:09:48.000000000 +0000 +++ source/drivers/md/dm.c 2003-12-29 10:12:47.000000000 +0000 @@ -6,6 +6,9 @@ #include "dm.h" +// FIXME: remove this +#include "dm-log.h" + #include #include #include @@ -20,6 +23,9 @@ static unsigned int major = 0; static unsigned int _major = 0; +/* + * One of these is allocated per bio. + */ struct dm_io { struct mapped_device *md; int error; @@ -27,6 +33,16 @@ atomic_t io_count; }; +/* + * One of these is allocated per target within a bio. Hopefully + * this will be simplified out one day. 
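+ * The embedded map_info is handed to the target's map and + * end_io methods as their map_context argument, so a target + * can carry per-io state between the two calls.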
+ */ +struct target_io { + struct dm_io *io; + struct dm_target *ti; + union map_info info; +}; + struct deferred_io { struct bio *bio; struct deferred_io *next; @@ -63,6 +79,7 @@ * io objects are allocated from here. */ mempool_t *io_pool; + mempool_t *tio_pool; /* * Event handling. @@ -73,6 +90,7 @@ #define MIN_IOS 256 static kmem_cache_t *_io_cache; +static kmem_cache_t *_tio_cache; static __init int local_init(void) { @@ -84,9 +102,18 @@ if (!_io_cache) return -ENOMEM; + /* allocate a slab for the target ios */ + _tio_cache = kmem_cache_create("dm tio", sizeof(struct target_io), + 0, 0, NULL, NULL); + if (!_tio_cache) { + kmem_cache_destroy(_io_cache); + return -ENOMEM; + } + _major = major; r = register_blkdev(_major, _name); if (r < 0) { + kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); return r; } @@ -99,6 +126,7 @@ static void local_exit(void) { + kmem_cache_destroy(_tio_cache); kmem_cache_destroy(_io_cache); if (unregister_blkdev(_major, _name) < 0) @@ -124,6 +152,7 @@ xx(dm_target) xx(dm_linear) xx(dm_stripe) + xx(kcopyd) xx(dm_interface) #undef xx }; @@ -184,6 +213,16 @@ mempool_free(io, md->io_pool); } +static inline struct target_io *alloc_tio(struct mapped_device *md) +{ + return mempool_alloc(md->tio_pool, GFP_NOIO); +} + +static inline void free_tio(struct mapped_device *md, struct target_io *tio) +{ + mempool_free(tio, md->tio_pool); +} + static inline struct deferred_io *alloc_deferred(void) { return kmalloc(sizeof(struct deferred_io), GFP_NOIO); @@ -229,15 +268,6 @@ * interests of getting something for people to use I give * you this clearly demarcated crap. *---------------------------------------------------------------*/ -static inline sector_t to_sector(unsigned int bytes) -{ - return bytes >> SECTOR_SHIFT; -} - -static inline unsigned int to_bytes(sector_t sector) -{ - return sector << SECTOR_SHIFT; -} /* * Decrements the number of outstanding ios that a bio has been @@ -266,17 +296,30 @@ static int clone_endio(struct bio *bio, unsigned int done, int error) { - struct dm_io *io = bio->bi_private; + int r = 0; + struct target_io *tio = bio->bi_private; + struct dm_io *io = tio->io; + dm_endio_fn endio = tio->ti->type->end_io; if (bio->bi_size) return 1; + if (endio) { + r = endio(tio->ti, bio, error, &tio->info); + if (r < 0) + error = r; + + else if (r > 0) + /* the target wants another shot at the io */ + return 1; /* FIXME: do we need to reset bio at all ? */ + } + + free_tio(io->md, tio); dec_pending(io, error); bio_put(bio); - return 0; + return r; } - static sector_t max_io_len(struct mapped_device *md, sector_t sector, struct dm_target *ti) { @@ -297,7 +340,8 @@ return len; } -static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_io *io) +static void __map_bio(struct dm_target *ti, struct bio *clone, + struct target_io *tio) { int r; @@ -307,22 +351,24 @@ BUG_ON(!clone->bi_size); clone->bi_end_io = clone_endio; - clone->bi_private = io; + clone->bi_private = tio; /* * Map the clone. If r == 0 we don't need to do * anything, the target has assumed ownership of * this io. 
*/ - atomic_inc(&io->io_count); - r = ti->type->map(ti, clone); + atomic_inc(&tio->io->io_count); + r = ti->type->map(ti, clone, &tio->info); if (r > 0) /* the bio has been remapped so dispatch it */ generic_make_request(clone); - else if (r < 0) + else if (r < 0) { /* error the io and bail out */ - dec_pending(io, -EIO); + free_tio(tio->io->md, tio); + dec_pending(tio->io, -EIO); + } } struct clone_info { @@ -381,6 +427,15 @@ struct bio *clone, *bio = ci->bio; struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector); sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); + struct target_io *tio; + + /* + * Allocate a target io object. + */ + tio = alloc_tio(ci->md); + tio->io = ci->io; + tio->ti = ti; + memset(&tio->info, 0, sizeof(tio->info)); if (ci->sector_count <= max) { /* @@ -389,7 +444,7 @@ */ clone = clone_bio(bio, ci->sector, ci->idx, bio->bi_vcnt - ci->idx, ci->sector_count); - __map_bio(ti, clone, ci->io); + __map_bio(ti, clone, tio); ci->sector_count = 0; } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { @@ -412,7 +467,7 @@ } clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len); - __map_bio(ti, clone, ci->io); + __map_bio(ti, clone, tio); ci->sector += len; ci->sector_count -= len; @@ -427,7 +482,7 @@ clone = split_bvec(bio, ci->sector, ci->idx, bv->bv_offset, max); - __map_bio(ti, clone, ci->io); + __map_bio(ti, clone, tio); ci->sector += max; ci->sector_count -= max; @@ -436,7 +491,7 @@ len = to_sector(bv->bv_len) - max; clone = split_bvec(bio, ci->sector, ci->idx, bv->bv_offset + to_bytes(max), len); - __map_bio(ti, clone, ci->io); + __map_bio(ti, clone, tio); ci->sector += len; ci->sector_count -= len; @@ -588,41 +643,33 @@ /* get a minor number for the dev */ r = persistent ? specific_minor(minor) : next_free_minor(&minor); - if (r < 0) { - kfree(md); - return NULL; - } + if (r < 0) + goto bad1; memset(md, 0, sizeof(*md)); init_rwsem(&md->lock); atomic_set(&md->holders, 1); md->queue = blk_alloc_queue(GFP_KERNEL); - if (!md->queue) { - kfree(md); - return NULL; - } + if (!md->queue) + goto bad1; md->queue->queuedata = md; blk_queue_make_request(md->queue, dm_request); md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, mempool_free_slab, _io_cache); - if (!md->io_pool) { - free_minor(minor); - blk_put_queue(md->queue); - kfree(md); - return NULL; - } + if (!md->io_pool) + goto bad2; + + md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _tio_cache); + if (!md->tio_pool) + goto bad3; md->disk = alloc_disk(1); - if (!md->disk) { - mempool_destroy(md->io_pool); - free_minor(minor); - blk_put_queue(md->queue); - kfree(md); - return NULL; - } + if (!md->disk) + goto bad4; md->disk->major = _major; md->disk->first_minor = minor; @@ -637,11 +684,22 @@ init_waitqueue_head(&md->eventq); return md; + + bad4: + mempool_destroy(md->tio_pool); + bad3: + mempool_destroy(md->io_pool); + bad2: + free_minor(minor); + bad1: + kfree(md); + return NULL; } static void free_dev(struct mapped_device *md) { free_minor(md->disk->first_minor); + mempool_destroy(md->tio_pool); mempool_destroy(md->io_pool); del_gendisk(md->disk); put_disk(md->disk); --- diff/drivers/md/dm.h 2003-12-29 10:09:48.000000000 +0000 +++ source/drivers/md/dm.h 2003-12-29 10:12:47.000000000 +0000 @@ -151,6 +151,16 @@ return dm_round_up(n, size) / size; } +static inline sector_t to_sector(unsigned long n) +{ + return (n >> 9); +} + +static inline unsigned long to_bytes(sector_t n) +{ + return (n << 9); +} + /* * The device-mapper can be driven through 
one of two interfaces; * ioctl or filesystem, depending which patch you have applied. @@ -167,4 +177,9 @@ int dm_stripe_init(void); void dm_stripe_exit(void); +int kcopyd_init(void); +void kcopyd_exit(void); + +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); + #endif --- diff/include/linux/device-mapper.h 2003-06-30 10:07:24.000000000 +0100 +++ source/include/linux/device-mapper.h 2003-12-29 10:12:47.000000000 +0000 @@ -13,6 +13,11 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; +union map_info { + void *ptr; + unsigned long long ll; +}; + /* * In the constructor the target parameter will already have the * table, type, begin and len fields filled in. @@ -32,7 +37,19 @@ * = 0: The target will handle the io by resubmitting it later * > 0: simple remap complete */ -typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio); +typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio, + union map_info *map_context); + +/* + * Returns: + * < 0 : error (currently ignored) + * 0 : ended successfully + * 1 : for some reason the io has still not completed (eg, + * multipath target might want to requeue a failed io). + */ +typedef int (*dm_endio_fn) (struct dm_target *ti, + struct bio *bio, int error, + union map_info *map_context); typedef void (*dm_suspend_fn) (struct dm_target *ti); typedef void (*dm_resume_fn) (struct dm_target *ti); @@ -60,6 +77,7 @@ dm_ctr_fn ctr; dm_dtr_fn dtr; dm_map_fn map; + dm_endio_fn end_io; dm_suspend_fn suspend; dm_resume_fn resume; dm_status_fn status; --- diff/mm/mempool.c 2003-01-13 14:18:16.000000000 +0000 +++ source/mm/mempool.c 2003-12-29 10:12:47.000000000 +0000 @@ -89,11 +89,6 @@ } EXPORT_SYMBOL(mempool_create); -/* - * mempool_resize is disabled for now, because it has no callers. Feel free - * to turn it back on if needed. - */ -#if 0 /** * mempool_resize - resize an existing memory pool * @pool: pointer to the memory pool which was allocated via @@ -163,7 +158,6 @@ return 0; } EXPORT_SYMBOL(mempool_resize); -#endif /** * mempool_destroy - deallocate a memory pool --- diff/drivers/md/dm-daemon.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-daemon.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#include "dm.h" +#include "dm-daemon.h" + +#include +#include +#include + +static int daemon(void *arg) +{ + struct dm_daemon *dd = (struct dm_daemon *) arg; + DECLARE_WAITQUEUE(wq, current); + + daemonize("%s", dd->name); + + atomic_set(&dd->please_die, 0); + + add_wait_queue(&dd->job_queue, &wq); + + down(&dd->run_lock); + up(&dd->start_lock); + + /* + * dd->fn() could do anything, very likely it will + * suspend. So we can't set the state to + * TASK_INTERRUPTIBLE before calling it. In order to + * prevent a race with a waking thread we do this little + * dance with the dd->woken variable. + */ + while (1) { + if (atomic_read(&dd->please_die)) + goto out; + +#if 0 + /* FIXME: not convinced by this */ + if (current->flags & PF_FREEZE) + refrigerator(PF_IOTHREAD); +#endif + + do { + set_current_state(TASK_RUNNING); + atomic_set(&dd->woken, 0); + dd->fn(); + set_current_state(TASK_INTERRUPTIBLE); + + } while (atomic_read(&dd->woken)); + + schedule(); + } + + out: + remove_wait_queue(&dd->job_queue, &wq); + up(&dd->run_lock); + return 0; +} + +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)) +{ + pid_t pid = 0; + + /* + * Initialise the dm_daemon. 
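+ * Note that the dm_daemon structure is caller-allocated: it + * must stay alive until dm_daemon_stop() returns, since the + * daemon thread dereferences it throughout its life.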
+ */ + dd->fn = fn; + strncpy(dd->name, name, sizeof(dd->name) - 1); + sema_init(&dd->start_lock, 1); + sema_init(&dd->run_lock, 1); + init_waitqueue_head(&dd->job_queue); + + /* + * Start the new thread. + */ + down(&dd->start_lock); + pid = kernel_thread(daemon, dd, CLONE_KERNEL); + if (pid <= 0) { + DMERR("Failed to start %s thread", name); + return -EAGAIN; + } + + /* + * Wait for the daemon to up this mutex. + */ + down(&dd->start_lock); + up(&dd->start_lock); + + return 0; +} + +void dm_daemon_stop(struct dm_daemon *dd) +{ + atomic_set(&dd->please_die, 1); + dm_daemon_wake(dd); + down(&dd->run_lock); + up(&dd->run_lock); +} + +void dm_daemon_wake(struct dm_daemon *dd) +{ + atomic_set(&dd->woken, 1); + wake_up_interruptible(&dd->job_queue); +} + +EXPORT_SYMBOL(dm_daemon_start); +EXPORT_SYMBOL(dm_daemon_stop); +EXPORT_SYMBOL(dm_daemon_wake); --- diff/drivers/md/dm-daemon.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-daemon.h 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DAEMON_H +#define DM_DAEMON_H + +#include +#include + +struct dm_daemon { + void (*fn)(void); + char name[16]; + atomic_t please_die; + struct semaphore start_lock; + struct semaphore run_lock; + + atomic_t woken; + wait_queue_head_t job_queue; +}; + +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)); +void dm_daemon_stop(struct dm_daemon *dd); +void dm_daemon_wake(struct dm_daemon *dd); +int dm_daemon_running(struct dm_daemon *dd); + +#endif --- diff/drivers/md/dm-exception-store.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-exception-store.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,674 @@ +/* + * dm-exception-store.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-snapshot.h" +#include "dm-io.h" +#include "kcopyd.h" + +#include +#include +#include +#include + +/*----------------------------------------------------------------- + * Persistent snapshots; by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk-aligned areas + * of the COW store. It makes sense, therefore, to store the + * metadata in chunk-sized blocks. + * + * There is no backward or forward compatibility implemented; + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on-disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble, isn't it? + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +struct disk_header { + uint32_t magic; + + /* + * Is this snapshot valid?
There is no way of recovering + * an invalid snapshot. + */ + uint32_t valid; + + /* + * Simple, incrementing version. No backward + * compatibility. + */ + uint32_t version; + + /* In sectors */ + uint32_t chunk_size; +}; + +struct disk_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct pstore { + struct dm_snapshot *snap; /* up pointer to my snapshot */ + int version; + int valid; + uint32_t chunk_size; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it won't hurt to have a + * whole chunk's worth of metadata in memory at once. + */ + void *area; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + uint32_t current_area; + + /* + * The next free chunk for an exception. + */ + uint32_t next_free; + + /* + * The index of the next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; +}; + +static inline unsigned int sectors_to_pages(unsigned int sectors) +{ + return sectors / (PAGE_SIZE >> 9); +} + +static int alloc_area(struct pstore *ps) +{ + int r = -ENOMEM; + size_t i, len, nr_pages; + struct page *page, *last = NULL; + + len = ps->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + return r; + + nr_pages = sectors_to_pages(ps->chunk_size); + + /* + * We lock the pages for ps->area into memory since + * they'll be doing a lot of io. We also chain them + * together ready for dm-io. + */ + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + SetPageLocked(page); + if (last) + last->list.next = &page->list; + last = page; + } + + return 0; +} + +static void free_area(struct pstore *ps) +{ + size_t i, nr_pages; + struct page *page; + + nr_pages = sectors_to_pages(ps->chunk_size); + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + page->list.next = NULL; + ClearPageLocked(page); + } + + vfree(ps->area); +} + +/* + * Read or write a chunk-aligned, chunk-sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) +{ + struct io_region where; + unsigned long bits; + + where.bdev = ps->snap->cow->bdev; + where.sector = ps->chunk_size * chunk; + where.count = ps->chunk_size; + + return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits); +} + +/* + * Read or write a metadata area, remembering to skip the first + * chunk, which holds the header.
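+ * Chunk 0 is the header, so metadata area 'area' lives at + * chunk 1 + (exceptions_per_area + 1) * area; see the + * calculation below.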
+ */ +static int area_io(struct pstore *ps, uint32_t area, int rw) +{ + int r; + uint32_t chunk; + + /* convert a metadata area index to a chunk index */ + chunk = 1 + ((ps->exceptions_per_area + 1) * area); + + r = chunk_io(ps, chunk, rw); + if (r) + return r; + + ps->current_area = area; + return 0; +} + +static int zero_area(struct pstore *ps, uint32_t area) +{ + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + return area_io(ps, area, WRITE); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + + r = chunk_io(ps, 0, READ); + if (r) + return r; + + dh = (struct disk_header *) ps->area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + + } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + ps->chunk_size = le32_to_cpu(dh->chunk_size); + + } else { + DMWARN("Invalid/corrupt snapshot"); + r = -ENXIO; + } + + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + + dh = (struct disk_header *) ps->area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(ps->valid); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->chunk_size); + + return chunk_io(ps, 0, WRITE); +} + +/* + * Access functions for the disk exceptions; these do the endian conversions. + */ +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +{ + if (index >= ps->exceptions_per_area) + return NULL; + + return ((struct disk_exception *) ps->area) + index; +} + +static int read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + result->old_chunk = le64_to_cpu(e->old_chunk); + result->new_chunk = le64_to_cpu(e->new_chunk); + + return 0; +} + +static int write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + e->old_chunk = cpu_to_le64(de->old_chunk); + e->new_chunk = cpu_to_le64(de->new_chunk); + + return 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, int *full) +{ + int r; + unsigned int i; + struct disk_exception de; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + r = read_exception(ps, i, &de); + + if (r) + return r; + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is, we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (de.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= de.new_chunk) + ps->next_free = de.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps) +{ + uint32_t area; + int r, full = 1; + + /* + * Keep reading chunks and inserting exceptions until + * we find a partially full area.
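+ * insert_exceptions() clears 'full' when it meets the + * terminating exception (new_chunk == 0), which ends this + * loop.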
+ */ + for (area = 0; full; area++) { + r = area_io(ps, area, READ); + if (r) + return r; + + r = insert_exceptions(ps, &full); + if (r) + return r; + } + + return 0; +} + +static inline struct pstore *get_info(struct exception_store *store) +{ + return (struct pstore *) store->context; +} + +static void persistent_fraction_full(struct exception_store *store, + sector_t *numerator, sector_t *denominator) +{ + *numerator = get_info(store)->next_free * store->snap->chunk_size; + *denominator = get_dev_size(store->snap->cow->bdev); +} + +static void persistent_destroy(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + dm_io_put(sectors_to_pages(ps->chunk_size)); + vfree(ps->callbacks); + free_area(ps); + kfree(ps); +} + +static int persistent_read_metadata(struct exception_store *store) +{ + int r, new_snapshot; + struct pstore *ps = get_info(store); + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + return r; + + /* + * Do we need to set up a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + return r; + } + + r = zero_area(ps, 0); + if (r) { + DMWARN("zero_area(0) failed"); + return r; + } + + } else { + /* + * Sanity checks. + */ + if (!ps->valid) { + DMWARN("snapshot is marked invalid"); + return -EINVAL; + } + + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; + } + + /* + * Read the metadata. + */ + r = read_exceptions(ps); + if (r) + return r; + } + + return 0; +} + +static int persistent_prepare(struct exception_store *store, + struct exception *e) +{ + struct pstore *ps = get_info(store); + uint32_t stride; + sector_t size = get_dev_size(store->snap->cow->bdev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move on to the next free chunk, making sure to take + * into account the location of the metadata chunks. + */ + stride = (ps->exceptions_per_area + 1); + if ((++ps->next_free % stride) == 1) + ps->next_free++; + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + int r; + unsigned int i; + struct pstore *ps = get_info(store); + struct disk_exception de; + struct commit_callback *cb; + + de.old_chunk = e->old_chunk; + de.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &de); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are no more exceptions in flight, or we have + * filled this metadata area, we commit the exceptions to + * disk. + */ + if (atomic_dec_and_test(&ps->pending_count) || + (ps->current_committed == ps->exceptions_per_area)) { + r = area_io(ps, ps->current_area, WRITE); + if (r) + ps->valid = 0; + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, r == 0 ? 1 : 0); + } + + ps->callback_count = 0; + } + + /* + * Have we completely filled the current area ?
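+ * If so, we move on to the next area and zero it, ready for + * the exceptions that will be committed there.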
+ */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } +} + +static void persistent_drop(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +{ + int r; + struct pstore *ps; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + + /* allocate the pstore */ + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) { + r = -ENOMEM; + goto bad; + } + + /* kmalloc doesn't zero; the error path below checks this field */ + ps->callbacks = NULL; + + ps->snap = store->snap; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->chunk_size = chunk_size; + ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->next_free = 2; /* skipping the header and first area */ + ps->current_committed = 0; + + r = alloc_area(ps); + if (r) + goto bad; + + /* + * Allocate space for all the callbacks. + */ + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = dm_vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + + if (!ps->callbacks) { + r = -ENOMEM; + goto bad; + } + + store->destroy = persistent_destroy; + store->read_metadata = persistent_read_metadata; + store->prepare_exception = persistent_prepare; + store->commit_exception = persistent_commit; + store->drop_snapshot = persistent_drop; + store->fraction_full = persistent_fraction_full; + store->context = ps; + + return 0; + + bad: + dm_io_put(sectors_to_pages(chunk_size)); + if (ps) { + if (ps->callbacks) + vfree(ps->callbacks); + + kfree(ps); + } + return r; +} + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots.
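+ * Nothing is written to the COW device here; the store just + * keeps an in-memory allocation cursor, next_free.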
+ *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +void transient_destroy(struct exception_store *store) +{ + kfree(store->context); +} + +int transient_read_metadata(struct exception_store *store) +{ + return 0; +} + +int transient_prepare(struct exception_store *store, struct exception *e) +{ + struct transient_c *tc = (struct transient_c *) store->context; + sector_t size = get_dev_size(store->snap->cow->bdev); + + if (size < (tc->next_free + store->snap->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store->snap, tc->next_free); + tc->next_free += store->snap->chunk_size; + + return 0; +} + +void transient_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, 1); +} + +static void transient_fraction_full(struct exception_store *store, + sector_t *numerator, sector_t *denominator) +{ + *numerator = ((struct transient_c *) store->context)->next_free; + *denominator = get_dev_size(store->snap->cow->bdev); +} + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize) +{ + struct transient_c *tc; + + memset(store, 0, sizeof(*store)); + store->destroy = transient_destroy; + store->read_metadata = transient_read_metadata; + store->prepare_exception = transient_prepare; + store->commit_exception = transient_commit; + store->fraction_full = transient_fraction_full; + store->snap = s; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} --- diff/drivers/md/dm-io.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-io.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,580 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#include "dm-io.h" + +#include +#include +#include +#include +#include + +#define BIO_POOL_SIZE 256 + + +/*----------------------------------------------------------------- + * Bio set, move this to bio.c + *---------------------------------------------------------------*/ +#define BV_NAME_SIZE 16 +struct biovec_pool { + int nr_vecs; + char name[BV_NAME_SIZE]; + kmem_cache_t *slab; + mempool_t *pool; + atomic_t allocated; /* FIXME: debug */ +}; + +#define BIOVEC_NR_POOLS 6 +struct bio_set { + char name[BV_NAME_SIZE]; + kmem_cache_t *bio_slab; + mempool_t *bio_pool; + struct biovec_pool pools[BIOVEC_NR_POOLS]; +}; + +/*----------------*/ + +static void bio_set_exit(struct bio_set *bs) +{ + unsigned i; + struct biovec_pool *bp; + + if (bs->bio_pool) + mempool_destroy(bs->bio_pool); + + if (bs->bio_slab) + kmem_cache_destroy(bs->bio_slab); + + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + bp = bs->pools + i; + if (bp->pool) + mempool_destroy(bp->pool); + + if (bp->slab) + kmem_cache_destroy(bp->slab); + } +} + +/*----------------*/ + +static void mk_name(char *str, size_t len, const char *prefix, unsigned count) +{ + int r; + + r = snprintf(str, len, "%s-%u", prefix, count); + if (r < 0) + str[len - 1] = '\0'; +} + +static int bio_set_init(struct bio_set *bs, const char *slab_prefix, + unsigned pool_entries, unsigned scale) +{ + /* FIXME: this must match bvec_index(), why not go the + * whole hog and have a pool per power of 2 ? 
*/ + static unsigned _vec_lengths[BIOVEC_NR_POOLS] = { + 1, 4, 16, 64, 128, BIO_MAX_PAGES + }; + + + int r; + unsigned i, size; + struct biovec_pool *bp; + + /* zero the bs so we can tear down properly on error */ + memset(bs, 0, sizeof(*bs)); + + /* + * Set up the bio pool. + */ + r = snprintf(bs->name, sizeof(bs->name), "%s-bio", slab_prefix); + if (r < 0) + bs->name[sizeof(bs->name) - 1] = '\0'; + + bs->bio_slab = kmem_cache_create(bs->name, sizeof(struct bio), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!bs->bio_slab) { + DMWARN("can't init bio slab"); + goto bad; + } + + bs->bio_pool = mempool_create(pool_entries, mempool_alloc_slab, + mempool_free_slab, bs->bio_slab); + if (!bs->bio_pool) { + DMWARN("can't init bio pool"); + goto bad; + } + + /* + * Set up the biovec pools. + */ + for (i = 0; i < BIOVEC_NR_POOLS; i++) { + bp = bs->pools + i; + bp->nr_vecs = _vec_lengths[i]; + atomic_set(&bp->allocated, 1); /* FIXME: debug */ + + + size = bp->nr_vecs * sizeof(struct bio_vec); + + mk_name(bp->name, sizeof(bp->name), slab_prefix, i); + bp->slab = kmem_cache_create(bp->name, size, 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!bp->slab) { + DMWARN("can't init biovec slab cache"); + goto bad; + } + + if (i >= scale) + pool_entries >>= 1; + + bp->pool = mempool_create(pool_entries, mempool_alloc_slab, + mempool_free_slab, bp->slab); + if (!bp->pool) { + DMWARN("can't init biovec mempool"); + goto bad; + } + } + + return 0; + + bad: + bio_set_exit(bs); + return -ENOMEM; +} + +/*----------------*/ + +/* FIXME: blech */ +static inline unsigned bvec_index(unsigned nr) +{ + switch (nr) { + case 1: return 0; + case 2 ... 4: return 1; + case 5 ... 16: return 2; + case 17 ... 64: return 3; + case 65 ... 128:return 4; + case 129 ... BIO_MAX_PAGES: return 5; + } + + BUG(); + return 0; +} + +static inline void bs_bio_init(struct bio *bio) +{ + bio->bi_next = NULL; + bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_rw = 0; + bio->bi_vcnt = 0; + bio->bi_idx = 0; + bio->bi_phys_segments = 0; + bio->bi_hw_segments = 0; + bio->bi_size = 0; + bio->bi_max_vecs = 0; + bio->bi_end_io = NULL; + atomic_set(&bio->bi_cnt, 1); + bio->bi_private = NULL; +} + +static unsigned _bio_count = 0; +struct bio *bio_set_alloc(struct bio_set *bs, int gfp_mask, int nr_iovecs) +{ + struct biovec_pool *bp; + struct bio_vec *bv = NULL; + unsigned long idx; + struct bio *bio; + + bio = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + + if (likely(nr_iovecs)) { + idx = bvec_index(nr_iovecs); + bp = bs->pools + idx; + bv = mempool_alloc(bp->pool, gfp_mask); + if (!bv) { + mempool_free(bio, bs->bio_pool); + return NULL; + } + + memset(bv, 0, bp->nr_vecs * sizeof(*bv)); + bio->bi_flags |= idx << BIO_POOL_OFFSET; + bio->bi_max_vecs = bp->nr_vecs; + atomic_inc(&bp->allocated); + } + + bio->bi_io_vec = bv; + return bio; +} + +static void bio_set_free(struct bio_set *bs, struct bio *bio) +{ + struct biovec_pool *bp = bs->pools + BIO_POOL_IDX(bio); + + if (atomic_dec_and_test(&bp->allocated)) + BUG(); + + mempool_free(bio->bi_io_vec, bp->pool); + mempool_free(bio, bs->bio_pool); +} + +/*----------------------------------------------------------------- + * dm-io proper + *---------------------------------------------------------------*/ +static struct bio_set _bios; + +/* FIXME: can we shrink this ? 
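+ * ('error' doubles as a bitset of per-region failures - see + * dec_count() - so an unsigned long caps the number of regions + * at BITS_PER_LONG.)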
+ */ +struct io { + unsigned long error; + atomic_t count; + struct task_struct *sleeper; + io_notify_fn callback; + void *context; +}; + +/* + * io contexts are only dynamically allocated for asynchronous + * io. Since async io is likely to be the majority of io we'll + * have the same number of io contexts as buffer heads! (FIXME: + * must reduce this). + */ +static unsigned _num_ios; +static mempool_t *_io_pool; + +static void *alloc_io(int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct io), gfp_mask); +} + +static void free_io(void *element, void *pool_data) +{ + kfree(element); +} + +static unsigned int pages_to_ios(unsigned int pages) +{ + return 4 * pages; /* too many ? */ +} + +static int resize_pool(unsigned int new_ios) +{ + int r = 0; + + if (_io_pool) { + if (new_ios == 0) { + /* free off the pool */ + mempool_destroy(_io_pool); + _io_pool = NULL; + bio_set_exit(&_bios); + + } else { + /* resize the pool */ + r = mempool_resize(_io_pool, new_ios, GFP_KERNEL); + } + + } else { + /* create new pool */ + _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL); + if (!_io_pool) + return -ENOMEM; + + r = bio_set_init(&_bios, "dm-io", 512, 1); + if (r) { + mempool_destroy(_io_pool); + _io_pool = NULL; + } + } + + if (!r) + _num_ios = new_ios; + + return r; +} + +int dm_io_get(unsigned int num_pages) +{ + return resize_pool(_num_ios + pages_to_ios(num_pages)); +} + +void dm_io_put(unsigned int num_pages) +{ + resize_pool(_num_ios - pages_to_ios(num_pages)); +} + +/*----------------------------------------------------------------- + * We need to keep track of which region a bio is doing io for. + * In order to save a memory allocation we store this in the last + * bvec which we know is unused (blech). + *---------------------------------------------------------------*/ +static inline void bio_set_region(struct bio *bio, unsigned region) +{ +// bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region; +} + +static inline unsigned bio_get_region(struct bio *bio) +{ +// return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len; + return 0; +} + +/*----------------------------------------------------------------- + * We need an io object to keep track of the number of bios that + * have been dispatched for a particular io. + *---------------------------------------------------------------*/ +static void dec_count(struct io *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error); + + if (atomic_dec_and_test(&io->count)) { + if (io->sleeper) + wake_up_process(io->sleeper); + + else { + int r = io->error; + io_notify_fn fn = io->callback; + void *context = io->context; + + mempool_free(io, _io_pool); + fn(r, context); + } + } +} + +static int endio(struct bio *bio, unsigned int done, int error) +{ + struct io *io = (struct io *) bio->bi_private; + + /* keep going until we've finished */ + if (bio->bi_size) + return 1; + + /* FIXME: kcopyd needs pages zeroing on read failure ??? + * sounds like kcopyd is broken */ + dec_count(io, bio_get_region(bio), error); + bio_put(bio); + + return 0; +} + +static void bio_dtr(struct bio *bio) +{ + _bio_count--; + bio_set_free(&_bios, bio); +} + +/*----------------------------------------------------------------- + * These little objects provide an abstraction for getting a new + * destination page for io.
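+ * Two flavours are defined below: one that walks a chain of + * pages (list_dp_init()), and one that walks a bvec array + * (bvec_dp_init()).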
+ *---------------------------------------------------------------*/ +struct dpages { + void (*get_page)(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset); + void (*next_page)(struct dpages *dp); + + unsigned context_u; + void *context_ptr; +}; + +/* + * Functions for getting the pages from a list. + */ +void list_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset) +{ + unsigned o = dp->context_u; + + *p = (struct page *) dp->context_ptr; + *len = PAGE_SIZE - o; + *offset = o; +} + +void list_next_page(struct dpages *dp) +{ + struct page *page = (struct page *) dp->context_ptr; + dp->context_ptr = list_entry(page->list.next, struct page, list); + dp->context_u = 0; +} + +void list_dp_init(struct dpages *dp, struct page *page, unsigned offset) +{ + dp->get_page = list_get_page; + dp->next_page = list_next_page; + dp->context_u = offset; + dp->context_ptr = page; +} + +/* + * Functions for getting the pages from a bvec. + */ +void bvec_get_page(struct dpages *dp, + struct page **p, unsigned long *len, unsigned *offset) +{ + struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; + *p = bvec->bv_page; + *len = bvec->bv_len; + *offset = bvec->bv_offset; +} + +void bvec_next_page(struct dpages *dp) +{ + struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; + dp->context_ptr = bvec + 1; +} + +void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) +{ + dp->get_page = bvec_get_page; + dp->next_page = bvec_next_page; + dp->context_ptr = bvec; +} + +/*----------------------------------------------------------------- + * IO routines that accept a list of pages. + *---------------------------------------------------------------*/ +static void do_region(int rw, unsigned int region, struct io_region *where, + struct dpages *dp, struct io *io) +{ + struct bio *bio; + struct page *page; + unsigned long len; + unsigned offset; + unsigned num_bvecs; + sector_t remaining = where->count; + + while (remaining) { + /* + * Allocate a suitably sized bio; we add an extra + * bvec for bio_get/set_region(). + */ + num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2; + _bio_count++; + bio = bio_set_alloc(&_bios, GFP_NOIO, num_bvecs); + bio->bi_sector = where->sector + (where->count - remaining); + bio->bi_bdev = where->bdev; + bio->bi_end_io = endio; + bio->bi_private = io; + bio->bi_destructor = bio_dtr; + bio_set_region(bio, region); + + /* + * Try and add as many pages as possible. + */ + while (remaining) { + dp->get_page(dp, &page, &len, &offset); + len = min(len, to_bytes(remaining)); + if (!bio_add_page(bio, page, len, offset)) + break; + + offset = 0; + remaining -= to_sector(len); + dp->next_page(dp); + } + + atomic_inc(&io->count); + submit_bio(rw, bio); + } +} + +static void dispatch_io(int rw, unsigned int num_regions, + struct io_region *where, struct dpages *dp, + struct io *io) +{ + int i; + + for (i = 0; i < num_regions; i++) + if (where[i].count) + do_region(rw, i, where + i, dp, io); + + /* + * Drop the extra reference that we were holding to avoid + * the io being completed too early.
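+ * (The count was primed with 1 by the caller; each bio + * submitted by do_region() took a further reference.)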
+ */ + dec_count(io, 0, 0); +} + +int sync_io(unsigned int num_regions, struct io_region *where, + int rw, struct dpages *dp, unsigned long *error_bits) +{ + struct io io; + + BUG_ON(num_regions > 1 && rw != WRITE); + + io.error = 0; + atomic_set(&io.count, 1); /* see dispatch_io() */ + io.sleeper = current; + + dispatch_io(rw, num_regions, where, dp, &io); + blk_run_queues(); + + while (1) { + /* FIXME: handle signals */ + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!atomic_read(&io.count)) + break; + + schedule(); + } + set_current_state(TASK_RUNNING); + + *error_bits = io.error; + return io.error ? -EIO : 0; +} + +int async_io(unsigned int num_regions, struct io_region *where, int rw, + struct dpages *dp, io_notify_fn fn, void *context) +{ + struct io *io = mempool_alloc(_io_pool, GFP_NOIO); + + io->error = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = NULL; + io->callback = fn; + io->context = context; + + dispatch_io(rw, num_regions, where, dp, io); + return 0; +} + +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + unsigned long *error_bits) +{ + struct dpages dp; + list_dp_init(&dp, pages, offset); + return sync_io(num_regions, where, rw, &dp, error_bits); +} + +int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, + struct bio_vec *bvec, unsigned long *error_bits) +{ + struct dpages dp; + bvec_dp_init(&dp, bvec); + return sync_io(num_regions, where, rw, &dp, error_bits); +} + +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + io_notify_fn fn, void *context) +{ + struct dpages dp; + list_dp_init(&dp, pages, offset); + return async_io(num_regions, where, rw, &dp, fn, context); +} + +int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, + struct bio_vec *bvec, io_notify_fn fn, void *context) +{ + struct dpages dp; + bvec_dp_init(&dp, bvec); + return async_io(num_regions, where, rw, &dp, fn, context); +} + + +EXPORT_SYMBOL(dm_io_get); +EXPORT_SYMBOL(dm_io_put); +EXPORT_SYMBOL(dm_io_sync); +EXPORT_SYMBOL(dm_io_async); --- diff/drivers/md/dm-io.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-io.h 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#ifndef _DM_IO_H +#define _DM_IO_H + +#include "dm.h" + +/* FIXME make this configurable */ +#define DM_MAX_IO_REGIONS 8 + +struct io_region { + struct block_device *bdev; + sector_t sector; + sector_t count; +}; + + +/* + * 'error' is a bitset, with each bit indicating whether an error + * occurred doing io to the corresponding region. + */ +typedef void (*io_notify_fn)(unsigned long error, void *context); + + +/* + * Before anyone uses the IO interface they should call + * dm_io_get(), specifying roughly how many pages they are + * expecting to perform io on concurrently. + * + * This function may block. + */ +int dm_io_get(unsigned int num_pages); +void dm_io_put(unsigned int num_pages); + + +/* + * Synchronous IO. + * + * Please ensure that the rw flag in the next two functions is + * either READ or WRITE, ie. we don't take READA. Any + * regions with a zero count field will be ignored. 
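+ * + * For example, a synchronous read of one region into a page + * list might look like this (a sketch only; 'bdev' and 'page' + * are assumed to have been set up by the caller): + * + * struct io_region where = { bdev, 0, 8 }; + * unsigned long error_bits; + * + * dm_io_get(1); + * dm_io_sync(1, &where, READ, page, 0, &error_bits); + * dm_io_put(1);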
+ */ +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + unsigned long *error_bits); + +int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw, + struct bio_vec *bvec, unsigned long *error_bits); + + +/* + * Asynchronous IO. + * + * The 'where' array may be safely allocated on the stack since + * the function takes a copy. + */ +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + io_notify_fn fn, void *context); + +int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw, + struct bio_vec *bvec, io_notify_fn fn, void *context); + +#endif --- diff/drivers/md/dm-log.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-log.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,307 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#include +#include +#include +#include + +#include "dm-log.h" +#include "dm-io.h" + +static LIST_HEAD(_log_types); +static spinlock_t _lock = SPIN_LOCK_UNLOCKED; + +int dm_register_dirty_log_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + type->use_count = 0; + try_module_get(type->module); + + list_add(&type->list, &_log_types); + spin_unlock(&_lock); + + return 0; +} + +int dm_unregister_dirty_log_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + + if (type->use_count) + DMWARN("Attempt to unregister a log type that is still in use"); + else { + list_del(&type->list); + module_put(type->module); + } + + spin_unlock(&_lock); + + return 0; +} + +static struct dirty_log_type *get_type(const char *type_name) +{ + struct dirty_log_type *type; + struct list_head *tmp; + + spin_lock(&_lock); + list_for_each (tmp, &_log_types) { + type = list_entry(tmp, struct dirty_log_type, list); + if (!strcmp(type_name, type->name)) { + type->use_count++; + spin_unlock(&_lock); + return type; + } + } + + spin_unlock(&_lock); + return NULL; +} + +static void put_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + type->use_count--; + spin_unlock(&_lock); +} + +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, + unsigned int argc, char **argv) +{ + struct dirty_log_type *type; + struct dirty_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return NULL; + + type = get_type(type_name); + if (!type) { + kfree(log); + return NULL; + } + + log->type = type; + if (type->ctr(log, dev_size, argc, argv)) { + kfree(log); + put_type(type); + return NULL; + } + + return log; +} + +void dm_destroy_dirty_log(struct dirty_log *log) +{ + log->type->dtr(log); + put_type(log->type); + kfree(log); +} + + +/*----------------------------------------------------------------- + * In-core log, ie. trivial, non-persistent. + * + * For now we'll keep this simple and just have 2 bitsets, one + * for clean/dirty, the other for sync/nosync. The sync bitset + * will be freed when everything is in sync.
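+ * Each region maps to a single bit in each set; core_ctr() + * below rounds the bitset allocation up to a whole number of + * longs.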
+ * + * FIXME: problems with a 64bit sector_t + *---------------------------------------------------------------*/ +struct core_log { + sector_t region_size; + unsigned int region_count; + unsigned long *clean_bits; + unsigned long *sync_bits; + unsigned long *recovering_bits; /* FIXME: this seems excessive */ + + int sync_search; +}; + +#define BYTE_SHIFT 3 +static int core_ctr(struct dirty_log *log, sector_t dev_size, + unsigned int argc, char **argv) +{ + struct core_log *clog; + sector_t region_size; + unsigned int region_count; + size_t bitset_size; + + if (argc != 1) { + DMWARN("wrong number of arguments to core_log"); + return -EINVAL; + } + + if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) { + DMWARN("invalid region size string"); + return -EINVAL; + } + + region_count = dm_div_up(dev_size, region_size); + + clog = kmalloc(sizeof(*clog), GFP_KERNEL); + if (!clog) { + DMWARN("couldn't allocate core log"); + return -ENOMEM; + } + + clog->region_size = region_size; + clog->region_count = region_count; + + /* + * Work out how many words we need to hold the bitset. + */ + bitset_size = dm_round_up(region_count, + sizeof(*clog->clean_bits) << BYTE_SHIFT); + bitset_size >>= BYTE_SHIFT; + + clog->clean_bits = vmalloc(bitset_size); + if (!clog->clean_bits) { + DMWARN("couldn't allocate clean bitset"); + kfree(clog); + return -ENOMEM; + } + memset(clog->clean_bits, -1, bitset_size); + + clog->sync_bits = vmalloc(bitset_size); + if (!clog->sync_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(clog->clean_bits); + kfree(clog); + return -ENOMEM; + } + memset(clog->sync_bits, 0, bitset_size); + + clog->recovering_bits = vmalloc(bitset_size); + if (!clog->recovering_bits) { + DMWARN("couldn't allocate recovering bitset"); + vfree(clog->sync_bits); + vfree(clog->clean_bits); + kfree(clog); + return -ENOMEM; + } + memset(clog->recovering_bits, 0, bitset_size); + clog->sync_search = 0; + log->context = clog; + return 0; +} + +static void core_dtr(struct dirty_log *log) +{ + struct core_log *clog = (struct core_log *) log->context; + vfree(clog->clean_bits); + vfree(clog->sync_bits); + vfree(clog->recovering_bits); + kfree(clog); +} + +static sector_t core_get_region_size(struct dirty_log *log) +{ + struct core_log *clog = (struct core_log *) log->context; + return clog->region_size; +} + +static int core_is_clean(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + return test_bit(region, clog->clean_bits); +} + +static int core_in_sync(struct dirty_log *log, region_t region, int block) +{ + struct core_log *clog = (struct core_log *) log->context; + + return test_bit(region, clog->sync_bits) ?
1 : 0; +} + +static int core_flush(struct dirty_log *log) +{ + /* no op */ + return 0; +} + +static void core_mark_region(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + clear_bit(region, clog->clean_bits); +} + +static void core_clear_region(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + set_bit(region, clog->clean_bits); +} + +static int core_get_resync_work(struct dirty_log *log, region_t *region) +{ + struct core_log *clog = (struct core_log *) log->context; + + if (clog->sync_search >= clog->region_count) + return 0; + + do { + *region = find_next_zero_bit(clog->sync_bits, + clog->region_count, + clog->sync_search); + clog->sync_search = *region + 1; + + if (*region == clog->region_count) + return 0; + + } while (test_bit(*region, clog->recovering_bits)); + + set_bit(*region, clog->recovering_bits); + return 1; +} + +static void core_complete_resync_work(struct dirty_log *log, region_t region, + int success) +{ + struct core_log *clog = (struct core_log *) log->context; + + clear_bit(region, clog->recovering_bits); + if (success) + set_bit(region, clog->sync_bits); +} + +static struct dirty_log_type _core_type = { + .name = "core", + + .ctr = core_ctr, + .dtr = core_dtr, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = core_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .complete_resync_work = core_complete_resync_work +}; + +__init int dm_dirty_log_init(void) +{ + int r; + + r = dm_register_dirty_log_type(&_core_type); + if (r) + DMWARN("couldn't register core log"); + + return r; +} + +void dm_dirty_log_exit(void) +{ + dm_unregister_dirty_log_type(&_core_type); +} + +EXPORT_SYMBOL(dm_register_dirty_log_type); +EXPORT_SYMBOL(dm_unregister_dirty_log_type); +EXPORT_SYMBOL(dm_dirty_log_init); +EXPORT_SYMBOL(dm_dirty_log_exit); +EXPORT_SYMBOL(dm_create_dirty_log); +EXPORT_SYMBOL(dm_destroy_dirty_log); --- diff/drivers/md/dm-log.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-log.h 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DIRTY_LOG +#define DM_DIRTY_LOG + +#include "dm.h" + +typedef sector_t region_t; + +struct dirty_log_type; + +struct dirty_log { + struct dirty_log_type *type; + void *context; +}; + +struct dirty_log_type { + struct list_head list; + const char *name; + struct module *module; + unsigned int use_count; + + int (*ctr)(struct dirty_log *log, sector_t dev_size, + unsigned int argc, char **argv); + void (*dtr)(struct dirty_log *log); + + /* + * Retrieves the smallest size of region that the log can + * deal with. + */ + sector_t (*get_region_size)(struct dirty_log *log); + + /* + * A predicate to say whether a region is clean or not. + * May block. + */ + int (*is_clean)(struct dirty_log *log, region_t region); + + /* + * Returns: 0, 1, -EWOULDBLOCK, < 0 + * + * A predicate function to check the area given by + * [sector, sector + len) is in sync. + * + * If -EWOULDBLOCK is returned the state of the region is + * unknown, typically this will result in a read being + * passed to a daemon to deal with, since a daemon is + * allowed to block. + */ + int (*in_sync)(struct dirty_log *log, region_t region, int can_block); + + /* + * Flush the current log state (eg, to disk). 
This + * function may block. + */ + int (*flush)(struct dirty_log *log); + + /* + * Mark an area as clean or dirty. These functions may + * block, though for performance reasons blocking should + * be extremely rare (eg, allocating another chunk of + * memory for some reason). + */ + void (*mark_region)(struct dirty_log *log, region_t region); + void (*clear_region)(struct dirty_log *log, region_t region); + + /* + * Returns: <0 (error), 0 (no region), 1 (region) + * + * The mirrord will need to perform recovery on regions of + * the mirror that are in the NOSYNC state. This + * function asks the log to tell the caller about the + * next region that this machine should recover. + * + * Do not confuse this function with 'in_sync()'; one + * tells you whether an area is synchronised, the other + * assigns recovery work. + */ + int (*get_resync_work)(struct dirty_log *log, region_t *region); + + /* + * This notifies the log that the resync of an area has + * been completed. The log should then mark this region + * as CLEAN. + */ + void (*complete_resync_work)(struct dirty_log *log, + region_t region, int success); +}; + +int dm_register_dirty_log_type(struct dirty_log_type *type); +int dm_unregister_dirty_log_type(struct dirty_log_type *type); + + +/* + * Make sure you use these two functions, rather than calling + * type->constructor/destructor() directly. + */ +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, + unsigned int argc, char **argv); +void dm_destroy_dirty_log(struct dirty_log *log); + +/* + * init/exit functions. + */ +int dm_dirty_log_init(void); +void dm_dirty_log_exit(void); + +#endif --- diff/drivers/md/dm-raid1.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-raid1.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,1300 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-daemon.h" +#include "dm-io.h" +#include "dm-log.h" +#include "kcopyd.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct dm_daemon _kmirrord; + +/*----------------------------------------------------------------- + * buffer lists: + * + * We play with singly linked lists of bios, but we want to be + * careful to add new bios to the back of the list, to avoid + * buffers being starved of attention. + *---------------------------------------------------------------*/ +struct bio_list { + struct bio *head; + struct bio *tail; +}; + +static inline void bio_list_init(struct bio_list *bl) +{ + bl->head = bl->tail = NULL; +} + +static inline void bio_list_add(struct bio_list *bl, struct bio *bio) +{ + bio->bi_next = NULL; + + if (bl->tail) { + bl->tail->bi_next = bio; + bl->tail = bio; + } else + bl->head = bl->tail = bio; +} + +static struct bio *bio_list_pop(struct bio_list *bl) +{ + struct bio *bio = bl->head; + + if (bio) { + bl->head = bl->head->bi_next; + if (!bl->head) + bl->tail = NULL; + + bio->bi_next = NULL; + } + + return bio; +} + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table, a region _may_ + * be present on one of three lists.
+ * + * clean_regions: Regions on this list have no io pending to + * them; they are in sync, we are no longer interested in them, + * they are dull. rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are two locks: + * A rw spin lock 'hash_lock' protects just the hash table; + * it is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' protects the three + * lists in the region_hash, together with the 'state', 'list' + * and 'delayed_bios' fields of the regions. This is used from + * irq context, so all other uses will have to suspend local + * irqs. + *---------------------------------------------------------------*/ +struct mirror_set; +struct region_hash { + struct mirror_set *ms; + sector_t region_size; + unsigned region_shift; + + /* holds persistent region state */ + struct dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned int mask; + unsigned int nr_buckets; + struct list_head *buckets; + + spinlock_t region_lock; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; +}; + +enum { + RH_CLEAN, + RH_DIRTY, + RH_NOSYNC, + RH_RECOVERING +}; + +struct region { + struct region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct bio *delayed_bios; +}; + +/* + * Conversion fns + */ +static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) +{ + return bio->bi_sector >> rh->region_shift; +} + +static inline sector_t region_to_sector(struct region_hash *rh, region_t region) +{ + return region << rh->region_shift; +} + +/* FIXME move this */ +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); + +static void *region_alloc(int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct region), gfp_mask); +} + +static void region_free(void *element, void *pool_data) +{ + kfree(element); +} + +#define MIN_REGIONS 64 +#define MAX_RECOVERY 1 +static int rh_init(struct region_hash *rh, struct mirror_set *ms, + struct dirty_log *log, sector_t region_size, + region_t nr_regions) +{ + unsigned int nr_buckets, max_buckets; + size_t i; + + /* + * Calculate a suitable number of buckets for our hash + * table. 
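+ * We aim for roughly one bucket per 64 regions, rounded down + * to a power of two, with a floor of 64 buckets.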
+ */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh->ms = ms; + rh->log = log; + rh->region_size = region_size; + rh->region_shift = ffs(region_size) - 1; /* region_size is a power of 2 */ + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash memory"); + return -ENOMEM; + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + + rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, + region_free, NULL); + if (!rh->region_pool) { + vfree(rh->buckets); + rh->buckets = NULL; + return -ENOMEM; + } + + return 0; +} + +static void rh_exit(struct region_hash *rh) +{ + unsigned int h; + struct region *reg; + struct list_head *tmp, *tmp2; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_safe (tmp, tmp2, rh->buckets + h) { + reg = list_entry(tmp, struct region, hash_list); + BUG_ON(atomic_read(&reg->pending)); + mempool_free(reg, rh->region_pool); + } + } + + if (rh->log) + dm_destroy_dirty_log(rh->log); + if (rh->region_pool) + mempool_destroy(rh->region_pool); + vfree(rh->buckets); +} + +#define RH_HASH_MULT 2654435387U + +static inline unsigned int rh_hash(struct region_hash *rh, region_t region) +{ + return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; +} + +static struct region *__rh_lookup(struct region_hash *rh, region_t region) +{ + struct region *reg; + + list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct region_hash *rh, struct region *reg) +{ + unsigned int h = rh_hash(rh, reg->key); + list_add(&reg->hash_list, rh->buckets + h); +} + +static struct region *__rh_alloc(struct region_hash *rh, region_t region) +{ + struct region *reg, *nreg; + + read_unlock(&rh->hash_lock); + nreg = mempool_alloc(rh->region_pool, GFP_NOIO); + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + RH_CLEAN : RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + + INIT_LIST_HEAD(&nreg->list); + + atomic_set(&nreg->pending, 0); + nreg->delayed_bios = NULL; + write_lock_irq(&rh->hash_lock); + + reg = __rh_lookup(rh, region); + if (reg) + /* we lost the race */ + mempool_free(nreg, rh->region_pool); + + else { + __rh_insert(rh, nreg); + if (nreg->state == RH_CLEAN) { + spin_lock_irq(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock_irq(&rh->region_lock); + } + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + read_lock(&rh->hash_lock); + + return reg; +} + +static inline struct region *__rh_find(struct region_hash *rh, region_t region) +{ + struct region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) + reg = __rh_alloc(rh, region); + + return reg; +} + +static int rh_state(struct region_hash *rh, region_t region, int may_block) +{ + int r; + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (e.g. 
-EWOULDBLOCK) is + * treated as RH_NOSYNC. + */ + return r == 1 ? RH_CLEAN : RH_NOSYNC; +} + +static inline int rh_in_sync(struct region_hash *rh, + region_t region, int may_block) +{ + int state = rh_state(rh, region, may_block); + return state == RH_CLEAN || state == RH_DIRTY; +} + +static void dispatch_bios(struct mirror_set *ms, struct bio *bio) +{ + struct bio *nbio; + + while (bio) { + nbio = bio->bi_next; + queue_bio(ms, bio, WRITE); + bio = nbio; + } +} + +static void rh_update_states(struct region_hash *rh) +{ + struct list_head *tmp, *tmp2; + struct region *reg; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice(&rh->clean_regions, &clean); + INIT_LIST_HEAD(&rh->clean_regions); + + list_for_each_entry (reg, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + list_del(&reg->hash_list); + } + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice(&rh->recovered_regions, &recovered); + INIT_LIST_HEAD(&rh->recovered_regions); + + list_for_each_entry (reg, &recovered, list) + list_del(&reg->hash_list); + } + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_safe (tmp, tmp2, &recovered) { + reg = list_entry(tmp, struct region, list); + + rh->log->type->complete_resync_work(rh->log, reg->key, 1); + dispatch_bios(rh->ms, reg->delayed_bios); + up(&rh->recovery_count); + mempool_free(reg, rh->region_pool); + } + + list_for_each_safe (tmp, tmp2, &clean) { + reg = list_entry(tmp, struct region, list); + mempool_free(reg, rh->region_pool); + } +} + +static void rh_inc(struct region_hash *rh, region_t region) +{ + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + if (reg->state == RH_CLEAN) { + rh->log->type->mark_region(rh->log, reg->key); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_DIRTY; + list_del_init(&reg->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + } + + atomic_inc(&reg->pending); + read_unlock(&rh->hash_lock); +} + +static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios) +{ + struct bio *bio; + + for (bio = bios->head; bio; bio = bio->bi_next) + rh_inc(rh, bio_to_region(rh, bio)); +} + +static void rh_dec(struct region_hash *rh, region_t region) +{ + unsigned long flags; + struct region *reg; + int wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (atomic_dec_and_test(&reg->pending)) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == RH_RECOVERING) { + list_add_tail(&reg->list, &rh->quiesced_regions); + } else { + reg->state = RH_CLEAN; + list_add(&reg->list, &rh->clean_regions); + } + spin_unlock_irqrestore(&rh->region_lock, flags); + wake = 1; + } + + if (wake) + dm_daemon_wake(&_kmirrord); +} + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct region_hash *rh) +{ + int r; + struct region *reg; + region_t region; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, &region); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. 
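+ * If the region is already idle we move it onto the quiesced + * list below; otherwise rh_dec() will do so once its pending + * count reaches zero.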
+ */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_RECOVERING; + + /* Already quiesced ? */ + if (atomic_read(&reg->pending)) + list_del_init(&reg->list); + + else { + list_del_init(&reg->list); + list_add(&reg->list, &rh->quiesced_regions); + } + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +static void rh_recovery_prepare(struct region_hash *rh) +{ + while (!down_trylock(&rh->recovery_count)) + if (__rh_recovery_prepare(rh) <= 0) { + up(&rh->recovery_count); + break; + } +} + +/* + * Returns the next quiesced region, if any. + */ +static struct region *rh_recovery_start(struct region_hash *rh) +{ + struct region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct region, list); + list_del_init(&reg->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} + +/* FIXME: success ignored for now */ +static void rh_recovery_end(struct region *reg, int success) +{ + struct region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + list_add(&reg->list, &reg->rh->recovered_regions); + spin_unlock_irq(&rh->region_lock); + + dm_daemon_wake(&_kmirrord); +} + +static void rh_flush(struct region_hash *rh) +{ + rh->log->type->flush(rh->log); +} + +static void rh_delay(struct region_hash *rh, struct bio *bio) +{ + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, bio_to_region(rh, bio)); + bio->bi_next = reg->delayed_bios; + reg->delayed_bios = bio; + read_unlock(&rh->hash_lock); +} + +static void rh_stop_recovery(struct region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < MAX_RECOVERY; i++) + down(&rh->recovery_count); +} + +static void rh_start_recovery(struct region_hash *rh) +{ + int i; + + for (i = 0; i < MAX_RECOVERY; i++) + up(&rh->recovery_count); + + dm_daemon_wake(&_kmirrord); +} + +/*----------------------------------------------------------------- + * Mirror set structures. + *---------------------------------------------------------------*/ +struct mirror { + atomic_t error_count; + struct dm_dev *dev; + sector_t offset; +}; + +struct mirror_set { + struct dm_target *ti; + struct list_head list; + struct region_hash rh; + struct kcopyd_client *kcopyd_client; + + spinlock_t lock; /* protects the next two lists */ + struct bio_list reads; + struct bio_list writes; + + /* recovery */ + region_t nr_regions; + region_t sync_count; + + unsigned int nr_mirrors; + struct mirror mirror[0]; +}; + +/* + * Every mirror should look like this one. + */ +#define DEFAULT_MIRROR 0 + +/* + * This is yucky. We squirrel the mirror_set struct away inside + * bi_next for write buffers. This is safe since the bio + * doesn't get submitted to the lower levels of the block layer. + */ +static struct mirror_set *bio_get_ms(struct bio *bio) +{ + return (struct mirror_set *) bio->bi_next; +} + +static void bio_set_ms(struct bio *bio, struct mirror_set *ms) +{ + bio->bi_next = (struct bio *) ms; +} + +/*----------------------------------------------------------------- + * Recovery. + * + * When a mirror is first activated we may find that some regions + * are in the no-sync state. We have to recover these by + * recopying from the default mirror to all the others. 
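+ * + * The sequence is: rh_recovery_prepare() asks the dirty log for + * work and quiesces the region; rh_recovery_start() hands the + * quiesced regions to recover(), which builds a kcopyd request + * from the default mirror to the others; recovery_complete() + * then calls rh_recovery_end(), and rh_update_states() finishes + * the bookkeeping.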
+ *---------------------------------------------------------------*/ +static void recovery_complete(int read_err, unsigned int write_err, + void *context) +{ + struct region *reg = (struct region *) context; + struct mirror_set *ms = reg->rh->ms; + + /* FIXME: better error handling */ + rh_recovery_end(reg, read_err || write_err); + if (++ms->sync_count == ms->nr_regions) + /* the sync is complete */ + dm_table_event(ms->ti->table); +} + +static int recover(struct mirror_set *ms, struct region *reg) +{ + int r; + unsigned int i; + struct io_region from, to[ms->nr_mirrors - 1], *dest; + struct mirror *m; + unsigned long flags = 0; + + /* fill in the source */ + m = ms->mirror + DEFAULT_MIRROR; + from.bdev = m->dev->bdev; + from.sector = m->offset + region_to_sector(reg->rh, reg->key); + if (reg->key == (ms->nr_regions - 1)) { + /* + * The final region may be smaller than + * region_size. + */ + from.count = ms->ti->len & (reg->rh->region_size - 1); + if (!from.count) + from.count = reg->rh->region_size; + } else + from.count = reg->rh->region_size; + + /* fill in the destinations */ + for (i = 1; i < ms->nr_mirrors; i++) { + m = ms->mirror + i; + dest = to + (i - 1); + + dest->bdev = m->dev->bdev; + dest->sector = m->offset + region_to_sector(reg->rh, reg->key); + dest->count = from.count; + } + + /* hand to kcopyd */ + set_bit(KCOPYD_IGNORE_ERROR, &flags); + r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, + recovery_complete, reg); + + return r; +} + +static void do_recovery(struct mirror_set *ms) +{ + int r; + struct region *reg; + + /* + * Start quiescing some regions. + */ + rh_recovery_prepare(&ms->rh); + + /* + * Copy any already quiesced regions. + */ + while ((reg = rh_recovery_start(&ms->rh))) { + r = recover(ms, reg); + if (r) + rh_recovery_end(reg, 0); + } +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +{ + /* FIXME: add read balancing */ + return ms->mirror + DEFAULT_MIRROR; +} + +/* + * remap a buffer to a particular mirror. + */ +static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio) +{ + bio->bi_bdev = m->dev->bdev; + bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin); +} + +static void do_reads(struct mirror_set *ms, struct bio_list *reads) +{ + region_t region; + struct bio *bio; + struct mirror *m; + + while ((bio = bio_list_pop(reads))) { + region = bio_to_region(&ms->rh, bio); + + /* + * We can only read balance if the region is in sync. + */ + if (rh_in_sync(&ms->rh, region, 0)) + m = choose_mirror(ms, bio->bi_sector); + else + m = ms->mirror + DEFAULT_MIRROR; + + map_bio(ms, m, bio); + generic_make_request(bio); + } +} + +/*----------------------------------------------------------------- + * Writes. 
+ * + * We do different things with the write io depending on the + * state of the region that it's in: + * + * SYNC: increment pending, use kcopyd to write to *all* mirrors + * RECOVERING: delay the io until recovery completes + * NOSYNC: increment pending, just write to the default mirror + *---------------------------------------------------------------*/ +static void write_callback(unsigned long error, void *context) +{ + unsigned int i; + int uptodate = 1; + struct bio *bio = (struct bio *) context; + struct mirror_set *ms; + + ms = bio_get_ms(bio); + bio_set_ms(bio, NULL); + + /* + * NOTE: We don't decrement the pending count here, + * instead it is done by the targets endio function. + * This way we handle both writes to SYNC and NOSYNC + * regions with the same code. + */ + + if (error) { + /* + * only error the io if all mirrors failed. + * FIXME: bogus + */ + uptodate = 0; + for (i = 0; i < ms->nr_mirrors; i++) + if (!test_bit(i, &error)) { + uptodate = 1; + break; + } + } + bio_endio(bio, bio->bi_size, 0); +} + +static void do_write(struct mirror_set *ms, struct bio *bio) +{ + unsigned int i; + struct io_region io[ms->nr_mirrors]; + struct mirror *m; + + for (i = 0; i < ms->nr_mirrors; i++) { + m = ms->mirror + i; + + io[i].bdev = m->dev->bdev; + io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin); + io[i].count = bio->bi_size >> 9; + } + + bio_set_ms(bio, ms); + dm_io_async_bvec(ms->nr_mirrors, io, WRITE, + bio->bi_io_vec + bio->bi_idx, + write_callback, bio); +} + +static void do_writes(struct mirror_set *ms, struct bio_list *writes) +{ + int state; + struct bio *bio; + struct bio_list sync, nosync, recover, *this_list = NULL; + + if (!writes->head) + return; + + /* + * Classify each write. + */ + bio_list_init(&sync); + bio_list_init(&nosync); + bio_list_init(&recover); + + while ((bio = bio_list_pop(writes))) { + state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); + switch (state) { + case RH_CLEAN: + case RH_DIRTY: + this_list = &sync; + break; + + case RH_NOSYNC: + this_list = &nosync; + break; + + case RH_RECOVERING: + this_list = &recover; + break; + } + + bio_list_add(this_list, bio); + } + + /* + * Increment the pending counts for any regions that will + * be written to (writes to recover regions are going to + * be delayed). + */ + rh_inc_pending(&ms->rh, &sync); + rh_inc_pending(&ms->rh, &nosync); + rh_flush(&ms->rh); + + /* + * Dispatch io. 
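+ * Writes to in-sync regions go to every mirror via dm-io, + * writes to recovering regions are delayed with rh_delay(), and + * writes to nosync regions go straight to the default mirror.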
+ */ + while ((bio = bio_list_pop(&sync))) + do_write(ms, bio); + + while ((bio = bio_list_pop(&recover))) + rh_delay(&ms->rh, bio); + + while ((bio = bio_list_pop(&nosync))) { + map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); + generic_make_request(bio); + } +} + +/*----------------------------------------------------------------- + * kmirrord + *---------------------------------------------------------------*/ +static LIST_HEAD(_mirror_sets); +static DECLARE_RWSEM(_mirror_sets_lock); + +static void do_mirror(struct mirror_set *ms) +{ + struct bio_list reads, writes; + + spin_lock(&ms->lock); + memcpy(&reads, &ms->reads, sizeof(reads)); + bio_list_init(&ms->reads); + memcpy(&writes, &ms->writes, sizeof(writes)); + bio_list_init(&ms->writes); + spin_unlock(&ms->lock); + + rh_update_states(&ms->rh); + do_recovery(ms); + do_reads(ms, &reads); + do_writes(ms, &writes); + blk_run_queues(); +} + +static void do_work(void) +{ + struct mirror_set *ms; + + down_read(&_mirror_sets_lock); + list_for_each_entry (ms, &_mirror_sets, list) + do_mirror(ms); + up_read(&_mirror_sets_lock); +} + +/*----------------------------------------------------------------- + * Target functions + *---------------------------------------------------------------*/ +static struct mirror_set *alloc_context(unsigned int nr_mirrors, + sector_t region_size, + struct dm_target *ti, + struct dirty_log *dl) +{ + size_t len; + struct mirror_set *ms = NULL; + + if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) + return NULL; + + len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); + + ms = kmalloc(len, GFP_KERNEL); + if (!ms) { + ti->error = "dm-mirror: Cannot allocate mirror context"; + return NULL; + } + + memset(ms, 0, len); + spin_lock_init(&ms->lock); + + ms->ti = ti; + ms->nr_mirrors = nr_mirrors; + ms->nr_regions = dm_div_up(ti->len, region_size); + + if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { + ti->error = "dm-mirror: Error creating dirty region hash"; + kfree(ms); + return NULL; + } + + return ms; +} + +static void free_context(struct mirror_set *ms, struct dm_target *ti, + unsigned int m) +{ + while (m--) + dm_put_device(ti, ms->mirror[m].dev); + + rh_exit(&ms->rh); + kfree(ms); +} + +static inline int _check_region_size(struct dm_target *ti, sector_t size) +{ + return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || + size > ti->len); +} + +static int get_mirror(struct mirror_set *ms, struct dm_target *ti, + unsigned int mirror, char **argv) +{ + sector_t offset; + + if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { + ti->error = "dm-mirror: Invalid offset"; + return -EINVAL; + } + + if (dm_get_device(ti, argv[0], offset, ti->len, + dm_table_get_mode(ti->table), + &ms->mirror[mirror].dev)) { + ti->error = "dm-mirror: Device lookup failure"; + return -ENXIO; + } + + ms->mirror[mirror].offset = offset; + + return 0; +} + +static int add_mirror_set(struct mirror_set *ms) +{ + down_write(&_mirror_sets_lock); + list_add_tail(&ms->list, &_mirror_sets); + up_write(&_mirror_sets_lock); + dm_daemon_wake(&_kmirrord); + + return 0; +} + +static void del_mirror_set(struct mirror_set *ms) +{ + down_write(&_mirror_sets_lock); + list_del(&ms->list); + up_write(&_mirror_sets_lock); +} + +/* + * Create dirty log: log_type #log_params + */ +static struct dirty_log *create_dirty_log(struct dm_target *ti, + unsigned int argc, char **argv, + unsigned int *args_used) +{ + unsigned int param_count; + struct dirty_log *dl; + + if (argc < 2) { + ti->error = "dm-mirror: Insufficient mirror log 
arguments"; + return NULL; + } + + if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) { + ti->error = "dm-mirror: Invalid mirror log argument count"; + return NULL; + } + + *args_used = 2 + param_count; + + if (argc < *args_used) { + ti->error = "dm-mirror: Insufficient mirror log arguments"; + return NULL; + } + + dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2); + if (!dl) { + ti->error = "dm-mirror: Error creating mirror dirty log"; + return NULL; + } + + if (!_check_region_size(ti, dl->type->get_region_size(dl))) { + ti->error = "dm-mirror: Invalid region size"; + dm_destroy_dirty_log(dl); + return NULL; + } + + return dl; +} + +/* + * Construct a mirror mapping: + * + * log_type #log_params + * #mirrors [mirror_path offset]{2,} + * + * For now, #log_params = 1, log_type = "core" + * + */ +#define DM_IO_PAGES 64 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned int nr_mirrors, m, args_used; + struct mirror_set *ms; + struct dirty_log *dl; + + dl = create_dirty_log(ti, argc, argv, &args_used); + if (!dl) + return -EINVAL; + + argv += args_used; + argc -= args_used; + + if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || + nr_mirrors < 2) { + ti->error = "dm-mirror: Invalid number of mirrors"; + dm_destroy_dirty_log(dl); + return -EINVAL; + } + + argv++, argc--; + + if (argc != nr_mirrors * 2) { + ti->error = "dm-mirror: Wrong number of mirror arguments"; + dm_destroy_dirty_log(dl); + return -EINVAL; + } + + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); + if (!ms) { + dm_destroy_dirty_log(dl); + return -ENOMEM; + } + + /* Get the mirror parameter sets */ + for (m = 0; m < nr_mirrors; m++) { + r = get_mirror(ms, ti, m, argv); + if (r) { + free_context(ms, ti, m); + return r; + } + argv += 2; + argc -= 2; + } + + ti->private = ms; + + r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); + if (r) { + free_context(ms, ti, ms->nr_mirrors); + return r; + } + + add_mirror_set(ms); + return 0; +} + +static void mirror_dtr(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + + del_mirror_set(ms); + kcopyd_client_destroy(ms->kcopyd_client); + free_context(ms, ti, ms->nr_mirrors); +} + +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) +{ + int wake = 0; + struct bio_list *bl; + + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock(&ms->lock); + wake = !(bl->head); + bio_list_add(bl, bio); + spin_unlock(&ms->lock); + + if (wake) + dm_daemon_wake(&_kmirrord); +} + +/* + * Mirror mapping function + */ +static int mirror_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + int r, rw = bio_rw(bio); + struct mirror *m; + struct mirror_set *ms = ti->private; + + map_context->ll = bio->bi_sector >> ms->rh.region_shift; + + if (rw == WRITE) { + queue_bio(ms, bio, rw); + return 0; + } + + r = ms->rh.log->type->in_sync(ms->rh.log, + bio_to_region(&ms->rh, bio), 0); + if (r < 0 && r != -EWOULDBLOCK) + return r; + + if (r == -EWOULDBLOCK) /* FIXME: ugly */ + r = 0; + + /* + * We don't want to fast track a recovery just for a read + * ahead. So we just let it silently fail. + * FIXME: get rid of this. 
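+ * (Failing a readahead is harmless; the caller will fall back + * to a normal read if it really needs the data.)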
+ */ + if (!r && rw == READA) + return -EIO; + + if (!r) { + /* Pass this io over to the daemon */ + queue_bio(ms, bio, rw); + return 0; + } + + m = choose_mirror(ms, bio->bi_sector); + if (!m) + return -EIO; + + map_bio(ms, m, bio); + return 1; +} + +static int mirror_end_io(struct dm_target *ti, struct bio *bio, + int error, union map_info *map_context) +{ + int rw = bio_rw(bio); + struct mirror_set *ms = (struct mirror_set *) ti->private; + region_t region = map_context->ll; + + /* + * We need to dec pending if this was a write. + */ + if (rw == WRITE) + rh_dec(&ms->rh, region); + + return 0; +} + +static void mirror_suspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + rh_stop_recovery(&ms->rh); +} + +static void mirror_resume(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + rh_start_recovery(&ms->rh); +} + +static int mirror_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + char buffer[32]; + unsigned int m, sz = 0; + struct mirror_set *ms = (struct mirror_set *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors); + + for (m = 0; m < ms->nr_mirrors; m++) { + format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); + sz += snprintf(result + sz, maxlen - sz, "%s ", buffer); + } + + sz += snprintf(result + sz, maxlen - sz, + SECTOR_FORMAT "/" SECTOR_FORMAT, + ms->sync_count, ms->nr_regions); + break; + + case STATUSTYPE_TABLE: + sz += snprintf(result + sz, maxlen - sz, + "%s 1 " SECTOR_FORMAT " %d ", + ms->rh.log->type->name, ms->rh.region_size, + ms->nr_mirrors); + + for (m = 0; m < ms->nr_mirrors; m++) { + format_dev_t(buffer, ms->mirror[m].dev->bdev->bd_dev); + sz += snprintf(result + sz, maxlen - sz, + "%s " SECTOR_FORMAT " ", + buffer, ms->mirror[m].offset); + } + } + + return 0; +} + +static struct target_type mirror_target = { + .name = "mirror", + .module = THIS_MODULE, + .ctr = mirror_ctr, + .dtr = mirror_dtr, + .map = mirror_map, + .end_io = mirror_end_io, + .suspend = mirror_suspend, + .resume = mirror_resume, + .status = mirror_status, +}; + +static int __init dm_mirror_init(void) +{ + int r; + + r = dm_dirty_log_init(); + if (r) + return r; + + r = dm_daemon_start(&_kmirrord, "kmirrord", do_work); + if (r) { + DMERR("couldn't start kmirrord"); + dm_dirty_log_exit(); + return r; + } + + r = dm_register_target(&mirror_target); + if (r < 0) { + DMERR("%s: Failed to register mirror target", + mirror_target.name); + dm_dirty_log_exit(); + dm_daemon_stop(&_kmirrord); + } + + return r; +} + +static void __exit dm_mirror_exit(void) +{ + int r; + + r = dm_unregister_target(&mirror_target); + if (r < 0) + DMERR("%s: unregister failed %d", mirror_target.name, r); + + dm_daemon_stop(&_kmirrord); + dm_dirty_log_exit(); +} + +/* Module hooks */ +module_init(dm_mirror_init); +module_exit(dm_mirror_exit); + +MODULE_DESCRIPTION(DM_NAME " mirror target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); --- diff/drivers/md/dm-snapshot.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-snapshot.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,1269 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. 
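+ * + * An illustrative table line (device names arbitrary): + * + * 0 <len> snapshot /dev/vg/origin /dev/vg/cow P 16 + * + * i.e. <origin_dev> <COW-dev> <p/n> <chunk-size>, with the + * chunk size given in sectors (16 sectors = 8K).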
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-snapshot.h" +#include "kcopyd.h" + +/* + * FIXME: Remove this before release. + */ +#if 0 +#define DMDEBUG DMWARN +#else +#define DMDEBUG(x...) +#endif + +/* + * The percentage increment we will wake up users at + */ +#define WAKE_UP_PERCENT 5 + +/* + * kcopyd priority of snapshot operations + */ +#define SNAPSHOT_COPY_PRIORITY 2 + +/* + * Each snapshot reserves this many pages for io + * FIXME: calculate this + */ +#define SNAPSHOT_PAGES 256 + +struct pending_exception { + struct exception e; + + /* + * Origin buffers waiting for this to complete are held + * in a list (using b_reqnext). + */ + struct bio *origin_bios; + struct bio *snapshot_bios; + + /* + * Other pending_exceptions that are processing this + * chunk. When this list is empty, we know we can + * complete the origins. + */ + struct list_head siblings; + + /* Pointer back to snapshot context */ + struct dm_snapshot *snap; + + /* + * 1 indicates the exception has already been sent to + * kcopyd. + */ + int started; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static kmem_cache_t *exception_cache; +static kmem_cache_t *pending_cache; +static mempool_t *pending_pool; + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { + /* The origin device */ + struct block_device *bdev; + + struct list_head hash_list; + + /* List of snapshots for this origin */ + struct list_head snapshots; +}; + +/* + * Size of the hash table for origin volumes. If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; + +static int init_origin_hash(void) +{ + int i; + + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("Device mapper: Snapshot: unable to allocate memory"); + return -ENOMEM; + } + + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + init_rwsem(&_origins_lock); + + return 0; +} + +static void exit_origin_hash(void) +{ + kfree(_origins); +} + +static inline unsigned int origin_hash(struct block_device *bdev) +{ + return bdev->bd_dev & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(struct block_device *origin) +{ + struct list_head *slist; + struct list_head *ol; + struct origin *o; + + ol = &_origins[origin_hash(origin)]; + list_for_each(slist, ol) { + o = list_entry(slist, struct origin, hash_list); + + if (bdev_equal(o->bdev, origin)) + return o; + } + + return NULL; +} + +static void __insert_origin(struct origin *o) +{ + struct list_head *sl = &_origins[origin_hash(o->bdev)]; + list_add_tail(&o->hash_list, sl); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. 
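+ * Origins are hashed on the low eight bits of their bd_dev, and + * each origin carries the list of snapshots stacked on it.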
+ */ +static int register_snapshot(struct dm_snapshot *snap) +{ + struct origin *o; + struct block_device *bdev = snap->origin->bdev; + + down_write(&_origins_lock); + o = __lookup_origin(bdev); + + if (!o) { + /* New origin */ + o = kmalloc(sizeof(*o), GFP_KERNEL); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } + + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->bdev = bdev; + + __insert_origin(o); + } + + list_add_tail(&snap->list, &o->snapshots); + + up_write(&_origins_lock); + return 0; +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(s->origin->bdev); + + list_del(&s->list); + if (list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } + + up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. + */ +static int init_exception_table(struct exception_table *et, uint32_t size) +{ + unsigned int i; + + et->hash_mask = size - 1; + et->table = dm_vcalloc(size, sizeof(struct list_head)); + if (!et->table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(et->table + i); + + return 0; +} + +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +{ + struct list_head *slot, *entry, *temp; + struct exception *ex; + int i, size; + + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; + + list_for_each_safe(entry, temp, slot) { + ex = list_entry(entry, struct exception, hash_list); + kmem_cache_free(mem, ex); + } + } + + vfree(et->table); +} + +/* + * FIXME: check how this hash fn is performing. + */ +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +{ + return chunk & et->hash_mask; +} + +static void insert_exception(struct exception_table *eh, struct exception *e) +{ + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; + list_add(&e->hash_list, l); +} + +static inline void remove_exception(struct exception *e) +{ + list_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. + */ +static struct exception *lookup_exception(struct exception_table *et, + chunk_t chunk) +{ + struct list_head *slot, *el; + struct exception *e; + + slot = &et->table[exception_hash(et, chunk)]; + list_for_each(el, slot) { + e = list_entry(el, struct exception, hash_list); + if (e->old_chunk == chunk) + return e; + } + + return NULL; +} + +static inline struct exception *alloc_exception(void) +{ + struct exception *e; + + e = kmem_cache_alloc(exception_cache, GFP_NOIO); + if (!e) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static inline void free_exception(struct exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static inline struct pending_exception *alloc_pending_exception(void) +{ + return mempool_alloc(pending_pool, GFP_NOIO); +} + +static inline void free_pending_exception(struct pending_exception *pe) +{ + mempool_free(pe, pending_pool); +} + +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) +{ + struct exception *e; + + e = alloc_exception(); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + e->new_chunk = new; + insert_exception(&s->complete, e); + return 0; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 50; + mem /= sizeof(struct list_head); + + return mem; +} + +/* + * Rounds a number down to a power of 2. 
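+ * e.g. round_down(1000) == 512.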
+ */ +static inline uint32_t round_down(uint32_t n) +{ + while (n & (n - 1)) + n &= (n - 1); + return n; +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct dm_snapshot *s) +{ + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + + /* + * Calculate based on the size of the original volume or + * the COW volume... + */ + cow_dev_size = get_dev_size(s->cow->bdev); + origin_dev_size = get_dev_size(s->origin->bdev); + max_buckets = calc_max_buckets(); + + hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; + hash_size = min(hash_size, max_buckets); + + /* Round it down to a power of 2 */ + hash_size = round_down(hash_size); + if (init_exception_table(&s->complete, hash_size)) + return -ENOMEM; + + /* + * Allocate hash table for in-flight exceptions + * Make this smaller than the real hash table + */ + hash_size >>= 3; + if (!hash_size) + hash_size = 64; + + if (init_exception_table(&s->pending, hash_size)) { + exit_exception_table(&s->complete, exception_cache); + return -ENOMEM; + } + + return 0; +} + +/* + * Round a number up to the nearest 'size' boundary. size must + * be a power of 2. + */ +static inline ulong round_up(ulong n, ulong size) +{ + size--; + return (n + size) & ~size; +} + +/* + * Construct a snapshot mapping:
+ * <origin_dev> <COW-dev> <p/n> <chunk-size>
+ */ +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_snapshot *s; + unsigned long chunk_size; + int r = -EINVAL; + char persistent; + char *origin_path; + char *cow_path; + char *value; + int blocksize; + + if (argc < 4) { + ti->error = "dm-snapshot: requires exactly 4 arguments"; + r = -EINVAL; + goto bad1; + } + + origin_path = argv[0]; + cow_path = argv[1]; + persistent = toupper(*argv[2]); + + if (persistent != 'P' && persistent != 'N') { + ti->error = "Persistent flag is not P or N"; + r = -EINVAL; + goto bad1; + } + + chunk_size = simple_strtoul(argv[3], &value, 10); + if (chunk_size == 0 || value == NULL) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad1; + } + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) { + ti->error = "Cannot allocate snapshot context private " + "structure"; + r = -ENOMEM; + goto bad1; + } + + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad2; + } + + /* FIXME: get cow length */ + r = dm_get_device(ti, cow_path, 0, 0, + FMODE_READ | FMODE_WRITE, &s->cow); + if (r) { + dm_put_device(ti, s->origin); + ti->error = "Cannot get COW device"; + goto bad2; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); + + /* Validate the chunk size against the device block size */ + /* FIXME: check this, also ugly */ + blocksize = s->cow->bdev->bd_disk->queue->hardsect_size; + if (chunk_size % (blocksize >> 9)) { + ti->error = "Chunk size is not a multiple of device blocksize"; + r = -EINVAL; + goto bad3; + } + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + ti->error = "Chunk size is not a power of 2"; + r = -EINVAL; + goto bad3; + } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->type = persistent; + for (s->chunk_shift = 0; chunk_size; + s->chunk_shift++, chunk_size >>= 1) + ; + s->chunk_shift--; + + s->valid = 1; + s->have_metadata = 0; + s->last_percent = 0; + init_rwsem(&s->lock); + s->table = ti->table; + + /* Allocate hash table for COW data */ + if (init_hash_tables(s)) { + ti->error = "Unable to allocate hash table space"; + r = -ENOMEM; + goto bad3; + } + + /* + * Check the persistent flag - done here because we need the iobuf + * to check the LV header + */ + s->store.snap = s; + + if (persistent == 'P') + r = dm_create_persistent(&s->store, s->chunk_size); + else + r = dm_create_transient(&s->store, s, blocksize); + + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad4; + } + + r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); + if (r) { + ti->error = "Could not create kcopyd client"; + goto bad5; + } + + /* Add snapshot to the list of snapshots for this origin */ + if (register_snapshot(s)) { + r = -EINVAL; + ti->error = "Cannot register snapshot origin"; + goto bad6; + } + + ti->private = s; + return 0; + + bad6: + kcopyd_client_destroy(s->kcopyd_client); + + bad5: + s->store.destroy(&s->store); + + bad4: + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + bad3: + dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + + bad2: + kfree(s); + + bad1: + return r; +} + +static void snapshot_dtr(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + dm_table_event(ti->table); + + unregister_snapshot(s); + + 
exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + /* Deallocate memory used */ + s->store.destroy(&s->store); + + dm_put_device(ti, s->origin); + dm_put_device(ti, s->cow); + kcopyd_client_destroy(s->kcopyd_client); + kfree(s); +} + +/* + * We hold lists of bios, using the bi_next field. + */ +static void queue_bio(struct bio **queue, struct bio *bio) +{ + bio->bi_next = *queue; + *queue = bio; +} + +/* + * FIXME: inefficient. + */ +static void queue_bios(struct bio **queue, struct bio *bios) +{ + while (*queue) + queue = &((*queue)->bi_next); + + *queue = bios; +} + +/* + * Flush a list of buffers. + */ +static void flush_bios(struct bio *bio) +{ + struct bio *n; + + DMDEBUG("begin flush"); + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + DMDEBUG("flushing %p", bio); + generic_make_request(bio); + bio = n; + } + + blk_run_queues(); +} + +/* + * Error a list of buffers. + */ +static void error_bios(struct bio *bio) +{ + struct bio *n; + + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + bio_io_error(bio, bio->bi_size); + bio = n; + } +} + +static struct bio *__flush_bios(struct pending_exception *pe) +{ + struct pending_exception *sibling; + + if (list_empty(&pe->siblings)) + return pe->origin_bios; + + sibling = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + list_del(&pe->siblings); + + /* FIXME: I think there's a race on SMP machines here, add spin lock */ + queue_bios(&sibling->origin_bios, pe->origin_bios); + + return NULL; +} + +static void check_free_space(struct dm_snapshot *s) +{ +#if 0 + sector_t numerator, denominator; + double n, d; + unsigned pc; + + if (!s->store.fraction_full) + return; + + s->store.fraction_full(&s->store, &numerator, &denominator); + n = (double) numerator; + d = (double) denominator; + + pc = (int) (n / d); + + if (pc >= s->last_percent + WAKE_UP_PERCENT) { + dm_table_event(s->table); + s->last_percent = pc - pc % WAKE_UP_PERCENT; + } +#endif +} + +static void pending_complete(struct pending_exception *pe, int success) +{ + struct exception *e; + struct dm_snapshot *s = pe->snap; + struct bio *flush = NULL; + + if (success) { + e = alloc_exception(); + if (!e) { + DMWARN("Unable to allocate exception."); + down_write(&s->lock); + s->store.drop_snapshot(&s->store); + s->valid = 0; + flush = __flush_bios(pe); + up_write(&s->lock); + + error_bios(pe->snapshot_bios); + goto out; + } + memcpy(e, &pe->e, sizeof(*e)); + + /* + * Add a proper exception, and remove the + * in-flight exception from the list. 
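+ * Both tables are manipulated under s->lock; the bios queued + * against the pending exception are submitted once the lock has + * been dropped.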
+ */ + down_write(&s->lock); + insert_exception(&s->complete, e); + remove_exception(&pe->e); + flush = __flush_bios(pe); + + /* Submit any pending write BHs */ + up_write(&s->lock); + + flush_bios(pe->snapshot_bios); + DMDEBUG("Exception completed successfully."); + + /* Notify any interested parties */ + //check_free_space(s); + + } else { + /* Read/write error - snapshot is unusable */ + down_write(&s->lock); + if (s->valid) + DMERR("Error reading/writing snapshot"); + s->store.drop_snapshot(&s->store); + s->valid = 0; + remove_exception(&pe->e); + flush = __flush_bios(pe); + up_write(&s->lock); + + error_bios(pe->snapshot_bios); + + dm_table_event(s->table); + DMDEBUG("Exception failed."); + } + + out: + free_pending_exception(pe); + + if (flush) + flush_bios(flush); +} + +static void commit_callback(void *context, int success) +{ + struct pending_exception *pe = (struct pending_exception *) context; + pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned int write_err, void *context) +{ + struct pending_exception *pe = (struct pending_exception *) context; + struct dm_snapshot *s = pe->snap; + + if (read_err || write_err) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store.commit_exception(&s->store, &pe->e, commit_callback, + pe); +} + +/* + * Dispatches the copy operation to kcopyd. + */ +static inline void start_copy(struct pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct io_region src, dest; + struct block_device *bdev = s->origin->bdev; + sector_t dev_size; + + dev_size = get_dev_size(bdev); + + src.bdev = bdev; + src.sector = chunk_to_sector(s, pe->e.old_chunk); + src.count = min(s->chunk_size, dev_size - src.sector); + + dest.bdev = s->cow->bdev; + dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.count = src.count; + + /* Hand over to kcopyd */ + DMDEBUG("starting exception copy"); + kcopyd_copy(s->kcopyd_client, + &src, 1, &dest, 0, copy_callback, pe); +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + * + * NOTE: a write lock must be held on snap->lock before calling + * this. + */ +static struct pending_exception * +__find_pending_exception(struct dm_snapshot *s, struct bio *bio) +{ + struct exception *e; + struct pending_exception *pe; + chunk_t chunk = sector_to_chunk(s, bio->bi_sector); + + /* + * Is there a pending exception for this already ? + */ + e = lookup_exception(&s->pending, chunk); + if (e) { + /* cast the exception to a pending exception */ + pe = list_entry(e, struct pending_exception, e); + + } else { + /* + * Create a new pending exception, we don't want + * to hold the lock while we do this. 
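+ * Note that s->lock is dropped and retaken around the + * allocation, so the exception tables may have changed by the + * time it is reacquired.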
+ */ + up_write(&s->lock); + + pe = alloc_pending_exception(); + pe->e.old_chunk = chunk; + pe->origin_bios = pe->snapshot_bios = NULL; + INIT_LIST_HEAD(&pe->siblings); + pe->snap = s; + pe->started = 0; + + down_write(&s->lock); + if (s->store.prepare_exception(&s->store, &pe->e)) { + free_pending_exception(pe); + s->valid = 0; + return NULL; + } + + insert_exception(&s->pending, &pe->e); + } + + return pe; +} + +static inline void remap_exception(struct dm_snapshot *s, struct exception *e, + struct bio *bio) +{ + bio->bi_bdev = s->cow->bdev; + bio->bi_sector = chunk_to_sector(s, e->new_chunk) + + (bio->bi_sector & s->chunk_mask); +} + +static int snapshot_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct exception *e; + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + int r = 1; + chunk_t chunk; + struct pending_exception *pe; + + chunk = sector_to_chunk(s, bio->bi_sector); + + /* Full snapshots are not usable */ + if (!s->valid) + return -1; + + /* + * Write to snapshot - higher level takes care of RW/RO + * flags so we should only get this if we are + * writeable. + */ + if (bio_rw(bio) == WRITE) { + + /* FIXME: should only take the write lock if we need + * to copy an exception */ + down_write(&s->lock); + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) { + remap_exception(s, e, bio); + up_write(&s->lock); + + } else { + pe = __find_pending_exception(s, bio); + + if (!pe) { + s->store.drop_snapshot(&s->store); + s->valid = 0; + r = -EIO; + up_write(&s->lock); + } else { + remap_exception(s, &pe->e, bio); + queue_bio(&pe->snapshot_bios, bio); + + if (!pe->started) { + /* this is protected by snap->lock */ + pe->started = 1; + up_write(&s->lock); + start_copy(pe); + } else + up_write(&s->lock); + r = 0; + } + } + + } else { + /* + * FIXME: this read path scares me because we + * always use the origin when we have a pending + * exception. However I can't think of a + * situation where this is wrong - ejt. + */ + + /* Do reads */ + down_read(&s->lock); + + /* See if it has been remapped */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bio); + else + bio->bi_bdev = s->origin->bdev; + + up_read(&s->lock); + } + + return r; +} + +static void snapshot_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + if (s->have_metadata) + return; + + if (s->store.read_metadata(&s->store)) { + down_write(&s->lock); + s->valid = 0; + up_write(&s->lock); + } + + s->have_metadata = 1; +} + +static int snapshot_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; + char cow[32]; + char org[32]; + + switch (type) { + case STATUSTYPE_INFO: + if (!snap->valid) + snprintf(result, maxlen, "Invalid"); + else { + if (snap->store.fraction_full) { + sector_t numerator, denominator; + snap->store.fraction_full(&snap->store, + &numerator, + &denominator); + snprintf(result, maxlen, + SECTOR_FORMAT "/" SECTOR_FORMAT, + numerator, denominator); + } + else + snprintf(result, maxlen, "Unknown"); + } + break; + + case STATUSTYPE_TABLE: + /* + * We need private copies of the device names so that + * the output makes sense. 
+ */ + format_dev_t(cow, snap->cow->bdev->bd_dev); + format_dev_t(org, snap->origin->bdev->bd_dev); + snprintf(result, maxlen, "%s %s %c %lld", org, cow, + snap->type, snap->chunk_size); + break; + } + + return 0; +} + +/*----------------------------------------------------------------- + * Origin methods + *---------------------------------------------------------------*/ +static void list_merge(struct list_head *l1, struct list_head *l2) +{ + struct list_head *l1_n, *l2_p; + + l1_n = l1->next; + l2_p = l2->prev; + + l1->next = l2; + l2->prev = l1; + + l2_p->next = l1_n; + l1_n->prev = l2_p; +} + +static int __origin_write(struct list_head *snapshots, struct bio *bio) +{ + int r = 1, first = 1; + struct list_head *sl; + struct dm_snapshot *snap; + struct exception *e; + struct pending_exception *pe, *last = NULL; + chunk_t chunk; + + /* Do all the snapshots on this origin */ + list_for_each(sl, snapshots) { + snap = list_entry(sl, struct dm_snapshot, list); + + /* Only deal with valid snapshots */ + if (!snap->valid) + continue; + + down_write(&snap->lock); + + /* + * Remember, different snapshots can have + * different chunk sizes. + */ + chunk = sector_to_chunk(snap, bio->bi_sector); + + /* + * Check exception table to see if block + * is already remapped in this snapshot + * and trigger an exception if not. + */ + e = lookup_exception(&snap->complete, chunk); + if (!e) { + pe = __find_pending_exception(snap, bio); + if (!pe) { + snap->store.drop_snapshot(&snap->store); + snap->valid = 0; + + } else { + if (last) + list_merge(&pe->siblings, + &last->siblings); + + last = pe; + r = 0; + } + } + + up_write(&snap->lock); + } + + /* + * Now that we have a complete pe list we can start the copying. + */ + if (last) { + pe = last; + do { + down_write(&pe->snap->lock); + if (first) + queue_bio(&pe->origin_bios, bio); + +#if 0 + if (!pe->started) { + pe->started = 1; + up_write(&pe->snap->lock); + start_copy(pe); + } else + up_write(&pe->snap->lock); +#else + pe->started = 1; + up_write(&pe->snap->lock); + start_copy(pe); +#endif + first = 0; + pe = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + } while (pe != last); + } + + return r; +} + +/* + * Called on a write from the origin driver. + */ +int do_origin(struct dm_dev *origin, struct bio *bio) +{ + struct origin *o; + int r; + + down_read(&_origins_lock); + o = __lookup_origin(origin->bdev); + if (!o) + BUG(); + + r = __origin_write(&o->snapshots, bio); + up_read(&_origins_lock); + + return r; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. + */ +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc != 1) { + ti->error = "dm-origin: incorrect number of arguments"; + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], 0, ti->len, + dm_table_get_mode(ti->table), &dev); + if (r) { + ti->error = "Cannot get target device"; + return r; + } + + ti->private = dev; + return 0; +} + +static void origin_dtr(struct dm_target *ti) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + dm_put_device(ti, dev); +} + +static int origin_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + bio->bi_bdev = dev->bdev; + + /* Only tell snapshots if this is a write */ + return (bio_rw(bio) == WRITE) ? 
do_origin(dev, bio) : 1; +} + +static int origin_status(struct dm_target *ti, status_type_t type, char *result, + unsigned int maxlen) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + char buffer[32]; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + format_dev_t(buffer, dev->bdev->bd_dev); + snprintf(result, maxlen, "%s", buffer); + break; + } + + return 0; +} + +static struct target_type origin_target = { + name: "snapshot-origin", + module: THIS_MODULE, + ctr: origin_ctr, + dtr: origin_dtr, + map: origin_map, + status: origin_status, +}; + +static struct target_type snapshot_target = { + name: "snapshot", + module: THIS_MODULE, + ctr: snapshot_ctr, + dtr: snapshot_dtr, + map: snapshot_map, + resume: snapshot_resume, + status: snapshot_status, +}; + +static int __init dm_snapshot_init(void) +{ + int r; + + r = dm_register_target(&snapshot_target); + if (r) { + DMERR("snapshot target register failed %d", r); + return r; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Device mapper: Origin: register failed %d\n", r); + goto bad1; + } + + r = init_origin_hash(); + if (r) { + DMERR("init_origin_hash failed."); + goto bad2; + } + + exception_cache = kmem_cache_create("dm-snapshot-ex", + sizeof(struct exception), + __alignof__(struct exception), + 0, NULL, NULL); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad3; + } + + pending_cache = + kmem_cache_create("dm-snapshot-in", + sizeof(struct pending_exception), + __alignof__(struct pending_exception), + 0, NULL, NULL); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad4; + } + + pending_pool = mempool_create(128, mempool_alloc_slab, + mempool_free_slab, pending_cache); + if (!pending_pool) { + DMERR("Couldn't create pending pool."); + r = -ENOMEM; + goto bad5; + } + + return 0; + + bad5: + kmem_cache_destroy(pending_cache); + bad4: + kmem_cache_destroy(exception_cache); + bad3: + exit_origin_hash(); + bad2: + dm_unregister_target(&origin_target); + bad1: + dm_unregister_target(&snapshot_target); + return r; +} + +static void __exit dm_snapshot_exit(void) +{ + int r; + + r = dm_unregister_target(&snapshot_target); + if (r) + DMERR("snapshot unregister failed %d", r); + + r = dm_unregister_target(&origin_target); + if (r) + DMERR("origin unregister failed %d", r); + + exit_origin_hash(); + mempool_destroy(pending_pool); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); +} + +/* Module hooks */ +module_init(dm_snapshot_init); +module_exit(dm_snapshot_exit); + +MODULE_DESCRIPTION(DM_NAME " snapshot target"); +MODULE_AUTHOR("Joe Thornber"); +MODULE_LICENSE("GPL"); --- diff/drivers/md/dm-snapshot.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-snapshot.h 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,161 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_SNAPSHOT_H +#define DM_SNAPSHOT_H + +#include "dm.h" +#include + +struct exception_table { + uint32_t hash_mask; + struct list_head *table; +}; + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 64k - 256k. + */ +/* FIXME: can we get away with limiting these to a uint32_t ? */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. 
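+ * old_chunk is the chunk within the origin device; new_chunk is + * the chunk within the COW device that now holds that data.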
+ */ +struct exception { + struct list_head hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the metadata layout of exception stores + * (the COW device). + */ +struct exception_store { + + /* + * Destroys this object when you've finished with it. + */ + void (*destroy) (struct exception_store *store); + + /* + * The target shouldn't read the COW device until this is + * called. + */ + int (*read_metadata) (struct exception_store *store); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct exception_store *store, + struct exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct exception_store *store); + + /* + * Return how full the snapshot is. + */ + void (*fraction_full) (struct exception_store *store, + sector_t *numerator, + sector_t *denominator); + + struct dm_snapshot *snap; + void *context; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + struct dm_table *table; + + struct dm_dev *origin; + struct dm_dev *cow; + + /* List of snapshots per origin */ + struct list_head list; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + + /* You can't use a snapshot if this is 0 (e.g. if full) */ + int valid; + int have_metadata; + + /* Used for display of table */ + char type; + + /* The last percentage we notified */ + int last_percent; + + struct exception_table pending; + struct exception_table complete; + + /* The on disk metadata handler */ + struct exception_store store; + + struct kcopyd_client *kcopyd_client; +}; + +/* + * Used by the exception stores to load exceptions when + * initialising. + */ +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); + +/* + * Constructor and destructor for the default persistent + * store. + */ +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize); + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> SECTOR_SHIFT; +} + +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) +{ + return (sector & ~s->chunk_mask) >> s->chunk_shift; +} + +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) +{ + return chunk << s->chunk_shift; +} + +static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ + /* + * There is only ever one instance of a particular block + * device so we can compare pointers safely. + */ + return lhs == rhs; +} + +#endif --- diff/drivers/md/kcopyd.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/kcopyd.c 2003-12-29 10:12:47.000000000 +0000 @@ -0,0 +1,652 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. 
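+ * + * kcopyd provides a simple asynchronous copy service: a client + * reserves pages with kcopyd_client_create(), submits copies + * with kcopyd_copy() and is called back as each copy completes + * (dm-raid1.c and dm-snapshot.c above are both users).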
+ */
+
+#include <asm/atomic.h>
+
+#include <linux/blkdev.h>
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "kcopyd.h"
+#include "dm-daemon.h"
+
+/* FIXME: this is only needed for the DMERR macros */
+#include "dm.h"
+
+static struct dm_daemon _kcopyd;
+
+/*-----------------------------------------------------------------
+ * Each kcopyd client has its own little pool of preallocated
+ * pages for kcopyd io.
+ *---------------------------------------------------------------*/
+struct kcopyd_client {
+	struct list_head list;
+
+	spinlock_t lock;
+	struct list_head pages;
+	unsigned int nr_pages;
+	unsigned int nr_free_pages;
+};
+
+static inline void __push_page(struct kcopyd_client *kc, struct page *p)
+{
+	list_add(&p->list, &kc->pages);
+	kc->nr_free_pages++;
+}
+
+static inline struct page *__pop_page(struct kcopyd_client *kc)
+{
+	struct page *p;
+
+	p = list_entry(kc->pages.next, struct page, list);
+	list_del(&p->list);
+	kc->nr_free_pages--;
+
+	return p;
+}
+
+static int kcopyd_get_pages(struct kcopyd_client *kc,
+			    unsigned int nr, struct list_head *pages)
+{
+	struct page *p;
+	INIT_LIST_HEAD(pages);
+
+	spin_lock(&kc->lock);
+	if (kc->nr_free_pages < nr) {
+		spin_unlock(&kc->lock);
+		return -ENOMEM;
+	}
+
+	while (nr--) {
+		p = __pop_page(kc);
+		list_add(&p->list, pages);
+	}
+	spin_unlock(&kc->lock);
+
+	return 0;
+}
+
+static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
+{
+	struct list_head *tmp, *tmp2;
+
+	spin_lock(&kc->lock);
+	list_for_each_safe (tmp, tmp2, pages)
+		__push_page(kc, list_entry(tmp, struct page, list));
+	spin_unlock(&kc->lock);
+}
+
+/*
+ * These three functions resize the page pool.
+ */
+static void drop_pages(struct list_head *pages)
+{
+	struct page *p;
+	struct list_head *tmp, *tmp2;
+
+	list_for_each_safe (tmp, tmp2, pages) {
+		p = list_entry(tmp, struct page, list);
+		ClearPageLocked(p);
+		__free_page(p);
+	}
+}
+
+static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
+{
+	unsigned int i;
+	struct page *p;
+	LIST_HEAD(new);
+
+	for (i = 0; i < nr; i++) {
+		p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			drop_pages(&new);
+			return -ENOMEM;
+		}
+
+		SetPageLocked(p);
+		list_add(&p->list, &new);
+	}
+
+	kcopyd_put_pages(kc, &new);
+	kc->nr_pages += nr;
+	return 0;
+}
+
+static void client_free_pages(struct kcopyd_client *kc)
+{
+	BUG_ON(kc->nr_free_pages != kc->nr_pages);
+	drop_pages(&kc->pages);
+	kc->nr_free_pages = kc->nr_pages = 0;
+}
+
+/*-----------------------------------------------------------------
+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
+ * for this reason we use a mempool to prevent the client from
+ * ever having to do io (which could cause a deadlock).
+ *---------------------------------------------------------------*/
+struct kcopyd_job {
+	struct kcopyd_client *kc;
+	struct list_head list;
+	unsigned long flags;
+
+	/*
+	 * Error state of the job.
+	 */
+	int read_err;
+	unsigned int write_err;
+
+	/*
+	 * Either READ or WRITE
+	 */
+	int rw;
+	struct io_region source;
+
+	/*
+	 * The destinations for the transfer.
+	 */
+	unsigned int num_dests;
+	struct io_region dests[KCOPYD_MAX_REGIONS];
+
+	sector_t offset;
+	unsigned int nr_pages;
+	struct list_head pages;
+
+	/*
+	 * Set this to ensure you are notified when the job has
+	 * completed.  'context' is for callback to use.
+	 */
+	kcopyd_notify_fn fn;
+	void *context;
+
+	/*
+	 * These fields are only used if the job has been split
+	 * into more manageable parts.
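+	 * 'progress' counts the source sectors dispatched so far and
+	 * 'sub_jobs' the number of outstanding sub-jobs; the parent
+	 * job is freed only when the final sub-job's completion drops
+	 * 'sub_jobs' to zero (see segment_complete() below).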
+	 */
+	struct semaphore lock;
+	atomic_t sub_jobs;
+	sector_t progress;
+};
+
+/* FIXME: this should scale with the number of pages */
+#define MIN_JOBS 512
+
+static kmem_cache_t *_job_cache;
+static mempool_t *_job_pool;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i)   jobs waiting for pages
+ * ii)  jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by _job_lock.
+ */
+static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(_complete_jobs);
+static LIST_HEAD(_io_jobs);
+static LIST_HEAD(_pages_jobs);
+
+static int jobs_init(void)
+{
+	INIT_LIST_HEAD(&_complete_jobs);
+	INIT_LIST_HEAD(&_io_jobs);
+	INIT_LIST_HEAD(&_pages_jobs);
+
+	_job_cache = kmem_cache_create("kcopyd-jobs",
+				       sizeof(struct kcopyd_job),
+				       __alignof__(struct kcopyd_job),
+				       0, NULL, NULL);
+	if (!_job_cache)
+		return -ENOMEM;
+
+	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
+				   mempool_free_slab, _job_cache);
+	if (!_job_pool) {
+		kmem_cache_destroy(_job_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void jobs_exit(void)
+{
+	BUG_ON(!list_empty(&_complete_jobs));
+	BUG_ON(!list_empty(&_io_jobs));
+	BUG_ON(!list_empty(&_pages_jobs));
+
+	mempool_destroy(_job_pool);
+	kmem_cache_destroy(_job_cache);
+}
+
+/*
+ * Functions to push a job onto the tail of a given job list, and
+ * to pop one off its head.
+ */
+static inline struct kcopyd_job *pop(struct list_head *jobs)
+{
+	struct kcopyd_job *job = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&_job_lock, flags);
+
+	if (!list_empty(jobs)) {
+		job = list_entry(jobs->next, struct kcopyd_job, list);
+		list_del(&job->list);
+	}
+	spin_unlock_irqrestore(&_job_lock, flags);
+
+	return job;
+}
+
+static inline void push(struct list_head *jobs, struct kcopyd_job *job)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&_job_lock, flags);
+	list_add_tail(&job->list, jobs);
+	spin_unlock_irqrestore(&_job_lock, flags);
+}
+
+/*
+ * These three functions process 1 item from the corresponding
+ * job list.
+ *
+ * They return:
+ * < 0: error
+ *   0: success
+ * > 0: can't process yet.
+ */
+static int run_complete_job(struct kcopyd_job *job)
+{
+	void *context = job->context;
+	int read_err = job->read_err;
+	unsigned int write_err = job->write_err;
+	kcopyd_notify_fn fn = job->fn;
+
+	kcopyd_put_pages(job->kc, &job->pages);
+	mempool_free(job, _job_pool);
+	fn(read_err, write_err, context);
+	return 0;
+}
+
+static unsigned _pending;
+static void complete_io(unsigned long error, void *context)
+{
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+
+	_pending--;
+	if (error) {
+		if (job->rw == WRITE)
+			job->write_err |= error;
+		else
+			job->read_err = 1;
+
+		if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
+			push(&_complete_jobs, job);
+			dm_daemon_wake(&_kcopyd);
+			return;
+		}
+	}
+
+	if (job->rw == WRITE)
+		push(&_complete_jobs, job);
+
+	else {
+		job->rw = WRITE;
+		push(&_io_jobs, job);
+	}
+
+	dm_daemon_wake(&_kcopyd);
+}
+
+/*
+ * Request io on as many pages as we can currently get for
+ * a particular job.
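+ *
+ * Note that each job passes through here twice: first as a READ of
+ * the source, then, once complete_io() has re-queued it, as a WRITE
+ * to all the destinations.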
+ */
+static int run_io_job(struct kcopyd_job *job)
+{
+	int r;
+
+	if (job->rw == READ)
+		r = dm_io_async(1, &job->source, job->rw,
+				list_entry(job->pages.next, struct page, list),
+				job->offset, complete_io, job);
+
+	else
+		r = dm_io_async(job->num_dests, job->dests, job->rw,
+				list_entry(job->pages.next, struct page, list),
+				job->offset, complete_io, job);
+
+	if (!r)
+		_pending++;
+
+	return r;
+}
+
+static int run_pages_job(struct kcopyd_job *job)
+{
+	int r;
+
+	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
+				  PAGE_SIZE >> 9);
+	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	if (!r) {
+		/* this job is ready for io */
+		push(&_io_jobs, job);
+		return 0;
+	}
+
+	if (r == -ENOMEM)
+		/* can't complete now */
+		return 1;
+
+	return r;
+}
+
+/*
+ * Run through a list for as long as possible.  Returns the count
+ * of successful jobs.
+ */
+static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
+{
+	struct kcopyd_job *job;
+	int r, count = 0;
+
+	while ((job = pop(jobs))) {
+
+		r = fn(job);
+
+		if (r < 0) {
+			/* error this rogue job */
+			if (job->rw == WRITE)
+				job->write_err = (unsigned int) -1;
+			else
+				job->read_err = 1;
+			push(&_complete_jobs, job);
+			break;
+		}
+
+		if (r > 0) {
+			/*
+			 * We couldn't service this job ATM, so
+			 * push this job back onto the list.
+			 */
+			push(jobs, job);
+			break;
+		}
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * kcopyd does this every time it's woken up.
+ */
+static void do_work(void)
+{
+	/*
+	 * The order that these are called is *very* important.
+	 * complete jobs can free some pages for pages jobs.
+	 * Pages jobs when successful will jump onto the io jobs
+	 * list.  io jobs call wake when they complete and it all
+	 * starts again.
+	 */
+	process_jobs(&_complete_jobs, run_complete_job);
+	process_jobs(&_pages_jobs, run_pages_job);
+	process_jobs(&_io_jobs, run_io_job);
+
+	blk_run_queues();
+}
+
+/*
+ * If we are copying a small region we just dispatch a single job
+ * to do the copy, otherwise the io has to be split up into many
+ * jobs.
+ */
+static void dispatch_job(struct kcopyd_job *job)
+{
+	push(&_pages_jobs, job);
+	dm_daemon_wake(&_kcopyd);
+}
+
+#define SUB_JOB_SIZE 128
+static void segment_complete(int read_err,
+			     unsigned int write_err, void *context)
+{
+	/* FIXME: tidy this function */
+	sector_t progress = 0;
+	sector_t count = 0;
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+
+	down(&job->lock);
+
+	/* update the error */
+	if (read_err)
+		job->read_err = 1;
+
+	if (write_err)
+		job->write_err |= write_err;
+
+	/*
+	 * Only dispatch more work if there hasn't been an error.
+	 */
+	if ((!job->read_err && !job->write_err) ||
+	    test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
+		/* get the next chunk of work */
+		progress = job->progress;
+		count = job->source.count - progress;
+		if (count) {
+			if (count > SUB_JOB_SIZE)
+				count = SUB_JOB_SIZE;
+
+			job->progress += count;
+		}
+	}
+	up(&job->lock);
+
+	if (count) {
+		int i;
+		struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
+
+		memcpy(sub_job, job, sizeof(*job));
+		sub_job->source.sector += progress;
+		sub_job->source.count = count;
+
+		for (i = 0; i < job->num_dests; i++) {
+			sub_job->dests[i].sector += progress;
+			sub_job->dests[i].count = count;
+		}
+
+		sub_job->fn = segment_complete;
+		sub_job->context = job;
+		dispatch_job(sub_job);
+
+	} else if (atomic_dec_and_test(&job->sub_jobs)) {
+
+		/*
+		 * To avoid a race we must keep the job around
+		 * until after the notify function has completed.
+		 * Otherwise the client may try and stop the job
+		 * after we've completed.
+		 */
+		job->fn(read_err, write_err, job->context);
+		mempool_free(job, _job_pool);
+	}
+}
+
+/*
+ * Create some sub-jobs that will between them do the copy.
+ */
+#define SPLIT_COUNT 8
+static void split_job(struct kcopyd_job *job)
+{
+	int i;
+
+	atomic_set(&job->sub_jobs, SPLIT_COUNT);
+	for (i = 0; i < SPLIT_COUNT; i++)
+		segment_complete(0, 0u, job);
+}
+
+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+		unsigned int num_dests, struct io_region *dests,
+		unsigned int flags, kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	/*
+	 * Allocate a new job.
+	 */
+	job = mempool_alloc(_job_pool, GFP_NOIO);
+
+	/*
+	 * set up for the read.
+	 */
+	job->kc = kc;
+	job->flags = flags;
+	job->read_err = 0;
+	job->write_err = 0;
+	job->rw = READ;
+
+	memcpy(&job->source, from, sizeof(*from));
+
+	job->num_dests = num_dests;
+	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
+
+	job->offset = 0;
+	job->nr_pages = 0;
+	INIT_LIST_HEAD(&job->pages);
+
+	job->fn = fn;
+	job->context = context;
+
+	if (job->source.count < SUB_JOB_SIZE)
+		dispatch_job(job);
+
+	else {
+		init_MUTEX(&job->lock);
+		job->progress = 0;
+		split_job(job);
+	}
+
+	return 0;
+}
+
+/*
+ * Cancels a kcopyd job, e.g. someone might be deactivating a
+ * mirror.
+ */
+int kcopyd_cancel(struct kcopyd_job *job, int block)
+{
+	/* FIXME: finish */
+	return -1;
+}
+
+/*-----------------------------------------------------------------
+ * Unit setup
+ *---------------------------------------------------------------*/
+static DECLARE_MUTEX(_client_lock);
+static LIST_HEAD(_clients);
+
+static int client_add(struct kcopyd_client *kc)
+{
+	down(&_client_lock);
+	list_add(&kc->list, &_clients);
+	up(&_client_lock);
+	return 0;
+}
+
+static void client_del(struct kcopyd_client *kc)
+{
+	down(&_client_lock);
+	list_del(&kc->list);
+	up(&_client_lock);
+}
+
+int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
+{
+	int r = 0;
+	struct kcopyd_client *kc;
+
+	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+	if (!kc)
+		return -ENOMEM;
+
+	kc->lock = SPIN_LOCK_UNLOCKED;
+	INIT_LIST_HEAD(&kc->pages);
+	kc->nr_pages = kc->nr_free_pages = 0;
+	r = client_alloc_pages(kc, nr_pages);
+	if (r) {
+		kfree(kc);
+		return r;
+	}
+
+	r = dm_io_get(nr_pages);
+	if (r) {
+		client_free_pages(kc);
+		kfree(kc);
+		return r;
+	}
+
+	r = client_add(kc);
+	if (r) {
+		dm_io_put(nr_pages);
+		client_free_pages(kc);
+		kfree(kc);
+		return r;
+	}
+
+	*result = kc;
+	return 0;
+}
+
+void kcopyd_client_destroy(struct kcopyd_client *kc)
+{
+	dm_io_put(kc->nr_pages);
+	client_free_pages(kc);
+	client_del(kc);
+	kfree(kc);
+}
+
+int __init kcopyd_init(void)
+{
+	int r;
+
+	r = jobs_init();
+	if (r)
+		return r;
+
+	r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
+	if (r)
+		jobs_exit();
+
+	return r;
+}
+
+void kcopyd_exit(void)
+{
+	dm_daemon_stop(&_kcopyd);
+	jobs_exit();
+}
+
+EXPORT_SYMBOL(kcopyd_client_create);
+EXPORT_SYMBOL(kcopyd_client_destroy);
+EXPORT_SYMBOL(kcopyd_copy);
+EXPORT_SYMBOL(kcopyd_cancel);
--- diff/drivers/md/kcopyd.h	1970-01-01 01:00:00.000000000 +0100
+++ source/drivers/md/kcopyd.h	2003-12-29 10:12:47.000000000 +0000
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2001 Sistina Software
+ *
+ * This file is released under the GPL.
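+ *
+ * Typical use: create a client with enough pages for the copies you
+ * expect to have in flight, call kcopyd_copy() once per region, and
+ * destroy the client after the last completion callback has fired.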
+ */
+
+#ifndef DM_KCOPYD_H
+#define DM_KCOPYD_H
+
+#include "dm-io.h"
+
+int kcopyd_init(void);
+void kcopyd_exit(void);
+
+/* FIXME: make this configurable */
+#define KCOPYD_MAX_REGIONS 8
+
+#define KCOPYD_IGNORE_ERROR 1
+
+/*
+ * To use kcopyd you must first create a kcopyd client object.
+ */
+struct kcopyd_client;
+int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
+void kcopyd_client_destroy(struct kcopyd_client *kc);
+
+/*
+ * Submit a copy job to kcopyd.  This is built on top of the
+ * client interface above.
+ *
+ * read_err is a boolean,
+ * write_err is a bitset, with 1 bit for each destination region.
+ */
+typedef void (*kcopyd_notify_fn)(int read_err,
+				 unsigned int write_err, void *context);
+
+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+		unsigned int num_dests, struct io_region *dests,
+		unsigned int flags, kcopyd_notify_fn fn, void *context);
+
+#endif
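For illustration only, here is a minimal sketch of how a kernel-side
client might drive the interface above (hypothetical caller, not part
of the patch; it assumes struct io_region from dm-io.h carries bdev,
sector and count fields, and elides most error handling):

#include <linux/blkdev.h>
#include <linux/completion.h>

#include "kcopyd.h"

/* read_err is a boolean, write_err a bitset of failed destinations */
static void copy_done(int read_err, unsigned int write_err, void *context)
{
	complete((struct completion *) context);
}

static int copy_one_region(struct block_device *src,
			   struct block_device *dst,
			   sector_t sector, sector_t count)
{
	struct kcopyd_client *kc;
	struct io_region from, to;
	struct completion done;
	int r;

	/* a 32-page pool is an arbitrary example value */
	r = kcopyd_client_create(32, &kc);
	if (r)
		return r;

	from.bdev = src;
	from.sector = sector;
	from.count = count;

	to.bdev = dst;
	to.sector = sector;
	to.count = count;

	/* one source, one destination, no flags; wait for the callback */
	init_completion(&done);
	r = kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
	if (!r)
		wait_for_completion(&done);

	kcopyd_client_destroy(kc);
	return r;
}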