Persistent log for mirroring.  Compiles but not run yet.

Adds a "disk" dirty log type next to the existing "core" type.  Both
types share struct log_c and every method except ctr/dtr/resume/flush:

  - dm_create_dirty_log() and the log ->ctr method now take the
    struct dm_target instead of the device size; the core log reads
    ti->len, the disk log also uses ti to get/put the log device.
  - The disk log's first argument is the log device; the remaining
    arguments are passed on to core_ctr().
  - On-disk layout: a little-endian log_header in sector 0 and the
    little-endian clean bitset starting at sector LOG_OFFSET.  Only the
    clean bitset is written out; on resume it is read back, regions
    beyond the recorded nr_regions are marked dirty, and the sync
    bitset is seeded from the clean bitset.
  - disk_flush() only writes the bitset if one of the log_set_bit()/
    log_clear_bit() helpers has touched a bitset since the last
    successful write.
  - dirty_log_type grows optional ->suspend and ->resume methods, which
    dm-raid1 calls from mirror_suspend()/mirror_resume().

Details worth checking when reviewing:

  - bits_to_disk() must loop over 'count' words; a loop bounded by its
    own index never copies anything, so stale bits would be written.
  - disk_dtr() has to release the log device and the disk buffers
    before calling core_dtr(), because core_dtr() frees the log_c.
  - log_header.nr_regions is a fixed 64 bit field so that the
    cpu_to_le64()/le64_to_cpu() conversions and the on-disk layout do
    not depend on the width of sector_t.

--- diff/drivers/md/dm-log.c	2003-11-26 10:18:32.000000000 +0000
+++ source/drivers/md/dm-log.c	2003-12-10 13:34:26.000000000 +0000
@@ -69,7 +69,7 @@
 	spin_unlock(&_lock);
 }
 
-struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
+struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
 				      unsigned int argc, char **argv)
 {
 	struct dirty_log_type *type;
@@ -86,7 +86,7 @@
 	}
 
 	log->type = type;
-	if (type->ctr(log, dev_size, argc, argv)) {
+	if (type->ctr(log, ti, argc, argv)) {
 		kfree(log);
 		put_type(type);
 		return NULL;
@@ -102,37 +102,193 @@
 	kfree(log);
 }
 
-
 /*-----------------------------------------------------------------
- * In core log, ie. trivial, non-persistent
- *
- * For now we'll keep this simple and just have 2 bitsets, one
- * for clean/dirty, the other for sync/nosync.  The sync bitset
- * will be freed when everything is in sync.
- *
- * FIXME: problems with a 64bit sector_t
+ * Persistent and core logs share a lot of their implementation.
+ * FIXME: need a reload method to be called from a resume
  *---------------------------------------------------------------*/
 
-struct core_log {
+/*
+ * Magic for persistent mirrors: "MiRr"
+ */
+#define MIRROR_MAGIC 0x4D695272
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define MIRROR_DISK_VERSION 1
+#define LOG_OFFSET 2
+
+struct log_header {
+	uint32_t magic;
+
+	/*
+	 * Simple, incrementing version. no backward
+	 * compatibility.
+	 */
+	uint32_t version;
+	uint64_t nr_regions;
+};
+
+struct log_c {
+	struct dm_target *ti;
+	int touched;
 	sector_t region_size;
 	unsigned int region_count;
-	unsigned long *clean_bits;
-	unsigned long *sync_bits;
-	unsigned long *recovering_bits;	/* FIXME: this seems excessive */
+
+	unsigned bitset_uint32_count;
+	uint32_t *clean_bits;
+	uint32_t *sync_bits;
+	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
 
 	int sync_search;
+
+	/*
+	 * Disk log fields
+	 */
+	struct dm_dev *log_dev;
+	struct log_header header;
+
+	struct io_region header_location;
+	struct page *header_pages;
+	struct log_header *disk_header;
+
+	struct io_region bits_location;
+	struct page *bits_pages;
+	uint32_t *disk_bits;
 };
 
+/*
+ * The touched member needs to be updated every time we access
+ * one of the bitsets.
+ */
+static inline int log_test_bit(uint32_t *bs, unsigned bit)
+{
+	return test_bit(bit, (unsigned long *) bs) ? 1 : 0;
+}
+
+static inline void log_set_bit(struct log_c *l,
+			       uint32_t *bs, unsigned bit)
+{
+	set_bit(bit, (unsigned long *) bs);
+	l->touched = 1;
+}
+
+static inline void log_clear_bit(struct log_c *l,
+				 uint32_t *bs, unsigned bit)
+{
+	clear_bit(bit, (unsigned long *) bs);
+	l->touched = 1;
+}
+
+/*----------------------------------------------------------------
+ * Header IO
+ *--------------------------------------------------------------*/
+static void header_to_disk(struct log_header *core, struct log_header *disk)
+{
+	disk->magic = cpu_to_le32(core->magic);
+	disk->version = cpu_to_le32(core->version);
+	disk->nr_regions = cpu_to_le64(core->nr_regions);
+}
+
+static void header_from_disk(struct log_header *core, struct log_header *disk)
+{
+	core->magic = le32_to_cpu(disk->magic);
+	core->version = le32_to_cpu(disk->version);
+	core->nr_regions = le64_to_cpu(disk->nr_regions);
+}
+
+static int read_header(struct log_c *log)
+{
+	int r;
+	unsigned long ebits;
+
+	r = dm_io_sync(1, &log->header_location, READ,
+		       log->header_pages, 0, &ebits);
+	if (r)
+		return r;
+
+	header_from_disk(&log->header, log->disk_header);
+
+	if (log->header.magic != MIRROR_MAGIC) {
+		log->header.magic = MIRROR_MAGIC;
+		log->header.version = MIRROR_DISK_VERSION;
+		log->header.nr_regions = 0;
+	}
+
+	if (log->header.version != MIRROR_DISK_VERSION) {
+		DMWARN("incompatible disk log version");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline int write_header(struct log_c *log)
+{
+	unsigned long ebits;
+
+	header_to_disk(&log->header, log->disk_header);
+	return dm_io_sync(1, &log->header_location, WRITE,
+			  log->header_pages, 0, &ebits);
+}
+
+/*----------------------------------------------------------------
+ * Bits IO
+ *--------------------------------------------------------------*/
+static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count)
+{
+	unsigned i;
+
+	for (i = 0; i < count; i++)
+		core[i] = le32_to_cpu(disk[i]);
+}
+
+static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count)
+{
+	unsigned i;
+
+	/* copy across the clean/dirty bitset */
+	for (i = 0; i < count; i++)
+		disk[i] = cpu_to_le32(core[i]);
+}
+
+static int read_bits(struct log_c *log)
+{
+	int r;
+	unsigned long ebits;
+
+	r = dm_io_sync(1, &log->bits_location, READ,
+		       log->bits_pages, 0, &ebits);
+	if (r)
+		return r;
+
+	bits_to_core(log->clean_bits, log->disk_bits,
+		     log->bitset_uint32_count);
+	return 0;
+}
+
+static int write_bits(struct log_c *log)
+{
+	unsigned long ebits;
+	bits_to_disk(log->clean_bits, log->disk_bits,
+		     log->bitset_uint32_count);
+	return dm_io_sync(1, &log->bits_location, WRITE,
+			  log->bits_pages, 0, &ebits);
+}
+
+/*----------------------------------------------------------------
+ * constructor/destructor
+ *--------------------------------------------------------------*/
 #define BYTE_SHIFT 3
-static int core_ctr(struct dirty_log *log, sector_t dev_size,
+static int core_ctr(struct dirty_log *log, struct dm_target *ti,
 		    unsigned int argc, char **argv)
 {
-	struct core_log *clog;
+	struct log_c *lc;
 	sector_t region_size;
 	unsigned int region_count;
 	size_t bitset_size;
 
 	if (argc != 1) {
-		DMWARN("wrong number of arguments to core_log");
+		DMWARN("wrong number of arguments to log_c");
 		return -EINVAL;
 	}
@@ -141,81 +297,182 @@
 		return -EINVAL;
 	}
 
-	region_count = dm_div_up(dev_size, region_size);
+	region_count = dm_div_up(ti->len, region_size);
 
-	clog = kmalloc(sizeof(*clog), GFP_KERNEL);
-	if (!clog) {
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
 		DMWARN("couldn't allocate core log");
 		return -ENOMEM;
 	}
 
-	clog->region_size = region_size;
-	clog->region_count = region_count;
+	lc->ti = ti;
+	lc->touched = 0;
+	lc->region_size = region_size;
+	lc->region_count = region_count;
 
 	/*
 	 * Work out how many words we need to hold the bitset.
 	 */
 	bitset_size = dm_round_up(region_count,
-				  sizeof(*clog->clean_bits) << BYTE_SHIFT);
+				  sizeof(*lc->clean_bits) << BYTE_SHIFT);
 	bitset_size >>= BYTE_SHIFT;
 
-	clog->clean_bits = vmalloc(bitset_size);
-	if (!clog->clean_bits) {
+	lc->bitset_uint32_count = bitset_size / 4;
+	lc->clean_bits = vmalloc(bitset_size);
+	if (!lc->clean_bits) {
 		DMWARN("couldn't allocate clean bitset");
-		kfree(clog);
+		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(clog->clean_bits, -1, bitset_size);
+	memset(lc->clean_bits, -1, bitset_size);
 
-	clog->sync_bits = vmalloc(bitset_size);
-	if (!clog->sync_bits) {
+	lc->sync_bits = vmalloc(bitset_size);
+	if (!lc->sync_bits) {
 		DMWARN("couldn't allocate sync bitset");
-		vfree(clog->clean_bits);
-		kfree(clog);
+		vfree(lc->clean_bits);
+		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(clog->sync_bits, 0, bitset_size);
+	memset(lc->sync_bits, 0, bitset_size);
 
-	clog->recovering_bits = vmalloc(bitset_size);
-	if (!clog->recovering_bits) {
+	lc->recovering_bits = vmalloc(bitset_size);
+	if (!lc->recovering_bits) {
 		DMWARN("couldn't allocate sync bitset");
-		vfree(clog->sync_bits);
-		vfree(clog->clean_bits);
-		kfree(clog);
+		vfree(lc->sync_bits);
+		vfree(lc->clean_bits);
+		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(clog->recovering_bits, 0, bitset_size);
-	clog->sync_search = 0;
-	log->context = clog;
+	memset(lc->recovering_bits, 0, bitset_size);
+	lc->sync_search = 0;
+	log->context = lc;
 	return 0;
 }
 
 static void core_dtr(struct dirty_log *log)
 {
-	struct core_log *clog = (struct core_log *) log->context;
-	vfree(clog->clean_bits);
-	vfree(clog->sync_bits);
-	vfree(clog->recovering_bits);
-	kfree(clog);
+	struct log_c *lc = (struct log_c *) log->context;
+	vfree(lc->clean_bits);
+	vfree(lc->sync_bits);
+	vfree(lc->recovering_bits);
+	kfree(lc);
+}
+
+static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	int r;
+	size_t size;
+	struct log_c *lc;
+	struct dm_dev *dev;
+
+	r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
+			  FMODE_READ | FMODE_WRITE, &dev);
+	if (r)
+		return r;
+
+	r = core_ctr(log, ti, argc - 1, argv + 1);
+	if (r) {
+		dm_put_device(ti, dev);
+		return r;
+	}
+
+	lc = (struct log_c *) log->context;
+	lc->log_dev = dev;
+
+	/* setup the disk header fields */
+	lc->header_location.bdev = lc->log_dev->bdev;
+	lc->header_location.sector = 0;
+	lc->header_location.count = 1;
+	lc->disk_header = vmalloc(sizeof(struct log_header));
+	if (!lc->disk_header)
+		goto bad;
+
+	lc->header_pages = vmalloc_to_page(lc->disk_header);
+
+	/* setup the disk bitset fields */
+	lc->bits_location.bdev = lc->log_dev->bdev;
+	lc->bits_location.sector = LOG_OFFSET;
+
+	size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
+			   1 << SECTOR_SHIFT);
+	lc->bits_location.count = size >> SECTOR_SHIFT;
+	lc->disk_bits = vmalloc(size);
+	if (!lc->disk_bits) {
+		vfree(lc->disk_header);
+		goto bad;
+	}
+	lc->bits_pages = vmalloc_to_page(lc->disk_bits);
+	return 0;
+
+ bad:
+	dm_put_device(ti, lc->log_dev);
+	core_dtr(log);
+	return -ENOMEM;
+}
+
+static void disk_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	dm_put_device(lc->ti, lc->log_dev);
+	vfree(lc->disk_header);
+	vfree(lc->disk_bits);
+	core_dtr(log);
+}
+
+static int disk_resume(struct dirty_log *log)
+{
+	int r;
+	unsigned i;
+	struct log_c *lc = (struct log_c *) log->context;
+	size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
+
+	/* read the disk header */
+	r = read_header(lc);
+	if (r)
+		return r;
+
+	/* read the bits */
+	r = read_bits(lc);
+	if (r)
+		return r;
+
+	/* zero any new bits if the mirror has grown */
+	for (i = lc->header.nr_regions; i < lc->region_count; i++)
+		/* FIXME: amazingly inefficient */
+		log_clear_bit(lc, lc->clean_bits, i);
+
+	/* copy clean across to sync */
+	memcpy(lc->sync_bits, lc->clean_bits, size);
+
+	/* write the bits */
+	r = write_bits(lc);
+	if (r)
+		return r;
+
+	/* set the correct number of regions in the header */
+	lc->header.nr_regions = lc->region_count;
+
+	/* write the new header */
+	return write_header(lc);
 }
 
 static sector_t core_get_region_size(struct dirty_log *log)
 {
-	struct core_log *clog = (struct core_log *) log->context;
-	return clog->region_size;
+	struct log_c *lc = (struct log_c *) log->context;
+	return lc->region_size;
 }
 
 static int core_is_clean(struct dirty_log *log, region_t region)
 {
-	struct core_log *clog = (struct core_log *) log->context;
-	return test_bit(region, clog->clean_bits);
+	struct log_c *lc = (struct log_c *) log->context;
+	return log_test_bit(lc->clean_bits, region);
 }
 
 static int core_in_sync(struct dirty_log *log, region_t region, int block)
 {
-	struct core_log *clog = (struct core_log *) log->context;
-
-	return test_bit(region, clog->sync_bits) ? 1 : 0;
+	struct log_c *lc = (struct log_c *) log->context;
+	return log_test_bit(lc->sync_bits, region);
 }
 
 static int core_flush(struct dirty_log *log)
@@ -224,53 +481,68 @@
 	return 0;
 }
 
+static int disk_flush(struct dirty_log *log)
+{
+	int r;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	/* only write if the log has changed */
+	if (!lc->touched)
+		return 0;
+
+	r = write_bits(lc);
+	if (!r)
+		lc->touched = 0;
+
+	return r;
+}
+
 static void core_mark_region(struct dirty_log *log, region_t region)
 {
-	struct core_log *clog = (struct core_log *) log->context;
-	clear_bit(region, clog->clean_bits);
+	struct log_c *lc = (struct log_c *) log->context;
+	log_clear_bit(lc, lc->clean_bits, region);
 }
 
 static void core_clear_region(struct dirty_log *log, region_t region)
 {
-	struct core_log *clog = (struct core_log *) log->context;
-	set_bit(region, clog->clean_bits);
+	struct log_c *lc = (struct log_c *) log->context;
+	log_set_bit(lc, lc->clean_bits, region);
 }
 
 static int core_get_resync_work(struct dirty_log *log, region_t *region)
 {
-	struct core_log *clog = (struct core_log *) log->context;
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (clog->sync_search >= clog->region_count)
+	if (lc->sync_search >= lc->region_count)
 		return 0;
 
 	do {
-		*region = find_next_zero_bit(clog->sync_bits,
-					     clog->region_count,
-					     clog->sync_search);
-		clog->sync_search = *region + 1;
+		*region = find_next_zero_bit((unsigned long *) lc->sync_bits,
+					     lc->region_count,
+					     lc->sync_search);
+		lc->sync_search = *region + 1;
 
-		if (*region == clog->region_count)
+		if (*region == lc->region_count)
 			return 0;
 
-	} while (test_bit(*region, clog->recovering_bits));
+	} while (log_test_bit(lc->recovering_bits, *region));
 
-	set_bit(*region, clog->recovering_bits);
+	log_set_bit(lc, lc->recovering_bits, *region);
 	return 1;
 }
 
 static void core_complete_resync_work(struct dirty_log *log, region_t region,
 				      int success)
 {
-	struct core_log *clog = (struct core_log *) log->context;
+	struct log_c *lc = (struct log_c *) log->context;
 
-	clear_bit(region, clog->recovering_bits);
+	log_clear_bit(lc, lc->recovering_bits, region);
 	if (success)
-		set_bit(region, clog->sync_bits);
+		log_set_bit(lc, lc->sync_bits, region);
 }
 
 static struct dirty_log_type _core_type = {
 	.name = "core",
-
 	.ctr = core_ctr,
 	.dtr = core_dtr,
 	.get_region_size = core_get_region_size,
@@ -283,6 +555,21 @@
 	.complete_resync_work = core_complete_resync_work
 };
 
+static struct dirty_log_type _disk_type = {
+	.name = "disk",
+	.ctr = disk_ctr,
+	.dtr = disk_dtr,
+	.resume = disk_resume,
+	.get_region_size = core_get_region_size,
+	.is_clean = core_is_clean,
+	.in_sync = core_in_sync,
+	.flush = disk_flush,
+	.mark_region = core_mark_region,
+	.clear_region = core_clear_region,
+	.get_resync_work = core_get_resync_work,
+	.complete_resync_work = core_complete_resync_work
+};
+
 __init int dm_dirty_log_init(void)
 {
 	int r;
@@ -291,11 +578,18 @@
 	if (r)
 		DMWARN("couldn't register core log");
 
+	r = dm_register_dirty_log_type(&_disk_type);
+	if (r) {
+		DMWARN("couldn't register disk type");
+		dm_unregister_dirty_log_type(&_core_type);
+	}
+
 	return r;
 }
 
 void dm_dirty_log_exit(void)
 {
+	dm_unregister_dirty_log_type(&_disk_type);
 	dm_unregister_dirty_log_type(&_core_type);
 }
 
--- diff/drivers/md/dm-log.h	2003-11-26 10:18:32.000000000 +0000
+++ source/drivers/md/dm-log.h	2003-12-10 13:02:37.000000000 +0000
@@ -24,11 +24,18 @@
 	struct module *module;
 	unsigned int use_count;
 
-	int (*ctr)(struct dirty_log *log, sector_t dev_size,
+	int (*ctr)(struct dirty_log *log, struct dm_target *ti,
 		   unsigned int argc, char **argv);
 	void (*dtr)(struct dirty_log *log);
 
 	/*
+	 * There are times when we don't want the log to touch
+	 * the disk.
+	 */
+	int (*suspend)(struct dirty_log *log);
+	int (*resume)(struct dirty_log *log);
+
+	/*
 	 * Retrieves the smallest size of region that the log can
 	 * deal with.
 	 */
@@ -99,7 +106,7 @@
  * Make sure you use these two functions, rather than calling
  * type->constructor/destructor() directly.
  */
-struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
+struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
 				      unsigned int argc, char **argv);
 void dm_destroy_dirty_log(struct dirty_log *log);
 
--- diff/drivers/md/dm-raid1.c	2003-12-04 11:18:37.000000000 +0000
+++ source/drivers/md/dm-raid1.c	2003-12-10 13:37:42.000000000 +0000
@@ -637,6 +637,8 @@
 	struct region *reg = (struct region *) context;
 	struct mirror_set *ms = reg->rh->ms;
 
+	/* FIXME: we need to flush the log */
+
 	/* FIXME: better error handling */
 	rh_recovery_end(reg, read_err || write_err);
 	if (++ms->sync_count == ms->nr_regions)
@@ -1025,7 +1027,7 @@
 		return NULL;
 	}
 
-	dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
+	dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
 	if (!dl) {
 		ti->error = "dm-mirror: Error creating mirror dirty log";
 		return NULL;
@@ -1198,12 +1200,20 @@
 static void mirror_suspend(struct dm_target *ti)
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	struct dirty_log *log = ms->rh.log;
 	rh_stop_recovery(&ms->rh);
+	if (log->type->suspend && log->type->suspend(log))
+		/* FIXME: need better error handling */
+		DMWARN("log suspend failed");
 }
 
 static void mirror_resume(struct dm_target *ti)
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
+	struct dirty_log *log = ms->rh.log;
+	if (log->type->resume && log->type->resume(log))
+		/* FIXME: need better error handling */
+		DMWARN("log resume failed");
 	rh_start_recovery(&ms->rh);
 }
 