First, some background. A typical snapshot setup uses four devices.

Visible device-mapper devices:
	origin (ORIGIN)
	snapshot (SNAP)

Hidden devices:
	original_device (DEV)
	copy_on_write_device (COW)

The device-mapper snapshot target constructs the virtual devices ORIGIN
and SNAP out of the real devices DEV and COW. SNAP is an image of DEV at
the instant the snapshot was created. When you write to any part of
ORIGIN for the first time, a copy of the data you are about to change is
first stashed away in COW so that SNAP can reconstruct it later.

Now for the first race in the existing implementation. If you attempt to
read data from SNAP that has not yet changed, the request is passed
straight through, mapped to DEV. The snapshot code does not track it
beyond that point, so it has no way of knowing whether the I/O to DEV has
completed, or even whether it has been submitted yet. If a write is
subsequently issued to ORIGIN for the same blocks, the code knows nothing
about the outstanding read: it blithely goes ahead, copies the original
data across to COW, and then lets the write change DEV. It does not wait
for the read to complete first, so if the read from SNAP was delayed for
any reason it could return changed data from DEV! Fortunately this is
very rare in practice, because reading from DEV is simple and quick while
writing to ORIGIN involves a series of slow steps.

This patch begins to address this by adding code to track reads to SNAP.
The pending_exception structure (previously used only for ORIGIN and SNAP
writes) is now also allocated for SNAP reads that get mapped to DEV.

A new pending_exception field, read_count, holds the number of
outstanding reads against SNAP. It is incremented whenever a read uses
the pending_exception structure and decremented when that read completes.
A second new field, write_count, is non-zero if any writes to either the
snapshot or the origin have been issued using that pending_exception
structure.

Device-mapper offers us private per-bio storage in map_context->ptr, so
we store the pending_exception structure there when mapping the bio and
retrieve it in our end_io function.

pending_complete() now checks for outstanding SNAP reads. If it finds
any, it no longer writes to DEV and no longer destroys the pending
exception; instead it sets the new flush_required flag. It still updates
the exception tables, though, so that all further I/O sees the real
exception instead of the pending one.

When the SNAP read completes, snapshot_end_io() is called. Once there are
no more SNAP reads outstanding, it checks whether flush_required was set.
If so, it triggers the deferred flush. If not, it checks whether any
writes (to ORIGIN or SNAP) are outstanding through this pending
exception; if there are none, it can safely destroy it.

The scope of pe_lock is expanded to protect read_count, write_count,
flush_required and the calls to lookup_exception(), insert_exception()
and remove_exception() on the pending-exception table. (The existing
per-snapshot locks permit sleeping, so they are unsuitable without more
reworking than I want to do at present.)

Two illustrative sketches follow, then the patch itself.
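To recap the mechanism in miniature: a read of a SNAP chunk resolves
either to COW (the chunk was preserved before an ORIGIN write) or passes
straight through to DEV, and the pass-through case is the one that races.
Here is a toy model of that lookup. It is an illustration only -
resolve_snap_read() is an invented name, and the flat array stands in for
the hash tables of exceptions that dm-snap really uses:

#include <stdint.h>

#define NCHUNKS 1024

/* For each DEV chunk: 0 if unchanged, else the COW chunk holding the
 * data preserved before the first ORIGIN write. */
static uint32_t cow_chunk_for[NCHUNKS];

enum target { TO_DEV, TO_COW };

/*
 * A read of SNAP chunk c returns preserved data from COW if the origin
 * has been written since the snapshot was taken; otherwise it passes
 * straight through to DEV.
 */
static enum target resolve_snap_read(uint32_t c, uint32_t *out_chunk)
{
	if (cow_chunk_for[c]) {
		*out_chunk = cow_chunk_for[c];
		return TO_COW;
	}
	*out_chunk = c;
	return TO_DEV;
}

int main(void)
{
	uint32_t c;

	cow_chunk_for[7] = 42;	/* chunk 7 was preserved by an ORIGIN write */
	return resolve_snap_read(7, &c) == TO_COW && c == 42 ? 0 : 1;
}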
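To make the intended lifecycle concrete before reading the diff, here is
a minimal userspace model of the accounting described above. It is an
illustration, not kernel code: the names (pe_model, map_read, map_write,
copy_complete, read_complete) are invented, a pthread mutex stands in for
pe_lock, and printf stands in for the real bio plumbing:

#include <pthread.h>
#include <stdio.h>

/* Simplified stand-in for struct pending_exception. */
struct pe_model {
	pthread_mutex_t lock;	/* stands in for the per-snapshot pe_lock */
	int read_count;		/* outstanding SNAP reads mapped to DEV */
	int write_count;	/* writes issued via this pending exception */
	int flush_required;	/* copy finished while reads were in flight */
};

/* A SNAP read that passes through to DEV takes a reference. */
static void map_read(struct pe_model *pe)
{
	pthread_mutex_lock(&pe->lock);
	pe->read_count++;
	pthread_mutex_unlock(&pe->lock);
}

/* An ORIGIN or SNAP write using this pending exception is recorded too. */
static void map_write(struct pe_model *pe)
{
	pthread_mutex_lock(&pe->lock);
	pe->write_count++;
	pthread_mutex_unlock(&pe->lock);
}

/*
 * Models pending_complete(): if reads are still outstanding, defer the
 * flush and keep the structure alive; otherwise flush immediately.
 */
static void copy_complete(struct pe_model *pe)
{
	int defer;

	pthread_mutex_lock(&pe->lock);
	defer = (pe->read_count > 0);
	if (defer)
		pe->flush_required = 1;
	pthread_mutex_unlock(&pe->lock);

	if (!defer)
		printf("flush queued snapshot bios now\n");
}

/*
 * Models snapshot_end_io() for a read: drop the reference, and when the
 * last read drains either trigger the deferred flush or, if no writes
 * ever used this structure, tear it down.
 */
static void read_complete(struct pe_model *pe)
{
	int flush = 0, destroy = 0;

	pthread_mutex_lock(&pe->lock);
	if (!--pe->read_count) {
		if (pe->flush_required)
			flush = destroy = 1;
		else if (!pe->write_count)
			destroy = 1;
	}
	pthread_mutex_unlock(&pe->lock);

	if (flush)
		printf("trigger deferred flush\n");
	if (destroy)
		printf("destroy pending exception\n");
}

int main(void)
{
	struct pe_model pe = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0 };

	map_read(&pe);		/* SNAP read passed through to DEV */
	map_write(&pe);		/* ORIGIN write hits the same chunk */
	copy_complete(&pe);	/* COW copy done: flush must be deferred */
	read_complete(&pe);	/* last read drains: deferred flush fires */
	return 0;
}

In main()'s timeline the COW copy finishes while one SNAP read is still
in flight, so copy_complete() defers the flush exactly as
pending_complete() now does, and the flush fires from the read's
completion path instead.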
Index: linux-2.6.14-rc2/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.14-rc2.orig/drivers/md/dm-snap.c	2006-01-09 15:38:37.000000000 +0000
+++ linux-2.6.14-rc2/drivers/md/dm-snap.c	2006-01-09 15:38:47.000000000 +0000
@@ -67,6 +67,15 @@ struct pending_exception {
 	 * kcopyd.
 	 */
 	int started;
+
+	/* Number of outstanding snapshot reads */
+	int read_count;
+
+	/* Number of outstanding snapshot and origin writes */
+	int write_count;
+
+	/* Are there snapshot bios to flush when read_count becomes zero? */
+	int flush_required;
 };
 
 /*
@@ -640,6 +649,8 @@ static void pending_complete(struct pend
 	struct exception *e;
 	struct dm_snapshot *s = pe->snap;
 	struct bio *flush = NULL;
+	unsigned long flags;
+	int free_pe = 1;	/* Should we free the pending_exception? */
 
 	if (success) {
 		e = alloc_exception();
@@ -668,7 +679,19 @@ static void pending_complete(struct pend
 		/* Submit any pending write bios */
 		up_write(&s->lock);
 
-		flush_bios(bio_list_get(&pe->snapshot_bios));
+		/*
+		 * We must wait for any outstanding snapshot reads to complete
+		 * before we can change the origin.
+		 */
+		spin_lock_irqsave(&s->pe_lock, flags);
+		if (pe->read_count) {
+			pe->flush_required = 1;
+			free_pe = 0;
+		}
+		spin_unlock_irqrestore(&s->pe_lock, flags);
+
+		if (free_pe)
+			flush_bios(bio_list_get(&pe->snapshot_bios));
 	} else {
 		/* Read/write error - snapshot is unusable */
 		down_write(&s->lock);
@@ -686,7 +709,8 @@ static void pending_complete(struct pend
 	}
 
  out:
-	free_pending_exception(pe);
+	if (free_pe)
+		free_pending_exception(pe);
 
 	if (flush)
 		flush_bios(flush);
@@ -754,29 +778,43 @@ __find_pending_exception(struct dm_snaps
 {
 	struct exception *e;
 	struct pending_exception *pe;
+	unsigned long flags;
 	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
 
 	/*
 	 * Is there a pending exception for this already ?
 	 */
+	spin_lock_irqsave(&s->pe_lock, flags);
 	e = lookup_exception(&s->pending, chunk);
 	if (e) {
 		/* cast the exception to a pending exception */
 		pe = container_of(e, struct pending_exception, e);
+		if (bio_rw(bio) == WRITE)
+			pe->write_count++;
+		else
+			pe->read_count++;
+		spin_unlock_irqrestore(&s->pe_lock, flags);
 
 	} else {
 		/*
 		 * Create a new pending exception, we don't want
 		 * to hold the lock while we do this.
 		 */
+		spin_unlock_irqrestore(&s->pe_lock, flags);
 		up_write(&s->lock);
 		pe = alloc_pending_exception();
 		down_write(&s->lock);
+		spin_lock_irqsave(&s->pe_lock, flags);
 
 		e = lookup_exception(&s->pending, chunk);
 		if (e) {
 			free_pending_exception(pe);
 			pe = container_of(e, struct pending_exception, e);
+			if (bio_rw(bio) == WRITE)
+				pe->write_count++;
+			else
+				pe->read_count++;
+			spin_unlock_irqrestore(&s->pe_lock, flags);
 		} else {
 			pe->e.old_chunk = chunk;
 			bio_list_init(&pe->origin_bios);
@@ -784,6 +822,14 @@ __find_pending_exception(struct dm_snaps
 			INIT_LIST_HEAD(&pe->siblings);
 			pe->snap = s;
 			pe->started = 0;
+			pe->read_count = 0;
+			pe->write_count = 0;
+			pe->flush_required = 0;
+
+			if (bio_rw(bio) == WRITE)
+				pe->write_count++;
+			else
+				pe->read_count++;
 
 			if (s->store.prepare_exception(&s->store, &pe->e)) {
 				free_pending_exception(pe);
@@ -792,6 +838,7 @@ __find_pending_exception(struct dm_snaps
 			}
 
 			insert_exception(&s->pending, &pe->e);
+			spin_unlock_irqrestore(&s->pe_lock, flags);
 		}
 	}
 
@@ -841,18 +888,20 @@ static int snapshot_map(struct dm_target
 		goto out_unlock;
 	}
 
-	if (bio_rw(bio) == WRITE) {
-		pe = __find_pending_exception(s, bio);
-		if (!pe) {
-			if (s->store.drop_snapshot)
-				s->store.drop_snapshot(&s->store);
-			s->valid = 0;
-			r = -EIO;
-			goto out_unlock;
-		}
+	pe = __find_pending_exception(s, bio);
+	if (!pe) {
+		if (s->store.drop_snapshot)
+			s->store.drop_snapshot(&s->store);
+		s->valid = 0;
+		r = -EIO;
+		goto out_unlock;
+	}
+	if (bio_rw(bio) == WRITE) {
 		remap_exception(s, &pe->e, bio);
+		spin_lock_irqsave(&s->pe_lock, flags);
 		bio_list_add(&pe->snapshot_bios, bio);
+		spin_unlock_irqrestore(&s->pe_lock, flags);
 
 		r = 0;
 
 		if (!pe->started) {
@@ -863,14 +912,8 @@ static int snapshot_map(struct dm_target
 			goto out;
 		}
 	} else {
-		/*
-		 * FIXME: this read path scares me because we
-		 * always use the origin when we have a pending
-		 * exception. However I can't think of a
-		 * situation where this is wrong - ejt.
-		 */
-
-		bio->bi_bdev = s->origin->bdev;
+		bio->bi_bdev = s->origin->bdev;
+		map_context->ptr = pe;
 	}
 
  out_unlock:
@@ -879,6 +922,40 @@ static int snapshot_map(struct dm_target
 	return r;
 }
 
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
+			   int error, union map_info *map_context)
+{
+	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+	struct pending_exception *pe = (struct pending_exception *)
+						map_context->ptr;
+	unsigned long flags;
+	int free_pe = 0;	/* Should we free the pending_exception? */
+
+	if (bio_rw(bio) == WRITE)
+		return error;
+
+	spin_lock_irqsave(&s->pe_lock, flags);
+	if (!--pe->read_count) {
+		if (pe->flush_required) {
+			bio_list_merge(&s->snapshot_bios, &pe->snapshot_bios);
+
+			queue_work(ksnapd, &process_snapshot_bios);
+
+			free_pe = 1;
+		} else if (!pe->write_count) {
+			/* No conflicting writes so we remove the pe. */
+			remove_exception(&pe->e);
+			free_pe = 1;
+		}
+	}
+	spin_unlock_irqrestore(&s->pe_lock, flags);
+
+	if (free_pe)
+		free_pending_exception(pe);
+
+	return error;
+}
+
 static void snapshot_resume(struct dm_target *ti)
 {
 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
@@ -1146,6 +1223,7 @@ static struct target_type snapshot_targe
 	.ctr    = snapshot_ctr,
 	.dtr    = snapshot_dtr,
 	.map    = snapshot_map,
+	.end_io = snapshot_end_io,
 	.resume = snapshot_resume,
 	.status = snapshot_status,
 };
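A note on dependencies: the deferred-flush path above relies on
infrastructure that is not in this diff - the per-snapshot snapshot_bios
list, the ksnapd workqueue and the process_snapshot_bios work item come
from earlier patches in this series. With the workqueue API of this
kernel generation the wiring would look roughly like the sketch below;
treat it as an assumption, since the handler name flush_queued_bios and
the helper start_ksnapd() are invented here, not taken from the series:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *ksnapd;

/* Work handler: drain the queued snapshot bios and resubmit them. */
static void flush_queued_bios(void *data)
{
	/* Each deferred bio would be resubmitted via generic_make_request(). */
}

static DECLARE_WORK(process_snapshot_bios, flush_queued_bios, NULL);

/* Invented helper: what dm_snapshot_init() would do to start the queue. */
static int start_ksnapd(void)
{
	ksnapd = create_singlethread_workqueue("ksnapd");
	if (!ksnapd)
		return -ENOMEM;

	return 0;
}

A matching destroy_workqueue(ksnapd) belongs in the module exit path.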