1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-23 11:18:54 +00:00

Re-vamp how I/O is handled in volumes and plexes.

Analogous to the drive level, give each volume and plex a worker thread
that picks up and processes incoming and completed BIOs.

This should fix the data corruption issues that came up a few
weeks ago and improve performance, especially of RAID5 plexes.

The volume level needs a little work, though.
This commit is contained in:
Lukas Ertl 2004-09-18 13:44:43 +00:00
parent 54516c29e8
commit 67e3ab6ee5
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=135426
9 changed files with 788 additions and 679 deletions

View File

@ -70,6 +70,7 @@ int gv_is_striped(struct gv_plex *);
int gv_is_open(struct g_geom *);
void gv_kill_drive_thread(struct gv_drive *);
void gv_kill_plex_thread(struct gv_plex *);
void gv_kill_vol_thread(struct gv_volume *);
int gv_object_type(struct gv_softc *, char *);
void gv_parse_config(struct gv_softc *, u_char *, int);
const char *gv_roughlength(off_t, int);

View File

@ -293,7 +293,7 @@ gv_sync_td(void *arg)
* This hack declares this bio as part of an initialization
* process, so that the lower levels allow it to get through.
*/
bp->bio_caller1 = p;
bp->bio_cflags |= GV_BIO_SYNCREQ;
/* Schedule it down ... */
g_io_request(bp, to);

View File

@ -43,6 +43,10 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
static void gv_plex_completed_request(struct gv_plex *, struct bio *);
static void gv_plex_normal_request(struct gv_plex *, struct bio *);
static void gv_plex_worker(void *);
/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
@ -76,48 +80,39 @@ gv_plex_orphan(struct g_consumer *cp)
g_wither_geom(gp, error);
}
static void
void
gv_plex_done(struct bio *bp)
{
struct g_geom *gp;
struct gv_sd *s;
gp = bp->bio_to->geom;
struct gv_plex *p;
struct gv_bioq *bq;
s = bp->bio_caller1;
KASSERT(s != NULL, ("gv_plex_done: NULL s"));
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
if (s->initialized >= s->size) {
gv_set_sd_state(s, GV_SD_UP, 0);
s->initialized = 0;
}
g_std_done(bp);
p = bp->bio_from->geom->softc;
bp->bio_cflags |= GV_BIO_DONE;
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
wakeup(p);
mtx_unlock(&p->bqueue_mtx);
}
/* Find the correct subdisk to send the bio to and build a bio to send. */
static int
gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
caddr_t addr, long bcount, off_t boff)
gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
struct gv_plex *p;
struct gv_sd *s;
struct bio *cbp;
struct bio *cbp, *pbp;
int i, sdno;
off_t len_left, real_len, real_off, stripeend, stripeno, stripestart;
s = NULL;
gp = bp->bio_to->geom;
p = gp->softc;
off_t len_left, real_len, real_off;
off_t stripeend, stripeno, stripestart;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
s = NULL;
gp = bp->bio_to->geom;
/*
* We only handle concatenated and striped plexes here. RAID5 plexes
* are handled in build_raid5_request().
@ -190,10 +185,10 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
break;
case GV_SD_STALE:
if (bp->bio_caller1 != p)
if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
return (ENXIO);
printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name);
printf("GEOM_VINUM: sd %s is initializing\n", s->name);
gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
break;
@ -214,104 +209,366 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
if (bp->bio_caller1 == p) {
cbp->bio_caller1 = s;
cbp->bio_done = g_std_done;
cbp->bio_caller2 = s->consumer;
if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
cbp->bio_cflags |= GV_BIO_SYNCREQ;
cbp->bio_done = gv_plex_done;
} else
cbp->bio_done = g_std_done;
*bp2 = cbp;
*cp = s->consumer;
}
if (bp->bio_driver1 == NULL) {
bp->bio_driver1 = cbp;
} else {
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
pbp->bio_caller1 = cbp;
}
return (0);
}
static void
gv_plex_start(struct bio *bp)
{
struct g_geom *gp;
struct g_consumer *cp;
struct gv_plex *p;
struct gv_raid5_packet *wp;
struct bio *bp2;
caddr_t addr;
off_t boff;
long bcount, rcount;
int err;
gp = bp->bio_to->geom;
p = gp->softc;
/*
* We cannot handle this request if too many of our subdisks are
* inaccessible.
*/
if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) {
g_io_deliver(bp, ENXIO); /* XXX: correct way? */
return;
}
struct gv_bioq *bq;
switch(bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
/*
* We split up the request in smaller packets and hand them
* down to our subdisks.
*/
wp = NULL;
addr = bp->bio_data;
boff = bp->bio_offset;
for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
/*
* RAID5 requests usually need to be split up in
* several subrequests.
*/
if (p->org == GV_PLEX_RAID5) {
wp = gv_new_raid5_packet();
wp->bio = bp;
err = gv_build_raid5_req(wp, bp, addr, bcount,
boff);
} else
err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount,
boff);
if (err) {
if (p->org == GV_PLEX_RAID5)
gv_free_raid5_packet(wp);
bp->bio_completed += bcount;
if (bp->bio_error == 0)
bp->bio_error = err;
if (bp->bio_completed == bp->bio_length)
g_io_deliver(bp, bp->bio_error);
return;
}
if (p->org != GV_PLEX_RAID5) {
rcount = bp2->bio_length;
g_io_request(bp2, cp);
/*
* RAID5 subrequests are queued on a worklist
* and picked up from the worker thread. This
* ensures correct order.
*/
} else {
mtx_lock(&p->worklist_mtx);
TAILQ_INSERT_TAIL(&p->worklist, wp,
list);
mtx_unlock(&p->worklist_mtx);
wakeup(&p);
rcount = wp->length;
}
boff += rcount;
addr += rcount;
}
return;
break;
case BIO_GETATTR:
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
/*
* We cannot handle this request if too many of our subdisks are
* inaccessible.
*/
p = bp->bio_to->geom->softc;
if ((p->state < GV_PLEX_DEGRADED) &&
!(bp->bio_cflags & GV_BIO_SYNCREQ)) {
g_io_deliver(bp, ENXIO);
return;
}
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
wakeup(p);
mtx_unlock(&p->bqueue_mtx);
}
static void
gv_plex_worker(void *arg)
{
struct bio *bp;
struct gv_plex *p;
struct gv_sd *s;
struct gv_bioq *bq;
p = arg;
KASSERT(p != NULL, ("NULL p"));
mtx_lock(&p->bqueue_mtx);
for (;;) {
/* We were signaled to exit. */
if (p->flags & GV_PLEX_THREAD_DIE)
break;
/* Take the first BIO from our queue. */
bq = TAILQ_FIRST(&p->bqueue);
if (bq == NULL) {
msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
continue;
}
TAILQ_REMOVE(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
bp = bq->bp;
/* A completed request. */
if (bp->bio_cflags & GV_BIO_DONE) {
g_free(bq);
if (bp->bio_cflags & GV_BIO_SYNCREQ) {
s = bp->bio_to->private;
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
if (s->initialized >= s->size) {
g_topology_lock();
gv_set_sd_state(s, GV_SD_UP,
GV_SETSTATE_CONFIG);
g_topology_unlock();
s->initialized = 0;
}
g_std_done(bp);
} else
gv_plex_completed_request(p, bp);
/*
 * A sub-request that was held back because it interfered with
 * another sub-request.
 */
} else if (bp->bio_cflags & GV_BIO_ONHOLD) {
/* Is it still locked out? */
if (gv_stripe_active(p, bp)) {
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
} else {
g_free(bq);
bp->bio_cflags &= ~GV_BIO_ONHOLD;
g_io_request(bp, bp->bio_caller2);
}
/* A normal request to this plex. */
} else {
g_free(bq);
gv_plex_normal_request(p, bp);
}
mtx_lock(&p->bqueue_mtx);
}
mtx_unlock(&p->bqueue_mtx);
p->flags |= GV_PLEX_THREAD_DEAD;
wakeup(p);
kthread_exit(ENXIO);
}
void
gv_plex_completed_request(struct gv_plex *p, struct bio *bp)
{
struct bio *cbp, *pbp;
struct gv_bioq *bq, *bq2;
struct gv_raid5_packet *wp;
int i;
wp = bp->bio_driver1;
switch (bp->bio_parent->bio_cmd) {
case BIO_READ:
if (wp == NULL)
break;
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
if (bq->bp == bp) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
for (i = 0; i < wp->length; i++)
wp->data[i] ^= bp->bio_data[i];
break;
}
}
if (TAILQ_EMPTY(&wp->bits)) {
bp->bio_parent->bio_completed += wp->length;
if (wp->lockbase != -1)
TAILQ_REMOVE(&p->packets, wp, list);
g_free(wp);
}
break;
case BIO_WRITE:
if (wp == NULL)
break;
/* Check if we need to handle parity data. */
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
if (bq->bp == bp) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
cbp = wp->parity;
if (cbp != NULL) {
for (i = 0; i < wp->length; i++)
cbp->bio_data[i] ^=
bp->bio_data[i];
}
break;
}
}
/* Handle parity data. */
if (TAILQ_EMPTY(&wp->bits)) {
if (wp->waiting != NULL) {
pbp = wp->waiting;
wp->waiting = NULL;
cbp = wp->parity;
for (i = 0; i < wp->length; i++)
cbp->bio_data[i] ^= pbp->bio_data[i];
g_io_request(pbp, pbp->bio_caller2);
} else if (wp->parity != NULL) {
cbp = wp->parity;
wp->parity = NULL;
g_io_request(cbp, cbp->bio_caller2);
} else {
bp->bio_parent->bio_completed += wp->length;
TAILQ_REMOVE(&p->packets, wp, list);
g_free(wp);
}
}
break;
}
pbp = bp->bio_parent;
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
/* When the original request is finished, we deliver it. */
pbp->bio_inbed++;
if (pbp->bio_inbed == pbp->bio_children)
g_io_deliver(pbp, pbp->bio_error);
/* Clean up what we allocated. */
if (bp->bio_cflags & GV_BIO_MALLOC)
g_free(bp->bio_data);
g_destroy_bio(bp);
}
void
gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
{
struct bio *cbp, *pbp;
struct gv_bioq *bq, *bq2;
struct gv_raid5_packet *wp, *wp2;
caddr_t addr;
off_t bcount, boff;
int err;
bcount = bp->bio_length;
addr = bp->bio_data;
boff = bp->bio_offset;
/* Walk over the whole length of the request, we might split it up. */
while (bcount > 0) {
wp = NULL;
/*
* RAID5 plexes need special treatment, as a single write
* request involves several read/write sub-requests.
*/
if (p->org == GV_PLEX_RAID5) {
wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
wp->bio = bp;
TAILQ_INIT(&wp->bits);
err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount);
/*
* Building the sub-request failed, we probably need to
* clean up a lot.
*/
if (err) {
printf("GEOM_VINUM: plex request failed for ");
g_print_bio(bp);
printf("\n");
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
}
if (wp->waiting != NULL) {
if (wp->waiting->bio_cflags &
GV_BIO_MALLOC)
g_free(wp->waiting->bio_data);
g_destroy_bio(wp->waiting);
}
if (wp->parity != NULL) {
if (wp->parity->bio_cflags &
GV_BIO_MALLOC)
g_free(wp->parity->bio_data);
g_destroy_bio(wp->parity);
}
g_free(wp);
TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
if (wp->bio == bp) {
TAILQ_REMOVE(&p->packets, wp,
list);
TAILQ_FOREACH_SAFE(bq,
&wp->bits, queue, bq2) {
TAILQ_REMOVE(&wp->bits,
bq, queue);
g_free(bq);
}
g_free(wp);
}
}
cbp = bp->bio_driver1;
while (cbp != NULL) {
pbp = cbp->bio_caller1;
if (cbp->bio_cflags & GV_BIO_MALLOC)
g_free(cbp->bio_data);
g_destroy_bio(cbp);
cbp = pbp;
}
g_io_deliver(bp, err);
return;
}
if (TAILQ_EMPTY(&wp->bits))
g_free(wp);
else if (wp->lockbase != -1)
TAILQ_INSERT_TAIL(&p->packets, wp, list);
/*
* Requests to concatenated and striped plexes go straight
* through.
*/
} else {
err = gv_plexbuffer(p, bp, addr, boff, bcount);
/* Building the sub-request failed. */
if (err) {
printf("GEOM_VINUM: plex request failed for ");
g_print_bio(bp);
printf("\n");
cbp = bp->bio_driver1;
while (cbp != NULL) {
pbp = cbp->bio_caller1;
g_destroy_bio(cbp);
cbp = pbp;
}
g_io_deliver(bp, err);
return;
}
}
/* Abuse bio_caller1 as linked list. */
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
bcount -= pbp->bio_length;
addr += pbp->bio_length;
boff += pbp->bio_length;
}
/* Fire off all sub-requests. */
pbp = bp->bio_driver1;
while (pbp != NULL) {
/*
* RAID5 sub-requests need to come in correct order, otherwise
* we trip over the parity, as it might be overwritten by
* another sub-request.
*/
if (pbp->bio_driver1 != NULL &&
gv_stripe_active(p, pbp)) {
pbp->bio_cflags |= GV_BIO_ONHOLD;
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = pbp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
} else
g_io_request(pbp, pbp->bio_caller2);
pbp = pbp->bio_caller1;
}
}
static int
@ -425,16 +682,12 @@ gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
gp->softc = p;
p->geom = gp;
/* RAID5 plexes need a 'worker' thread, where IO is handled. */
if (p->org == GV_PLEX_RAID5) {
TAILQ_INIT(&p->worklist);
mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL,
MTX_DEF);
p->flags &= ~GV_PLEX_THREAD_DIE;
kthread_create(gv_raid5_worker, gp, NULL, 0, 0,
"gv_raid5");
p->flags |= GV_PLEX_THREAD_ACTIVE;
}
TAILQ_INIT(&p->packets);
TAILQ_INIT(&p->bqueue);
mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
p->name);
p->flags |= GV_PLEX_THREAD_ACTIVE;
/* Attach a consumer to this provider. */
cp = g_new_consumer(gp);

View File

@ -44,243 +44,62 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
int gv_raid5_parity(struct gv_raid5_packet *);
int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
struct gv_raid5_bit *
gv_new_raid5_bit(void)
{
struct gv_raid5_bit *r;
r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
return (r);
}
struct gv_raid5_packet *
gv_new_raid5_packet(void)
{
struct gv_raid5_packet *wp;
wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
wp->state = SETUP;
wp->type = JUNK;
TAILQ_INIT(&wp->bits);
return (wp);
}
void
gv_free_raid5_packet(struct gv_raid5_packet *wp)
{
struct gv_raid5_bit *r, *r2;
/* Remove all the bits from this work packet. */
TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) {
TAILQ_REMOVE(&wp->bits, r, list);
if (r->malloc)
g_free(r->buf);
if (r->bio != NULL)
g_destroy_bio(r->bio);
g_free(r);
}
if (wp->bufmalloc == 1)
g_free(wp->buf);
g_free(wp);
}
/*
* Check if the stripe that the work packet wants is already being used by
* some other work packet.
*/
int
gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
struct gv_raid5_packet *wpa;
struct gv_raid5_packet *wp, *owp;
int overlap;
TAILQ_FOREACH(wpa, &sc->worklist, list) {
if (wpa->lockbase == wp->lockbase) {
if (wpa == wp)
return (0);
return (1);
wp = bp->bio_driver1;
if (wp->lockbase == -1)
return (0);
overlap = 0;
TAILQ_FOREACH(owp, &p->packets, list) {
if (owp == wp)
break;
if ((wp->lockbase >= owp->lockbase) &&
(wp->lockbase <= owp->lockbase + owp->length)) {
overlap++;
break;
}
if ((wp->lockbase <= owp->lockbase) &&
(wp->lockbase + wp->length >= owp->lockbase)) {
overlap++;
break;
}
}
return (0);
}
/*
* The "worker" thread that runs through the worklist and fires off the
* "subrequests" needed to fulfill a RAID5 read or write request.
*/
void
gv_raid5_worker(void *arg)
{
struct bio *bp;
struct g_geom *gp;
struct gv_plex *p;
struct gv_raid5_packet *wp, *wpt;
struct gv_raid5_bit *rbp, *rbpt;
int error, restart;
gp = arg;
p = gp->softc;
mtx_lock(&p->worklist_mtx);
for (;;) {
restart = 0;
TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
/* This request packet is already being processed. */
if (wp->state == IO)
continue;
/* This request packet is ready for processing. */
if (wp->state == VALID) {
/* Couldn't get the lock, try again. */
if ((wp->lockbase != -1) &&
gv_stripe_active(wp, p))
continue;
wp->state = IO;
mtx_unlock(&p->worklist_mtx);
TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
g_io_request(rbp->bio, rbp->consumer);
mtx_lock(&p->worklist_mtx);
continue;
}
if (wp->state == FINISH) {
bp = wp->bio;
bp->bio_completed += wp->length;
/*
* Deliver the original request if we have
* finished.
*/
if (bp->bio_completed == bp->bio_length) {
mtx_unlock(&p->worklist_mtx);
g_io_deliver(bp, 0);
mtx_lock(&p->worklist_mtx);
}
TAILQ_REMOVE(&p->worklist, wp, list);
gv_free_raid5_packet(wp);
restart++;
/*break;*/
}
}
if (!restart) {
/* Self-destruct. */
if (p->flags & GV_PLEX_THREAD_DIE)
break;
error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
hz/100);
}
}
mtx_unlock(&p->worklist_mtx);
g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");
/* Signal our plex that we are dead. */
p->flags |= GV_PLEX_THREAD_DEAD;
wakeup(p);
kthread_exit(0);
}
/* Final bio transaction to write out the parity data. */
int
gv_raid5_parity(struct gv_raid5_packet *wp)
{
struct bio *bp;
bp = g_new_bio();
if (bp == NULL)
return (ENOMEM);
wp->type = ISPARITY;
bp->bio_cmd = BIO_WRITE;
bp->bio_data = wp->buf;
bp->bio_offset = wp->offset;
bp->bio_length = wp->length;
bp->bio_done = gv_raid5_done;
bp->bio_caller1 = wp;
bp->bio_caller2 = NULL;
g_io_request(bp, wp->parity);
return (0);
}
/* We end up here after each subrequest. */
void
gv_raid5_done(struct bio *bp)
{
struct bio *obp;
struct g_geom *gp;
struct gv_plex *p;
struct gv_raid5_packet *wp;
struct gv_raid5_bit *rbp;
off_t i;
int error;
wp = bp->bio_caller1;
rbp = bp->bio_caller2;
obp = wp->bio;
gp = bp->bio_from->geom;
p = gp->softc;
/* One less active subrequest. */
wp->active--;
switch (obp->bio_cmd) {
case BIO_READ:
/* Degraded reads need to handle parity data. */
if (wp->type == DEGRADED) {
for (i = 0; i < wp->length; i++)
wp->buf[i] ^= bp->bio_data[i];
/* When we're finished copy back the data we want. */
if (wp->active == 0)
bcopy(wp->buf, wp->data, wp->length);
}
break;
case BIO_WRITE:
/* Handle the parity data, if needed. */
if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
for (i = 0; i < wp->length; i++)
wp->buf[i] ^= bp->bio_data[i];
/* Write out the parity data we calculated. */
if (wp->active == 0) {
wp->active++;
error = gv_raid5_parity(wp);
}
}
break;
}
/* This request group is done. */
if (wp->active == 0)
wp->state = FINISH;
return (overlap);
}
/* Build a request group to perform (part of) a RAID5 request. */
int
gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
long bcount, off_t boff)
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
struct gv_plex *p;
struct gv_raid5_bit *rbp;
struct gv_sd *broken, *original, *parity, *s;
int i, psdno, sdno;
off_t len_left, real_off, stripeend, stripeoff, stripestart;
struct gv_bioq *bq;
struct bio *cbp, *pbp;
int i, psdno, sdno, type;
off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;
gp = bp->bio_to->geom;
p = gp->softc;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
/* We are optimistic and assume that this request will be OK. */
wp->type = NORMAL;
#define REQ_TYPE_NORMAL 0
#define REQ_TYPE_DEGRADED 1
#define REQ_TYPE_NOPARITY 2
type = REQ_TYPE_NORMAL;
original = parity = broken = NULL;
/* The number of the subdisk containing the parity stripe. */
@ -330,29 +149,20 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
/* Our data stripe is missing. */
if (original->state != GV_SD_UP)
wp->type = DEGRADED;
type = REQ_TYPE_DEGRADED;
/* Our parity stripe is missing. */
if (parity->state != GV_SD_UP) {
/* We cannot take another failure if we're already degraded. */
if (wp->type != NORMAL)
if (type != REQ_TYPE_NORMAL)
return (ENXIO);
else
wp->type = NOPARITY;
type = REQ_TYPE_NOPARITY;
}
/*
* A combined write is necessary when the original data subdisk and the
* parity subdisk are both up, but one of the other subdisks isn't.
*/
if ((broken != NULL) && (broken != parity) && (broken != original))
wp->type = COMBINED;
wp->offset = real_off;
wp->length = (bcount <= len_left) ? bcount : len_left;
real_len = (bcount <= len_left) ? bcount : len_left;
wp->length = real_len;
wp->data = addr;
wp->original = original->consumer;
wp->parity = parity->consumer;
wp->lockbase = stripestart;
wp->lockbase = real_off;
KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
@ -363,58 +173,45 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* the broken one plus the parity stripe and then recalculate
* the desired data.
*/
if (wp->type == DEGRADED) {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
if (type == REQ_TYPE_DEGRADED) {
bzero(wp->data, wp->length);
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken subdisk. */
if (s == broken)
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_cmd = BIO_READ;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = s->consumer;
cbp->bio_driver1 = wp;
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
/* A normal read can be fulfilled with the original subdisk. */
} else {
rbp = gv_new_raid5_bit();
rbp->consumer = wp->original;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_READ;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->buf = addr;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
cbp->bio_done = g_std_done;
cbp->bio_caller2 = original->consumer;
GV_ENQUEUE(bp, cbp, pbp);
}
if (wp->type != COMBINED)
wp->lockbase = -1;
wp->lockbase = -1;
break;
case BIO_WRITE:
@ -424,164 +221,65 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* recalculate the parity from the original data, and then
* write the parity stripe back out.
*/
if (wp->type == DEGRADED) {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
/* Copy the original data. */
bcopy(wp->data, wp->buf, wp->length);
if (type == REQ_TYPE_DEGRADED) {
/* Read all subdisks. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken and the parity subdisk. */
if ((s == broken) ||
(s->consumer == wp->parity))
if ((s == broken) || (s == parity))
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_cmd = BIO_READ;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_cmd = BIO_READ;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = s->consumer;
cbp->bio_driver1 = wp;
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
/* Write the parity data. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
bcopy(addr, cbp->bio_data, real_len);
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = parity->consumer;
cbp->bio_driver1 = wp;
wp->parity = cbp;
/*
* When we don't have the parity stripe we just write out the
* data.
* When the parity stripe is missing we just write out the data.
*/
} else if (wp->type == NOPARITY) {
rbp = gv_new_raid5_bit();
rbp->consumer = wp->original;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
} else if (type == REQ_TYPE_NOPARITY) {
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_WRITE;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_data = addr;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = original->consumer;
cbp->bio_driver1 = wp;
/*
* A combined write means that our data subdisk and the parity
* subdisks are both up, but another subdisk isn't. We need to
* read all valid stripes including the parity to recalculate
* the data of the stripe that is missing. Then we write our
* original data, and together with the other data stripes
* recalculate the parity again.
*/
} else if (wp->type == COMBINED) {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
GV_ENQUEUE(bp, cbp, pbp);
/* Get the data from all subdisks. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken subdisk. */
if (s == broken)
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_READ;
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
}
/* Write the original data. */
rbp = gv_new_raid5_bit();
rbp->consumer = wp->original;
rbp->buf = addr;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_WRITE;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
/*
* Insert at the tail, because we want to read the old
* data first.
*/
TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
/* Get the rest of the data again. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/*
* Skip the broken subdisk, the parity, and the
* one we just wrote.
*/
if ((s == broken) ||
(s->consumer == wp->parity) ||
(s->consumer == wp->original))
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_READ;
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
/*
* Again, insert at the tail to keep correct
* order.
*/
TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
}
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/*
* A normal write request goes to the original subdisk, then we
@ -589,52 +287,83 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* out the parity again.
*/
} else {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
/* Read old parity. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the parity stripe. */
if (s->consumer == wp->parity)
continue;
cbp->bio_cmd = BIO_READ;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = parity->consumer;
cbp->bio_driver1 = wp;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
/*
* The data for the original stripe is written,
* the others need to be read in for the parity
* calculation.
*/
if (s->consumer == wp->original) {
rbp->bio->bio_cmd = BIO_WRITE;
rbp->buf = addr;
} else {
rbp->bio->bio_cmd = BIO_READ;
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
}
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
}
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/* Read old data. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_cmd = BIO_READ;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = original->consumer;
cbp->bio_driver1 = wp;
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/* Write new data. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_data = addr;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = original->consumer;
cbp->bio_driver1 = wp;
/*
* We must not write the new data until the old data
* was read, so hold this BIO back until we're ready
* for it.
*/
wp->waiting = cbp;
/* The final bio for the parity. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = parity->consumer;
cbp->bio_driver1 = wp;
/* Remember that this is the BIO for the parity data. */
wp->parity = cbp;
}
break;
default:
return (EINVAL);
}
wp->state = VALID;
return (0);
}

View File

@ -32,22 +32,23 @@
/*
* A single RAID5 request usually needs more than one I/O transaction,
* depending on the state of the associated subdisks and the direction of the
* transaction (read or write). Every subrequest of a RAID5 request,
* represented by a gv_raid_packet, is defined by a gv_raid5_bit.
* transaction (read or write).
*/
/* A subrequest of a RAID5 read/write operation. */
struct gv_raid5_bit {
struct bio *bio; /* BIO of this subrequest. */
caddr_t buf; /* Data buffer of this subrequest. */
int malloc; /* Flag if data buffer was malloced. */
struct g_consumer *consumer; /* Consumer to send the BIO to. */
TAILQ_ENTRY(gv_raid5_bit) list; /* Entry in the list of this request. */
};
#define GV_ENQUEUE(bp, cbp, pbp) \
do { \
if (bp->bio_driver1 == NULL) { \
bp->bio_driver1 = cbp; \
} else { \
pbp = bp->bio_driver1; \
while (pbp->bio_caller1 != NULL) \
pbp = pbp->bio_caller1; \
pbp->bio_caller1 = cbp; \
} \
} while (0);
/* Container for one or more gv_raid5_bits; represents a RAID5 I/O request. */
struct gv_raid5_packet {
caddr_t buf; /* Data buffer of this RAID5 request. */
caddr_t data; /* Data buffer of this sub-request. */
off_t length; /* Size of data buffer. */
off_t lockbase; /* Deny access to our plex offset. */
off_t offset; /* The drive offset of the subdisk. */
@ -56,39 +57,17 @@ struct gv_raid5_packet {
int rqcount; /* Count of subrequests. */
struct bio *bio; /* Pointer to the original bio. */
caddr_t data; /* Pointer to the original data. */
struct bio *parity; /* The bio containing the parity data. */
struct bio *waiting; /* A bio that need to wait for other bios. */
struct g_consumer *original; /* Consumer to the data stripe. */
struct g_consumer *parity; /* Consumer to the parity stripe. */
/* State of this RAID5 packet. */
enum {
SETUP, /* Newly created. */
VALID, /* Ready for processing. */
IO, /* Currently doing I/O. */
FINISH /* Packet has finished. */
} state;
/* Type of this RAID5 transaction. */
enum {
JUNK, /* Newly created, not valid. */
NORMAL, /* Normal read or write. */
ISPARITY, /* Containing only parity data. */
NOPARITY, /* Parity stripe not available. */
DEGRADED, /* Data stripe not available. */
COMBINED /* Data and parity stripes ok, others not. */
} type;
TAILQ_HEAD(,gv_raid5_bit) bits; /* List of subrequests. */
TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */
TAILQ_HEAD(,gv_bioq) bits; /* List of subrequests. */
TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */
};
int gv_build_raid5_req(struct gv_raid5_packet *, struct bio *, caddr_t,
long, off_t);
void gv_free_raid5_packet(struct gv_raid5_packet *);
void gv_raid5_done(struct bio *);
int gv_stripe_active(struct gv_plex *, struct bio *);
int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *,
struct bio *, caddr_t, off_t, off_t);
void gv_raid5_worker(void *);
struct gv_raid5_packet *gv_new_raid5_packet(void);
struct gv_raid5_bit *gv_new_raid5_bit(void);
void gv_plex_done(struct bio *);
#endif /* !_GEOM_VINUM_RAID5_H_ */

View File

@ -166,6 +166,7 @@ gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int fl
/* Clean up and let our geom fade away. */
LIST_REMOVE(v, volume);
gv_kill_vol_thread(v);
g_free(v);
if (gp != NULL) {
gp->softc = NULL;

View File

@ -832,12 +832,25 @@ gv_kill_drive_thread(struct gv_drive *d)
/*
 * Stop a plex's worker thread and tear down its queue mutex(es).
 *
 * NOTE(review): this span interleaves the pre-change and post-change
 * versions of the function (diff-render artifact): both the old
 * RAID5-only guard and the new unconditional guard appear below, and
 * both the old worklist_mtx and the new bqueue_mtx are destroyed.  The
 * braces do not balance as rendered here; consult the real source file.
 */
void
gv_kill_plex_thread(struct gv_plex *p)
{
if ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_THREAD_ACTIVE)) {
if (p->flags & GV_PLEX_THREAD_ACTIVE) {
/* Ask the worker thread to exit ... */
p->flags |= GV_PLEX_THREAD_DIE;
wakeup(p);
/* ... and wait until it acknowledges with THREAD_DEAD. */
while (!(p->flags & GV_PLEX_THREAD_DEAD))
tsleep(p, PRIBIO, "gv_die", hz);
p->flags &= ~GV_PLEX_THREAD_ACTIVE;
/* Only safe once the thread can no longer touch these mutexes. */
mtx_destroy(&p->worklist_mtx);
mtx_destroy(&p->bqueue_mtx);
}
}
/*
 * Stop a volume's worker thread: signal it to exit, wait until it has
 * acknowledged, then destroy the (now unused) BIO queue mutex.
 */
void
gv_kill_vol_thread(struct gv_volume *v)
{
	/* Nothing to do if no worker thread was ever started. */
	if (!(v->flags & GV_VOL_THREAD_ACTIVE))
		return;

	/* Tell the worker to die and wake it up in case it sleeps. */
	v->flags |= GV_VOL_THREAD_DIE;
	wakeup(v);

	/* Wait for the thread to confirm its death. */
	while (!(v->flags & GV_VOL_THREAD_DEAD))
		tsleep(v, PRIBIO, "gv_die", hz);

	v->flags &= ~GV_VOL_THREAD_ACTIVE;
	/* Safe now: the dead thread can no longer touch the mutex. */
	mtx_destroy(&v->bqueue_mtx);
}

View File

@ -111,6 +111,8 @@
#define GV_BIO_DONE 0x01
#define GV_BIO_MALLOC 0x02
#define GV_BIO_ONHOLD 0x04
#define GV_BIO_SYNCREQ 0x08
#define GV_BIO_SUCCEED 0x10
/*
* hostname is 256 bytes long, but we don't need to shlep multiple copies in
@ -269,8 +271,9 @@ struct gv_plex {
off_t synced; /* Count of synced bytes. */
struct mtx worklist_mtx; /* Mutex for RAID5 worklist. */
TAILQ_HEAD(,gv_raid5_packet) worklist; /* List of RAID5 work packets. */
struct mtx bqueue_mtx; /* Lock for the BIO queue. */
TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */
TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */
LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */
LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */
@ -292,6 +295,14 @@ struct gv_volume {
#define GV_VOL_DOWN 0
#define GV_VOL_UP 1
int flags;
#define GV_VOL_THREAD_ACTIVE 0x01 /* Volume has an active thread. */
#define GV_VOL_THREAD_DIE 0x02 /* Signal the thread to die. */
#define GV_VOL_THREAD_DEAD 0x04 /* The thread has died. */
struct mtx bqueue_mtx; /* Lock for the BIO queue. */
TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */
LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */
LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */

View File

@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@ -42,6 +43,9 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
static void gv_vol_completed_request(struct gv_volume *, struct bio *);
static void gv_vol_normal_request(struct gv_volume *, struct bio *);
static void
gv_volume_orphan(struct g_consumer *cp)
{
@ -62,8 +66,10 @@ gv_volume_orphan(struct g_consumer *cp)
if (!LIST_EMPTY(&gp->consumer))
return;
v = gp->softc;
if (v != NULL)
if (v != NULL) {
gv_kill_vol_thread(v);
v->geom = NULL;
}
gp->softc = NULL;
g_wither_geom(gp, error);
}
@ -72,79 +78,186 @@ gv_volume_orphan(struct g_consumer *cp)
/*
 * Completion callback for BIOs the volume layer sent down to a plex.
 *
 * NOTE(review): this span interleaves the pre-change and post-change
 * versions of the function (diff-render artifact).  The switch below is
 * the old "retry/chain to the next plex from the completion path"
 * logic, every reachable arm of which returns; the code after the
 * switch is the new logic that merely re-queues the completed BIO on
 * the volume's worker thread, and is unreachable as rendered here.
 */
static void
gv_volume_done(struct bio *bp)
{
struct g_consumer *cp;
/* The next plex in this volume. */
cp = LIST_NEXT(bp->bio_from, consumer);
struct gv_volume *v;
struct gv_bioq *bq;
switch (bp->bio_cmd) {
case BIO_READ:
/*
* If no error occurred on this request, or if we have no plex
* left, finish here...
*/
if ((bp->bio_error == 0) || (cp == NULL)) {
g_std_done(bp);
return;
}
/* ... or try to read from the next plex. */
g_io_request(bp, cp);
return;
case BIO_WRITE:
case BIO_DELETE:
/* No more plexes left. */
if (cp == NULL) {
/*
* Clear any errors if one of the previous writes
* succeeded.
*/
if (bp->bio_caller1 == (int *)1)
bp->bio_error = 0;
g_std_done(bp);
return;
}
/* If this write request had no errors, remember that fact... */
if (bp->bio_error == 0)
bp->bio_caller1 = (int *)1;
/* ... and write to the next plex. */
g_io_request(bp, cp);
return;
}
/*
 * New-style completion: mark the BIO done and hand it to the volume's
 * worker thread, which finishes it in gv_vol_completed_request().
 * NOTE(review): g_malloc(..., M_NOWAIT) may return NULL; bq is
 * dereferenced without a check — confirm against the real source.
 */
v = bp->bio_from->geom->softc;
bp->bio_cflags |= GV_BIO_DONE;
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&v->bqueue_mtx);
TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
wakeup(v);
mtx_unlock(&v->bqueue_mtx);
}
/*
 * GEOM start routine for a vinum volume.
 *
 * NOTE(review): this span interleaves the pre-change and post-change
 * versions of the function (diff-render artifact).  The switch below is
 * the old direct-dispatch logic, every reachable arm of which returns;
 * the code after the switch is the new logic that queues the BIO for
 * the volume's worker thread, and is unreachable as rendered here.
 */
static void
gv_volume_start(struct bio *bp)
{
struct g_geom *gp;
struct bio *bp2;
struct gv_volume *v;
struct gv_bioq *bq;
gp = bp->bio_to->geom;
v = gp->softc;
/* Refuse I/O while the volume is not up. */
if (v->state != GV_VOL_UP) {
g_io_deliver(bp, ENXIO);
return;
}
switch(bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
/* Old path: clone and send to the first consumer directly. */
bp2 = g_clone_bio(bp);
if (bp2 == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
bp2->bio_done = gv_volume_done;
g_io_request(bp2, LIST_FIRST(&gp->consumer));
return;
break;
case BIO_GETATTR:
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
/* New path: hand the BIO to the volume's worker thread. */
v = bp->bio_to->geom->softc;
if (v->state != GV_VOL_UP) {
g_io_deliver(bp, ENXIO);
return;
}
/* NOTE(review): M_NOWAIT allocation is not checked for NULL here. */
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&v->bqueue_mtx);
TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
wakeup(v);
mtx_unlock(&v->bqueue_mtx);
}
/*
 * Per-volume worker thread: consumes BIOs from the volume's bqueue and
 * dispatches fresh requests (gv_vol_normal_request) or finishes
 * completed ones (gv_vol_completed_request).  Started when the volume
 * geom is created and stopped via gv_kill_vol_thread().
 */
static void
gv_vol_worker(void *arg)
{
struct bio *bp;
struct gv_volume *v;
struct gv_bioq *bq;
v = arg;
KASSERT(v != NULL, ("NULL v"));
/* The queue mutex is held at the top of every loop iteration. */
mtx_lock(&v->bqueue_mtx);
for (;;) {
/* We were signaled to exit. */
if (v->flags & GV_VOL_THREAD_DIE)
break;
/* Take the first BIO from our queue. */
bq = TAILQ_FIRST(&v->bqueue);
if (bq == NULL) {
/* Queue empty: sleep until woken, or re-poll after hz/10. */
msleep(v, &v->bqueue_mtx, PRIBIO, "-", hz/10);
continue;
}
TAILQ_REMOVE(&v->bqueue, bq, queue);
/* Drop the lock while processing; the queue entry is ours now. */
mtx_unlock(&v->bqueue_mtx);
bp = bq->bp;
g_free(bq);
/* GV_BIO_DONE marks a completed sub-request (set in gv_volume_done). */
if (bp->bio_cflags & GV_BIO_DONE)
gv_vol_completed_request(v, bp);
else
gv_vol_normal_request(v, bp);
mtx_lock(&v->bqueue_mtx);
}
mtx_unlock(&v->bqueue_mtx);
/*
 * Acknowledge the DIE request so gv_kill_vol_thread() can return.
 * NOTE(review): BIOs still sitting on the queue at this point are
 * neither completed nor freed — confirm this is intended.
 */
v->flags |= GV_VOL_THREAD_DEAD;
wakeup(v);
kthread_exit(ENXIO);
}
/*
 * Handle a sub-request that completed down at the plex level.  A failed
 * read is retried by re-queueing the parent BIO as a fresh request on
 * the volume's worker thread; otherwise the parent is delivered once
 * all of its children have come home.
 */
static void
gv_vol_completed_request(struct gv_volume *v, struct bio *bp)
{
struct bio *pbp;
struct gv_bioq *bq;
pbp = bp->bio_parent;
/* Keep the first error encountered among the children. */
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
switch (pbp->bio_cmd) {
case BIO_READ:
if (bp->bio_error) {
/*
 * Retry: drop this failed child and re-queue the parent as a
 * fresh (not GV_BIO_DONE) request.
 * NOTE(review): gv_vol_normal_request() always picks the first
 * usable plex, so this may retry the very plex that just
 * failed — confirm against the intended retry design.
 */
g_destroy_bio(bp);
pbp->bio_children--;
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = pbp;
mtx_lock(&v->bqueue_mtx);
TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
mtx_unlock(&v->bqueue_mtx);
return;
}
break;
case BIO_WRITE:
case BIO_DELETE:
break;
}
/* When the original request is finished, we deliver it. */
pbp->bio_inbed++;
if (pbp->bio_inbed == pbp->bio_children) {
/*
 * NOTE(review): bio_completed is taken from the last child's
 * length only, not accumulated — verify this is intended.
 */
pbp->bio_completed = bp->bio_length;
g_io_deliver(pbp, pbp->bio_error);
}
g_destroy_bio(bp);
}
static void
gv_vol_normal_request(struct gv_volume *v, struct bio *bp)
{
struct g_geom *gp;
struct gv_plex *p;
struct bio *cbp, *pbp;
gp = v->geom;
switch (bp->bio_cmd) {
case BIO_READ:
cbp = g_clone_bio(bp);
if (cbp == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
cbp->bio_done = gv_volume_done;
LIST_FOREACH(p, &v->plexes, in_volume) {
if (p->state >= GV_PLEX_DEGRADED)
break;
}
g_io_request(cbp, p->consumer);
break;
case BIO_WRITE:
case BIO_DELETE:
LIST_FOREACH(p, &v->plexes, in_volume) {
if (p->state < GV_PLEX_DEGRADED)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL) /* XXX */
g_io_deliver(bp, ENOMEM);
cbp->bio_done = gv_volume_done;
cbp->bio_caller2 = p->consumer;
if (bp->bio_driver1 == NULL) {
bp->bio_driver1 = cbp;
} else {
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
pbp->bio_caller1 = cbp;
}
}
/* Fire off all sub-requests. */
pbp = bp->bio_driver1;
while (pbp != NULL) {
g_io_request(pbp, pbp->bio_caller2);
pbp = pbp->bio_caller1;
}
break;
}
}
static int
@ -211,6 +324,11 @@ gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
gp->access = gv_volume_access;
gp->softc = v;
first++;
TAILQ_INIT(&v->bqueue);
mtx_init(&v->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
kthread_create(gv_vol_worker, v, NULL, 0, 0, "gv_v %s",
v->name);
v->flags |= GV_VOL_THREAD_ACTIVE;
} else
gp = v->geom;
@ -261,9 +379,13 @@ static int
gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp)
{
struct gv_volume *v;
g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name);
g_topology_assert();
v = gp->softc;
gv_kill_vol_thread(v);
g_wither_geom(gp, ENXIO);
return (0);
}