1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-23 11:18:54 +00:00

Re-vamp how I/O is handled in volumes and plexes.

Analogous to the drive level, give each volume and plex a worker thread
that picks up and processes incoming and completed BIOs.

This should fix the data corruption issues that came up a few
weeks ago and improve performance, especially of RAID5 plexes.

The volume level needs a little work, though.
This commit is contained in:
Lukas Ertl 2004-09-18 13:44:43 +00:00
parent 54516c29e8
commit 67e3ab6ee5
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=135426
9 changed files with 788 additions and 679 deletions

View File

@ -70,6 +70,7 @@ int gv_is_striped(struct gv_plex *);
int gv_is_open(struct g_geom *);
void gv_kill_drive_thread(struct gv_drive *);
void gv_kill_plex_thread(struct gv_plex *);
void gv_kill_vol_thread(struct gv_volume *);
int gv_object_type(struct gv_softc *, char *);
void gv_parse_config(struct gv_softc *, u_char *, int);
const char *gv_roughlength(off_t, int);

View File

@ -293,7 +293,7 @@ gv_sync_td(void *arg)
* This hack declares this bio as part of an initialization
* process, so that the lower levels allow it to get through.
*/
bp->bio_caller1 = p;
bp->bio_cflags |= GV_BIO_SYNCREQ;
/* Schedule it down ... */
g_io_request(bp, to);

View File

@ -43,6 +43,10 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
static void gv_plex_completed_request(struct gv_plex *, struct bio *);
static void gv_plex_normal_request(struct gv_plex *, struct bio *);
static void gv_plex_worker(void *);
/* XXX: is this the place to catch dying subdisks? */
static void
gv_plex_orphan(struct g_consumer *cp)
@ -76,48 +80,39 @@ gv_plex_orphan(struct g_consumer *cp)
g_wither_geom(gp, error);
}
static void
void
gv_plex_done(struct bio *bp)
{
struct g_geom *gp;
struct gv_sd *s;
gp = bp->bio_to->geom;
struct gv_plex *p;
struct gv_bioq *bq;
s = bp->bio_caller1;
KASSERT(s != NULL, ("gv_plex_done: NULL s"));
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
if (s->initialized >= s->size) {
gv_set_sd_state(s, GV_SD_UP, 0);
s->initialized = 0;
}
g_std_done(bp);
p = bp->bio_from->geom->softc;
bp->bio_cflags |= GV_BIO_DONE;
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
wakeup(p);
mtx_unlock(&p->bqueue_mtx);
}
/* Find the correct subdisk to send the bio to and build a bio to send. */
static int
gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
caddr_t addr, long bcount, off_t boff)
gv_plexbuffer(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
struct gv_plex *p;
struct gv_sd *s;
struct bio *cbp;
struct bio *cbp, *pbp;
int i, sdno;
off_t len_left, real_len, real_off, stripeend, stripeno, stripestart;
s = NULL;
gp = bp->bio_to->geom;
p = gp->softc;
off_t len_left, real_len, real_off;
off_t stripeend, stripeno, stripestart;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
s = NULL;
gp = bp->bio_to->geom;
/*
* We only handle concatenated and striped plexes here. RAID5 plexes
* are handled in build_raid5_request().
@ -190,10 +185,10 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
break;
case GV_SD_STALE:
if (bp->bio_caller1 != p)
if (!(bp->bio_cflags & GV_BIO_SYNCREQ))
return (ENXIO);
printf("FOO: setting sd %s to GV_SD_INITIALIZING\n", s->name);
printf("GEOM_VINUM: sd %s is initializing\n", s->name);
gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
break;
@ -214,104 +209,366 @@ gv_plexbuffer(struct bio *bp, struct bio **bp2, struct g_consumer **cp,
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
if (bp->bio_caller1 == p) {
cbp->bio_caller1 = s;
cbp->bio_done = g_std_done;
cbp->bio_caller2 = s->consumer;
if ((bp->bio_cflags & GV_BIO_SYNCREQ)) {
cbp->bio_cflags |= GV_BIO_SYNCREQ;
cbp->bio_done = gv_plex_done;
} else
cbp->bio_done = g_std_done;
*bp2 = cbp;
*cp = s->consumer;
}
if (bp->bio_driver1 == NULL) {
bp->bio_driver1 = cbp;
} else {
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
pbp->bio_caller1 = cbp;
}
return (0);
}
static void
gv_plex_start(struct bio *bp)
{
struct g_geom *gp;
struct g_consumer *cp;
struct gv_plex *p;
struct gv_raid5_packet *wp;
struct bio *bp2;
caddr_t addr;
off_t boff;
long bcount, rcount;
int err;
gp = bp->bio_to->geom;
p = gp->softc;
/*
* We cannot handle this request if too many of our subdisks are
* inaccessible.
*/
if ((p->state < GV_PLEX_DEGRADED) && (bp->bio_caller1 != p)) {
g_io_deliver(bp, ENXIO); /* XXX: correct way? */
return;
}
struct gv_bioq *bq;
switch(bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
/*
* We split up the request in smaller packets and hand them
* down to our subdisks.
*/
wp = NULL;
addr = bp->bio_data;
boff = bp->bio_offset;
for (bcount = bp->bio_length; bcount > 0; bcount -= rcount) {
/*
* RAID5 requests usually need to be split up in
* several subrequests.
*/
if (p->org == GV_PLEX_RAID5) {
wp = gv_new_raid5_packet();
wp->bio = bp;
err = gv_build_raid5_req(wp, bp, addr, bcount,
boff);
} else
err = gv_plexbuffer(bp, &bp2, &cp, addr, bcount,
boff);
if (err) {
if (p->org == GV_PLEX_RAID5)
gv_free_raid5_packet(wp);
bp->bio_completed += bcount;
if (bp->bio_error == 0)
bp->bio_error = err;
if (bp->bio_completed == bp->bio_length)
g_io_deliver(bp, bp->bio_error);
return;
}
if (p->org != GV_PLEX_RAID5) {
rcount = bp2->bio_length;
g_io_request(bp2, cp);
/*
* RAID5 subrequests are queued on a worklist
* and picked up from the worker thread. This
* ensures correct order.
*/
} else {
mtx_lock(&p->worklist_mtx);
TAILQ_INSERT_TAIL(&p->worklist, wp,
list);
mtx_unlock(&p->worklist_mtx);
wakeup(&p);
rcount = wp->length;
}
boff += rcount;
addr += rcount;
}
return;
break;
case BIO_GETATTR:
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
/*
* We cannot handle this request if too many of our subdisks are
* inaccessible.
*/
p = bp->bio_to->geom->softc;
if ((p->state < GV_PLEX_DEGRADED) &&
!(bp->bio_cflags & GV_BIO_SYNCREQ)) {
g_io_deliver(bp, ENXIO);
return;
}
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
wakeup(p);
mtx_unlock(&p->bqueue_mtx);
}
static void
gv_plex_worker(void *arg)
{
struct bio *bp;
struct gv_plex *p;
struct gv_sd *s;
struct gv_bioq *bq;
p = arg;
KASSERT(p != NULL, ("NULL p"));
mtx_lock(&p->bqueue_mtx);
for (;;) {
/* We were signaled to exit. */
if (p->flags & GV_PLEX_THREAD_DIE)
break;
/* Take the first BIO from our queue. */
bq = TAILQ_FIRST(&p->bqueue);
if (bq == NULL) {
msleep(p, &p->bqueue_mtx, PRIBIO, "-", hz/10);
continue;
}
TAILQ_REMOVE(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
bp = bq->bp;
/* A completed request. */
if (bp->bio_cflags & GV_BIO_DONE) {
g_free(bq);
if (bp->bio_cflags & GV_BIO_SYNCREQ) {
s = bp->bio_to->private;
if (bp->bio_error == 0)
s->initialized += bp->bio_length;
if (s->initialized >= s->size) {
g_topology_lock();
gv_set_sd_state(s, GV_SD_UP,
GV_SETSTATE_CONFIG);
g_topology_unlock();
s->initialized = 0;
}
g_std_done(bp);
} else
gv_plex_completed_request(p, bp);
/*
 * A sub-request that was held back because it interfered with
 * another sub-request.
 */
} else if (bp->bio_cflags & GV_BIO_ONHOLD) {
/* Is it still locked out? */
if (gv_stripe_active(p, bp)) {
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
} else {
g_free(bq);
bp->bio_cflags &= ~GV_BIO_ONHOLD;
g_io_request(bp, bp->bio_caller2);
}
/* A normal request to this plex. */
} else {
g_free(bq);
gv_plex_normal_request(p, bp);
}
mtx_lock(&p->bqueue_mtx);
}
mtx_unlock(&p->bqueue_mtx);
p->flags |= GV_PLEX_THREAD_DEAD;
wakeup(p);
kthread_exit(ENXIO);
}
void
gv_plex_completed_request(struct gv_plex *p, struct bio *bp)
{
struct bio *cbp, *pbp;
struct gv_bioq *bq, *bq2;
struct gv_raid5_packet *wp;
int i;
wp = bp->bio_driver1;
switch (bp->bio_parent->bio_cmd) {
case BIO_READ:
if (wp == NULL)
break;
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
if (bq->bp == bp) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
for (i = 0; i < wp->length; i++)
wp->data[i] ^= bp->bio_data[i];
break;
}
}
if (TAILQ_EMPTY(&wp->bits)) {
bp->bio_parent->bio_completed += wp->length;
if (wp->lockbase != -1)
TAILQ_REMOVE(&p->packets, wp, list);
g_free(wp);
}
break;
case BIO_WRITE:
if (wp == NULL)
break;
/* Check if we need to handle parity data. */
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
if (bq->bp == bp) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
cbp = wp->parity;
if (cbp != NULL) {
for (i = 0; i < wp->length; i++)
cbp->bio_data[i] ^=
bp->bio_data[i];
}
break;
}
}
/* Handle parity data. */
if (TAILQ_EMPTY(&wp->bits)) {
if (wp->waiting != NULL) {
pbp = wp->waiting;
wp->waiting = NULL;
cbp = wp->parity;
for (i = 0; i < wp->length; i++)
cbp->bio_data[i] ^= pbp->bio_data[i];
g_io_request(pbp, pbp->bio_caller2);
} else if (wp->parity != NULL) {
cbp = wp->parity;
wp->parity = NULL;
g_io_request(cbp, cbp->bio_caller2);
} else {
bp->bio_parent->bio_completed += wp->length;
TAILQ_REMOVE(&p->packets, wp, list);
g_free(wp);
}
}
break;
}
pbp = bp->bio_parent;
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
/* When the original request is finished, we deliver it. */
pbp->bio_inbed++;
if (pbp->bio_inbed == pbp->bio_children)
g_io_deliver(pbp, pbp->bio_error);
/* Clean up what we allocated. */
if (bp->bio_cflags & GV_BIO_MALLOC)
g_free(bp->bio_data);
g_destroy_bio(bp);
}
void
gv_plex_normal_request(struct gv_plex *p, struct bio *bp)
{
struct bio *cbp, *pbp;
struct gv_bioq *bq, *bq2;
struct gv_raid5_packet *wp, *wp2;
caddr_t addr;
off_t bcount, boff;
int err;
bcount = bp->bio_length;
addr = bp->bio_data;
boff = bp->bio_offset;
/* Walk over the whole length of the request, we might split it up. */
while (bcount > 0) {
wp = NULL;
/*
* RAID5 plexes need special treatment, as a single write
* request involves several read/write sub-requests.
*/
if (p->org == GV_PLEX_RAID5) {
wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO);
wp->bio = bp;
TAILQ_INIT(&wp->bits);
err = gv_build_raid5_req(p, wp, bp, addr, boff, bcount);
/*
* Building the sub-request failed, we probably need to
* clean up a lot.
*/
if (err) {
printf("GEOM_VINUM: plex request failed for ");
g_print_bio(bp);
printf("\n");
TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
TAILQ_REMOVE(&wp->bits, bq, queue);
g_free(bq);
}
if (wp->waiting != NULL) {
if (wp->waiting->bio_cflags &
GV_BIO_MALLOC)
g_free(wp->waiting->bio_data);
g_destroy_bio(wp->waiting);
}
if (wp->parity != NULL) {
if (wp->parity->bio_cflags &
GV_BIO_MALLOC)
g_free(wp->parity->bio_data);
g_destroy_bio(wp->parity);
}
g_free(wp);
TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) {
if (wp->bio == bp) {
TAILQ_REMOVE(&p->packets, wp,
list);
TAILQ_FOREACH_SAFE(bq,
&wp->bits, queue, bq2) {
TAILQ_REMOVE(&wp->bits,
bq, queue);
g_free(bq);
}
g_free(wp);
}
}
cbp = bp->bio_driver1;
while (cbp != NULL) {
pbp = cbp->bio_caller1;
if (cbp->bio_cflags & GV_BIO_MALLOC)
g_free(cbp->bio_data);
g_destroy_bio(cbp);
cbp = pbp;
}
g_io_deliver(bp, err);
return;
}
if (TAILQ_EMPTY(&wp->bits))
g_free(wp);
else if (wp->lockbase != -1)
TAILQ_INSERT_TAIL(&p->packets, wp, list);
/*
* Requests to concatenated and striped plexes go straight
* through.
*/
} else {
err = gv_plexbuffer(p, bp, addr, boff, bcount);
/* Building the sub-request failed. */
if (err) {
printf("GEOM_VINUM: plex request failed for ");
g_print_bio(bp);
printf("\n");
cbp = bp->bio_driver1;
while (cbp != NULL) {
pbp = cbp->bio_caller1;
g_destroy_bio(cbp);
cbp = pbp;
}
g_io_deliver(bp, err);
return;
}
}
/* Abuse bio_caller1 as linked list. */
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
bcount -= pbp->bio_length;
addr += pbp->bio_length;
boff += pbp->bio_length;
}
/* Fire off all sub-requests. */
pbp = bp->bio_driver1;
while (pbp != NULL) {
/*
* RAID5 sub-requests need to come in correct order, otherwise
* we trip over the parity, as it might be overwritten by
* another sub-request.
*/
if (pbp->bio_driver1 != NULL &&
gv_stripe_active(p, pbp)) {
pbp->bio_cflags |= GV_BIO_ONHOLD;
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = pbp;
mtx_lock(&p->bqueue_mtx);
TAILQ_INSERT_TAIL(&p->bqueue, bq, queue);
mtx_unlock(&p->bqueue_mtx);
} else
g_io_request(pbp, pbp->bio_caller2);
pbp = pbp->bio_caller1;
}
}
static int
@ -425,16 +682,12 @@ gv_plex_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
gp->softc = p;
p->geom = gp;
/* RAID5 plexes need a 'worker' thread, where IO is handled. */
if (p->org == GV_PLEX_RAID5) {
TAILQ_INIT(&p->worklist);
mtx_init(&p->worklist_mtx, "gvinum_worklist", NULL,
MTX_DEF);
p->flags &= ~GV_PLEX_THREAD_DIE;
kthread_create(gv_raid5_worker, gp, NULL, 0, 0,
"gv_raid5");
p->flags |= GV_PLEX_THREAD_ACTIVE;
}
TAILQ_INIT(&p->packets);
TAILQ_INIT(&p->bqueue);
mtx_init(&p->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
kthread_create(gv_plex_worker, p, NULL, 0, 0, "gv_p %s",
p->name);
p->flags |= GV_PLEX_THREAD_ACTIVE;
/* Attach a consumer to this provider. */
cp = g_new_consumer(gp);

View File

@ -44,243 +44,62 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>
int gv_raid5_parity(struct gv_raid5_packet *);
int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);
struct gv_raid5_bit *
gv_new_raid5_bit(void)
{
struct gv_raid5_bit *r;
r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
return (r);
}
struct gv_raid5_packet *
gv_new_raid5_packet(void)
{
struct gv_raid5_packet *wp;
wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
wp->state = SETUP;
wp->type = JUNK;
TAILQ_INIT(&wp->bits);
return (wp);
}
void
gv_free_raid5_packet(struct gv_raid5_packet *wp)
{
struct gv_raid5_bit *r, *r2;
/* Remove all the bits from this work packet. */
TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) {
TAILQ_REMOVE(&wp->bits, r, list);
if (r->malloc)
g_free(r->buf);
if (r->bio != NULL)
g_destroy_bio(r->bio);
g_free(r);
}
if (wp->bufmalloc == 1)
g_free(wp->buf);
g_free(wp);
}
/*
* Check if the stripe that the work packet wants is already being used by
* some other work packet.
*/
int
gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
struct gv_raid5_packet *wpa;
struct gv_raid5_packet *wp, *owp;
int overlap;
TAILQ_FOREACH(wpa, &sc->worklist, list) {
if (wpa->lockbase == wp->lockbase) {
if (wpa == wp)
return (0);
return (1);
wp = bp->bio_driver1;
if (wp->lockbase == -1)
return (0);
overlap = 0;
TAILQ_FOREACH(owp, &p->packets, list) {
if (owp == wp)
break;
if ((wp->lockbase >= owp->lockbase) &&
(wp->lockbase <= owp->lockbase + owp->length)) {
overlap++;
break;
}
if ((wp->lockbase <= owp->lockbase) &&
(wp->lockbase + wp->length >= owp->lockbase)) {
overlap++;
break;
}
}
return (0);
}
/*
* The "worker" thread that runs through the worklist and fires off the
* "subrequests" needed to fulfill a RAID5 read or write request.
*/
void
gv_raid5_worker(void *arg)
{
struct bio *bp;
struct g_geom *gp;
struct gv_plex *p;
struct gv_raid5_packet *wp, *wpt;
struct gv_raid5_bit *rbp, *rbpt;
int error, restart;
gp = arg;
p = gp->softc;
mtx_lock(&p->worklist_mtx);
for (;;) {
restart = 0;
TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
/* This request packet is already being processed. */
if (wp->state == IO)
continue;
/* This request packet is ready for processing. */
if (wp->state == VALID) {
/* Couldn't get the lock, try again. */
if ((wp->lockbase != -1) &&
gv_stripe_active(wp, p))
continue;
wp->state = IO;
mtx_unlock(&p->worklist_mtx);
TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
g_io_request(rbp->bio, rbp->consumer);
mtx_lock(&p->worklist_mtx);
continue;
}
if (wp->state == FINISH) {
bp = wp->bio;
bp->bio_completed += wp->length;
/*
* Deliver the original request if we have
* finished.
*/
if (bp->bio_completed == bp->bio_length) {
mtx_unlock(&p->worklist_mtx);
g_io_deliver(bp, 0);
mtx_lock(&p->worklist_mtx);
}
TAILQ_REMOVE(&p->worklist, wp, list);
gv_free_raid5_packet(wp);
restart++;
/*break;*/
}
}
if (!restart) {
/* Self-destruct. */
if (p->flags & GV_PLEX_THREAD_DIE)
break;
error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
hz/100);
}
}
mtx_unlock(&p->worklist_mtx);
g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");
/* Signal our plex that we are dead. */
p->flags |= GV_PLEX_THREAD_DEAD;
wakeup(p);
kthread_exit(0);
}
/* Final bio transaction to write out the parity data. */
int
gv_raid5_parity(struct gv_raid5_packet *wp)
{
struct bio *bp;
bp = g_new_bio();
if (bp == NULL)
return (ENOMEM);
wp->type = ISPARITY;
bp->bio_cmd = BIO_WRITE;
bp->bio_data = wp->buf;
bp->bio_offset = wp->offset;
bp->bio_length = wp->length;
bp->bio_done = gv_raid5_done;
bp->bio_caller1 = wp;
bp->bio_caller2 = NULL;
g_io_request(bp, wp->parity);
return (0);
}
/* We end up here after each subrequest. */
void
gv_raid5_done(struct bio *bp)
{
struct bio *obp;
struct g_geom *gp;
struct gv_plex *p;
struct gv_raid5_packet *wp;
struct gv_raid5_bit *rbp;
off_t i;
int error;
wp = bp->bio_caller1;
rbp = bp->bio_caller2;
obp = wp->bio;
gp = bp->bio_from->geom;
p = gp->softc;
/* One less active subrequest. */
wp->active--;
switch (obp->bio_cmd) {
case BIO_READ:
/* Degraded reads need to handle parity data. */
if (wp->type == DEGRADED) {
for (i = 0; i < wp->length; i++)
wp->buf[i] ^= bp->bio_data[i];
/* When we're finished copy back the data we want. */
if (wp->active == 0)
bcopy(wp->buf, wp->data, wp->length);
}
break;
case BIO_WRITE:
/* Handle the parity data, if needed. */
if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) {
for (i = 0; i < wp->length; i++)
wp->buf[i] ^= bp->bio_data[i];
/* Write out the parity data we calculated. */
if (wp->active == 0) {
wp->active++;
error = gv_raid5_parity(wp);
}
}
break;
}
/* This request group is done. */
if (wp->active == 0)
wp->state = FINISH;
return (overlap);
}
/* Build a request group to perform (part of) a RAID5 request. */
int
gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
long bcount, off_t boff)
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
struct g_geom *gp;
struct gv_plex *p;
struct gv_raid5_bit *rbp;
struct gv_sd *broken, *original, *parity, *s;
int i, psdno, sdno;
off_t len_left, real_off, stripeend, stripeoff, stripestart;
struct gv_bioq *bq;
struct bio *cbp, *pbp;
int i, psdno, sdno, type;
off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;
gp = bp->bio_to->geom;
p = gp->softc;
if (p == NULL || LIST_EMPTY(&p->subdisks))
return (ENXIO);
/* We are optimistic and assume that this request will be OK. */
wp->type = NORMAL;
#define REQ_TYPE_NORMAL 0
#define REQ_TYPE_DEGRADED 1
#define REQ_TYPE_NOPARITY 2
type = REQ_TYPE_NORMAL;
original = parity = broken = NULL;
/* The number of the subdisk containing the parity stripe. */
@ -330,29 +149,20 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
/* Our data stripe is missing. */
if (original->state != GV_SD_UP)
wp->type = DEGRADED;
type = REQ_TYPE_DEGRADED;
/* Our parity stripe is missing. */
if (parity->state != GV_SD_UP) {
/* We cannot take another failure if we're already degraded. */
if (wp->type != NORMAL)
if (type != REQ_TYPE_NORMAL)
return (ENXIO);
else
wp->type = NOPARITY;
type = REQ_TYPE_NOPARITY;
}
/*
* A combined write is necessary when the original data subdisk and the
* parity subdisk are both up, but one of the other subdisks isn't.
*/
if ((broken != NULL) && (broken != parity) && (broken != original))
wp->type = COMBINED;
wp->offset = real_off;
wp->length = (bcount <= len_left) ? bcount : len_left;
real_len = (bcount <= len_left) ? bcount : len_left;
wp->length = real_len;
wp->data = addr;
wp->original = original->consumer;
wp->parity = parity->consumer;
wp->lockbase = stripestart;
wp->lockbase = real_off;
KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
@ -363,58 +173,45 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* the broken one plus the parity stripe and then recalculate
* the desired data.
*/
if (wp->type == DEGRADED) {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
if (type == REQ_TYPE_DEGRADED) {
bzero(wp->data, wp->length);
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken subdisk. */
if (s == broken)
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_cmd = BIO_READ;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = s->consumer;
cbp->bio_driver1 = wp;
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
/* A normal read can be fulfilled with the original subdisk. */
} else {
rbp = gv_new_raid5_bit();
rbp->consumer = wp->original;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_READ;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->buf = addr;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
cbp->bio_done = g_std_done;
cbp->bio_caller2 = original->consumer;
GV_ENQUEUE(bp, cbp, pbp);
}
if (wp->type != COMBINED)
wp->lockbase = -1;
wp->lockbase = -1;
break;
case BIO_WRITE:
@ -424,164 +221,65 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* recalculate the parity from the original data, and then
* write the parity stripe back out.
*/
if (wp->type == DEGRADED) {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
/* Copy the original data. */
bcopy(wp->data, wp->buf, wp->length);
if (type == REQ_TYPE_DEGRADED) {
/* Read all subdisks. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken and the parity subdisk. */
if ((s == broken) ||
(s->consumer == wp->parity))
if ((s == broken) || (s == parity))
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_cmd = BIO_READ;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_cmd = BIO_READ;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = s->consumer;
cbp->bio_driver1 = wp;
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
}
/* Write the parity data. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
bcopy(addr, cbp->bio_data, real_len);
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = parity->consumer;
cbp->bio_driver1 = wp;
wp->parity = cbp;
/*
* When we don't have the parity stripe we just write out the
* data.
* When the parity stripe is missing we just write out the data.
*/
} else if (wp->type == NOPARITY) {
rbp = gv_new_raid5_bit();
rbp->consumer = wp->original;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
} else if (type == REQ_TYPE_NOPARITY) {
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_WRITE;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_data = addr;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_data = addr;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = original->consumer;
cbp->bio_driver1 = wp;
/*
* A combined write means that our data subdisk and the parity
* subdisks are both up, but another subdisk isn't. We need to
* read all valid stripes including the parity to recalculate
* the data of the stripe that is missing. Then we write our
* original data, and together with the other data stripes
* recalculate the parity again.
*/
} else if (wp->type == COMBINED) {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
GV_ENQUEUE(bp, cbp, pbp);
/* Get the data from all subdisks. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the broken subdisk. */
if (s == broken)
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_READ;
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
}
/* Write the original data. */
rbp = gv_new_raid5_bit();
rbp->consumer = wp->original;
rbp->buf = addr;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_WRITE;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
/*
* Insert at the tail, because we want to read the old
* data first.
*/
TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
/* Get the rest of the data again. */
LIST_FOREACH(s, &p->subdisks, in_plex) {
/*
* Skip the broken subdisk, the parity, and the
* one we just wrote.
*/
if ((s == broken) ||
(s->consumer == wp->parity) ||
(s->consumer == wp->original))
continue;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
rbp->bio->bio_cmd = BIO_READ;
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
/*
* Again, insert at the tail to keep correct
* order.
*/
TAILQ_INSERT_TAIL(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
}
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/*
* A normal write request goes to the original subdisk, then we
@ -589,52 +287,83 @@ gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr,
* out the parity again.
*/
} else {
wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO);
if (wp->buf == NULL)
/* Read old parity. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
wp->bufmalloc = 1;
LIST_FOREACH(s, &p->subdisks, in_plex) {
/* Skip the parity stripe. */
if (s->consumer == wp->parity)
continue;
cbp->bio_cmd = BIO_READ;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = parity->consumer;
cbp->bio_driver1 = wp;
rbp = gv_new_raid5_bit();
rbp->consumer = s->consumer;
rbp->bio = g_new_bio();
if (rbp->bio == NULL)
return (ENOMEM);
/*
* The data for the original stripe is written,
* the others need to be read in for the parity
* calculation.
*/
if (s->consumer == wp->original) {
rbp->bio->bio_cmd = BIO_WRITE;
rbp->buf = addr;
} else {
rbp->bio->bio_cmd = BIO_READ;
rbp->buf = g_malloc(wp->length,
M_NOWAIT | M_ZERO);
if (rbp->buf == NULL)
return (ENOMEM);
rbp->malloc = 1;
}
rbp->bio->bio_data = rbp->buf;
rbp->bio->bio_offset = wp->offset;
rbp->bio->bio_length = wp->length;
rbp->bio->bio_done = gv_raid5_done;
rbp->bio->bio_caller1 = wp;
rbp->bio->bio_caller2 = rbp;
TAILQ_INSERT_HEAD(&wp->bits, rbp, list);
wp->active++;
wp->rqcount++;
}
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/* Read old data. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_cmd = BIO_READ;
cbp->bio_data = g_malloc(real_len, M_WAITOK);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = original->consumer;
cbp->bio_driver1 = wp;
GV_ENQUEUE(bp, cbp, pbp);
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = cbp;
TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
/* Write new data. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_data = addr;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = original->consumer;
cbp->bio_driver1 = wp;
/*
* We must not write the new data until the old data
* was read, so hold this BIO back until we're ready
* for it.
*/
wp->waiting = cbp;
/* The final bio for the parity. */
cbp = g_clone_bio(bp);
if (cbp == NULL)
return (ENOMEM);
cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
cbp->bio_cflags |= GV_BIO_MALLOC;
cbp->bio_offset = real_off;
cbp->bio_length = real_len;
cbp->bio_done = gv_plex_done;
cbp->bio_caller2 = parity->consumer;
cbp->bio_driver1 = wp;
/* Remember that this is the BIO for the parity data. */
wp->parity = cbp;
}
break;
default:
return (EINVAL);
}
wp->state = VALID;
return (0);
}

View File

@ -32,22 +32,23 @@
/*
* A single RAID5 request usually needs more than one I/O transaction,
* depending on the state of the associated subdisks and the direction of the
* transaction (read or write). Every subrequest of a RAID5 request,
* represented by a gv_raid_packet, is defined by a gv_raid5_bit.
* transaction (read or write).
*/
/* A subrequest of a RAID5 read/write operation. */
struct gv_raid5_bit {
struct bio *bio; /* BIO of this subrequest. */
caddr_t buf; /* Data buffer of this subrequest. */
int malloc; /* Flag if data buffer was malloced. */
struct g_consumer *consumer; /* Consumer to send the BIO to. */
TAILQ_ENTRY(gv_raid5_bit) list; /* Entry in the list of this request. */
};
#define GV_ENQUEUE(bp, cbp, pbp) \
do { \
if (bp->bio_driver1 == NULL) { \
bp->bio_driver1 = cbp; \
} else { \
pbp = bp->bio_driver1; \
while (pbp->bio_caller1 != NULL) \
pbp = pbp->bio_caller1; \
pbp->bio_caller1 = cbp; \
} \
} while (0);
/* Container for one or more gv_raid5_bits; represents a RAID5 I/O request. */
struct gv_raid5_packet {
caddr_t buf; /* Data buffer of this RAID5 request. */
caddr_t data; /* Data buffer of this sub-request. */
off_t length; /* Size of data buffer. */
off_t lockbase; /* Deny access to our plex offset. */
off_t offset; /* The drive offset of the subdisk. */
@ -56,39 +57,17 @@ struct gv_raid5_packet {
int rqcount; /* Count of subrequests. */
struct bio *bio; /* Pointer to the original bio. */
caddr_t data; /* Pointer to the original data. */
struct bio *parity; /* The bio containing the parity data. */
struct bio *waiting; /* A bio that need to wait for other bios. */
struct g_consumer *original; /* Consumer to the data stripe. */
struct g_consumer *parity; /* Consumer to the parity stripe. */
/* State of this RAID5 packet. */
enum {
SETUP, /* Newly created. */
VALID, /* Ready for processing. */
IO, /* Currently doing I/O. */
FINISH /* Packet has finished. */
} state;
/* Type of this RAID5 transaction. */
enum {
JUNK, /* Newly created, not valid. */
NORMAL, /* Normal read or write. */
ISPARITY, /* Containing only parity data. */
NOPARITY, /* Parity stripe not available. */
DEGRADED, /* Data stripe not available. */
COMBINED /* Data and parity stripes ok, others not. */
} type;
TAILQ_HEAD(,gv_raid5_bit) bits; /* List of subrequests. */
TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */
TAILQ_HEAD(,gv_bioq) bits; /* List of subrequests. */
TAILQ_ENTRY(gv_raid5_packet) list; /* Entry in plex's packet list. */
};
int gv_build_raid5_req(struct gv_raid5_packet *, struct bio *, caddr_t,
long, off_t);
void gv_free_raid5_packet(struct gv_raid5_packet *);
void gv_raid5_done(struct bio *);
int gv_stripe_active(struct gv_plex *, struct bio *);
int gv_build_raid5_req(struct gv_plex *, struct gv_raid5_packet *,
struct bio *, caddr_t, off_t, off_t);
void gv_raid5_worker(void *);
struct gv_raid5_packet *gv_new_raid5_packet(void);
struct gv_raid5_bit *gv_new_raid5_bit(void);
void gv_plex_done(struct bio *);
#endif /* !_GEOM_VINUM_RAID5_H_ */

View File

@ -166,6 +166,7 @@ gv_rm_vol(struct gv_softc *sc, struct gctl_req *req, struct gv_volume *v, int fl
/* Clean up and let our geom fade away. */
LIST_REMOVE(v, volume);
gv_kill_vol_thread(v);
g_free(v);
if (gp != NULL) {
gp->softc = NULL;

View File

@ -832,12 +832,25 @@ gv_kill_drive_thread(struct gv_drive *d)
/*
 * Stop a plex's worker thread and tear down its queue mutex(es).
 *
 * NOTE(review): this span interleaves the pre-change and post-change
 * versions of the function (diff-render artifact): both the old
 * RAID5-only guard and the new unconditional guard appear below, and
 * both the old worklist_mtx and the new bqueue_mtx are destroyed.  The
 * braces do not balance as rendered here; consult the real source file.
 */
void
gv_kill_plex_thread(struct gv_plex *p)
{
if ((p->org == GV_PLEX_RAID5) && (p->flags & GV_PLEX_THREAD_ACTIVE)) {
if (p->flags & GV_PLEX_THREAD_ACTIVE) {
/* Ask the worker thread to exit ... */
p->flags |= GV_PLEX_THREAD_DIE;
wakeup(p);
/* ... and wait until it acknowledges with THREAD_DEAD. */
while (!(p->flags & GV_PLEX_THREAD_DEAD))
tsleep(p, PRIBIO, "gv_die", hz);
p->flags &= ~GV_PLEX_THREAD_ACTIVE;
/* Only safe once the thread can no longer touch these mutexes. */
mtx_destroy(&p->worklist_mtx);
mtx_destroy(&p->bqueue_mtx);
}
}
/*
 * Stop a volume's worker thread: signal it to exit, wait until it has
 * acknowledged, then destroy the (now unused) BIO queue mutex.
 */
void
gv_kill_vol_thread(struct gv_volume *v)
{
	/* Nothing to do if no worker thread was ever started. */
	if (!(v->flags & GV_VOL_THREAD_ACTIVE))
		return;

	/* Tell the worker to die and wake it up in case it sleeps. */
	v->flags |= GV_VOL_THREAD_DIE;
	wakeup(v);

	/* Wait for the thread to confirm its death. */
	while (!(v->flags & GV_VOL_THREAD_DEAD))
		tsleep(v, PRIBIO, "gv_die", hz);

	v->flags &= ~GV_VOL_THREAD_ACTIVE;
	/* Safe now: the dead thread can no longer touch the mutex. */
	mtx_destroy(&v->bqueue_mtx);
}

View File

@ -111,6 +111,8 @@
#define GV_BIO_DONE 0x01
#define GV_BIO_MALLOC 0x02
#define GV_BIO_ONHOLD 0x04
#define GV_BIO_SYNCREQ 0x08
#define GV_BIO_SUCCEED 0x10
/*
* hostname is 256 bytes long, but we don't need to shlep multiple copies in
@ -269,8 +271,9 @@ struct gv_plex {
off_t synced; /* Count of synced bytes. */
struct mtx worklist_mtx; /* Mutex for RAID5 worklist. */
TAILQ_HEAD(,gv_raid5_packet) worklist; /* List of RAID5 work packets. */
struct mtx bqueue_mtx; /* Lock for the BIO queue. */
TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */
TAILQ_HEAD(,gv_raid5_packet) packets; /* RAID5 sub-requests. */
LIST_HEAD(,gv_sd) subdisks; /* List of attached subdisks. */
LIST_ENTRY(gv_plex) in_volume; /* Plex list of associated volume. */
@ -292,6 +295,14 @@ struct gv_volume {
#define GV_VOL_DOWN 0
#define GV_VOL_UP 1
int flags;
#define GV_VOL_THREAD_ACTIVE 0x01 /* Volume has an active thread. */
#define GV_VOL_THREAD_DIE 0x02 /* Signal the thread to die. */
#define GV_VOL_THREAD_DEAD 0x04 /* The thread has died. */
struct mtx bqueue_mtx; /* Lock for the BIO queue. */
TAILQ_HEAD(,gv_bioq) bqueue; /* BIO queue. */
LIST_HEAD(,gv_plex) plexes; /* List of attached plexes. */
LIST_ENTRY(gv_volume) volume; /* Entry in vinum config. */

View File

@ -31,6 +31,7 @@ __FBSDID("$FreeBSD$");
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@ -42,6 +43,9 @@ __FBSDID("$FreeBSD$");
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum.h>
static void gv_vol_completed_request(struct gv_volume *, struct bio *);
static void gv_vol_normal_request(struct gv_volume *, struct bio *);
static void
gv_volume_orphan(struct g_consumer *cp)
{
@ -62,8 +66,10 @@ gv_volume_orphan(struct g_consumer *cp)
if (!LIST_EMPTY(&gp->consumer))
return;
v = gp->softc;
if (v != NULL)
if (v != NULL) {
gv_kill_vol_thread(v);
v->geom = NULL;
}
gp->softc = NULL;
g_wither_geom(gp, error);
}
@ -72,79 +78,186 @@ gv_volume_orphan(struct g_consumer *cp)
/*
 * Completion callback for BIOs the volume layer sent down to a plex.
 *
 * NOTE(review): this span interleaves the pre-change and post-change
 * versions of the function (diff-render artifact).  The switch below is
 * the old "retry/chain to the next plex from the completion path"
 * logic, every reachable arm of which returns; the code after the
 * switch is the new logic that merely re-queues the completed BIO on
 * the volume's worker thread, and is unreachable as rendered here.
 */
static void
gv_volume_done(struct bio *bp)
{
struct g_consumer *cp;
/* The next plex in this volume. */
cp = LIST_NEXT(bp->bio_from, consumer);
struct gv_volume *v;
struct gv_bioq *bq;
switch (bp->bio_cmd) {
case BIO_READ:
/*
* If no error occurred on this request, or if we have no plex
* left, finish here...
*/
if ((bp->bio_error == 0) || (cp == NULL)) {
g_std_done(bp);
return;
}
/* ... or try to read from the next plex. */
g_io_request(bp, cp);
return;
case BIO_WRITE:
case BIO_DELETE:
/* No more plexes left. */
if (cp == NULL) {
/*
* Clear any errors if one of the previous writes
* succeeded.
*/
if (bp->bio_caller1 == (int *)1)
bp->bio_error = 0;
g_std_done(bp);
return;
}
/* If this write request had no errors, remember that fact... */
if (bp->bio_error == 0)
bp->bio_caller1 = (int *)1;
/* ... and write to the next plex. */
g_io_request(bp, cp);
return;
}
/*
 * New-style completion: mark the BIO done and hand it to the volume's
 * worker thread, which finishes it in gv_vol_completed_request().
 * NOTE(review): g_malloc(..., M_NOWAIT) may return NULL; bq is
 * dereferenced without a check — confirm against the real source.
 */
v = bp->bio_from->geom->softc;
bp->bio_cflags |= GV_BIO_DONE;
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&v->bqueue_mtx);
TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
wakeup(v);
mtx_unlock(&v->bqueue_mtx);
}
/*
 * GEOM start routine for a vinum volume.
 *
 * NOTE(review): this span interleaves the pre-change and post-change
 * versions of the function (diff-render artifact).  The switch below is
 * the old direct-dispatch logic, every reachable arm of which returns;
 * the code after the switch is the new logic that queues the BIO for
 * the volume's worker thread, and is unreachable as rendered here.
 */
static void
gv_volume_start(struct bio *bp)
{
struct g_geom *gp;
struct bio *bp2;
struct gv_volume *v;
struct gv_bioq *bq;
gp = bp->bio_to->geom;
v = gp->softc;
/* Refuse I/O while the volume is not up. */
if (v->state != GV_VOL_UP) {
g_io_deliver(bp, ENXIO);
return;
}
switch(bp->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
case BIO_DELETE:
/* Old path: clone and send to the first consumer directly. */
bp2 = g_clone_bio(bp);
if (bp2 == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
bp2->bio_done = gv_volume_done;
g_io_request(bp2, LIST_FIRST(&gp->consumer));
return;
break;
case BIO_GETATTR:
default:
g_io_deliver(bp, EOPNOTSUPP);
return;
}
/* New path: hand the BIO to the volume's worker thread. */
v = bp->bio_to->geom->softc;
if (v->state != GV_VOL_UP) {
g_io_deliver(bp, ENXIO);
return;
}
/* NOTE(review): M_NOWAIT allocation is not checked for NULL here. */
bq = g_malloc(sizeof(*bq), M_NOWAIT | M_ZERO);
bq->bp = bp;
mtx_lock(&v->bqueue_mtx);
TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
wakeup(v);
mtx_unlock(&v->bqueue_mtx);
}
/*
 * Per-volume worker thread: consumes BIOs from the volume's bqueue and
 * dispatches fresh requests (gv_vol_normal_request) or finishes
 * completed ones (gv_vol_completed_request).  Started when the volume
 * geom is created and stopped via gv_kill_vol_thread().
 */
static void
gv_vol_worker(void *arg)
{
struct bio *bp;
struct gv_volume *v;
struct gv_bioq *bq;
v = arg;
KASSERT(v != NULL, ("NULL v"));
/* The queue mutex is held at the top of every loop iteration. */
mtx_lock(&v->bqueue_mtx);
for (;;) {
/* We were signaled to exit. */
if (v->flags & GV_VOL_THREAD_DIE)
break;
/* Take the first BIO from our queue. */
bq = TAILQ_FIRST(&v->bqueue);
if (bq == NULL) {
/* Queue empty: sleep until woken, or re-poll after hz/10. */
msleep(v, &v->bqueue_mtx, PRIBIO, "-", hz/10);
continue;
}
TAILQ_REMOVE(&v->bqueue, bq, queue);
/* Drop the lock while processing; the queue entry is ours now. */
mtx_unlock(&v->bqueue_mtx);
bp = bq->bp;
g_free(bq);
/* GV_BIO_DONE marks a completed sub-request (set in gv_volume_done). */
if (bp->bio_cflags & GV_BIO_DONE)
gv_vol_completed_request(v, bp);
else
gv_vol_normal_request(v, bp);
mtx_lock(&v->bqueue_mtx);
}
mtx_unlock(&v->bqueue_mtx);
/*
 * Acknowledge the DIE request so gv_kill_vol_thread() can return.
 * NOTE(review): BIOs still sitting on the queue at this point are
 * neither completed nor freed — confirm this is intended.
 */
v->flags |= GV_VOL_THREAD_DEAD;
wakeup(v);
kthread_exit(ENXIO);
}
/*
 * Handle a sub-request that completed down at the plex level.  A failed
 * read is retried by re-queueing the parent BIO as a fresh request on
 * the volume's worker thread; otherwise the parent is delivered once
 * all of its children have come home.
 */
static void
gv_vol_completed_request(struct gv_volume *v, struct bio *bp)
{
struct bio *pbp;
struct gv_bioq *bq;
pbp = bp->bio_parent;
/* Keep the first error encountered among the children. */
if (pbp->bio_error == 0)
pbp->bio_error = bp->bio_error;
switch (pbp->bio_cmd) {
case BIO_READ:
if (bp->bio_error) {
/*
 * Retry: drop this failed child and re-queue the parent as a
 * fresh (not GV_BIO_DONE) request.
 * NOTE(review): gv_vol_normal_request() always picks the first
 * usable plex, so this may retry the very plex that just
 * failed — confirm against the intended retry design.
 */
g_destroy_bio(bp);
pbp->bio_children--;
bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
bq->bp = pbp;
mtx_lock(&v->bqueue_mtx);
TAILQ_INSERT_TAIL(&v->bqueue, bq, queue);
mtx_unlock(&v->bqueue_mtx);
return;
}
break;
case BIO_WRITE:
case BIO_DELETE:
break;
}
/* When the original request is finished, we deliver it. */
pbp->bio_inbed++;
if (pbp->bio_inbed == pbp->bio_children) {
/*
 * NOTE(review): bio_completed is taken from the last child's
 * length only, not accumulated — verify this is intended.
 */
pbp->bio_completed = bp->bio_length;
g_io_deliver(pbp, pbp->bio_error);
}
g_destroy_bio(bp);
}
static void
gv_vol_normal_request(struct gv_volume *v, struct bio *bp)
{
struct g_geom *gp;
struct gv_plex *p;
struct bio *cbp, *pbp;
gp = v->geom;
switch (bp->bio_cmd) {
case BIO_READ:
cbp = g_clone_bio(bp);
if (cbp == NULL) {
g_io_deliver(bp, ENOMEM);
return;
}
cbp->bio_done = gv_volume_done;
LIST_FOREACH(p, &v->plexes, in_volume) {
if (p->state >= GV_PLEX_DEGRADED)
break;
}
g_io_request(cbp, p->consumer);
break;
case BIO_WRITE:
case BIO_DELETE:
LIST_FOREACH(p, &v->plexes, in_volume) {
if (p->state < GV_PLEX_DEGRADED)
continue;
cbp = g_clone_bio(bp);
if (cbp == NULL) /* XXX */
g_io_deliver(bp, ENOMEM);
cbp->bio_done = gv_volume_done;
cbp->bio_caller2 = p->consumer;
if (bp->bio_driver1 == NULL) {
bp->bio_driver1 = cbp;
} else {
pbp = bp->bio_driver1;
while (pbp->bio_caller1 != NULL)
pbp = pbp->bio_caller1;
pbp->bio_caller1 = cbp;
}
}
/* Fire off all sub-requests. */
pbp = bp->bio_driver1;
while (pbp != NULL) {
g_io_request(pbp, pbp->bio_caller2);
pbp = pbp->bio_caller1;
}
break;
}
}
static int
@ -211,6 +324,11 @@ gv_volume_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
gp->access = gv_volume_access;
gp->softc = v;
first++;
TAILQ_INIT(&v->bqueue);
mtx_init(&v->bqueue_mtx, "gv_plex", NULL, MTX_DEF);
kthread_create(gv_vol_worker, v, NULL, 0, 0, "gv_v %s",
v->name);
v->flags |= GV_VOL_THREAD_ACTIVE;
} else
gp = v->geom;
@ -261,9 +379,13 @@ static int
gv_volume_destroy_geom(struct gctl_req *req, struct g_class *mp,
struct g_geom *gp)
{
struct gv_volume *v;
g_trace(G_T_TOPOLOGY, "gv_volume_destroy_geom: %s", gp->name);
g_topology_assert();
v = gp->softc;
gv_kill_vol_thread(v);
g_wither_geom(gp, ENXIO);
return (0);
}