From 3830659e99640001c09d26dfc0e1bbd77d919a62 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sat, 20 Jun 2020 06:23:31 +0000 Subject: [PATCH] loader: create single zfs nextboot implementation We should have the nextboot feature implemented in the libsa zfs code. To get there, I have created a zfs_nextboot() implementation based on two sources: our current simple textual string based approach, with the added structured boot label PAD structure from OpenZFS. Secondly, all nvlist details are moved to a separate source file and restructured a bit. This is done to provide base support for adding an nvlist add/update feature in follow-up updates. And finally, the zfsboot/gptzfsboot disk access functions are switched to use libi386 and libsa. Sponsored by: Netflix, Klara Inc. Differential Revision: https://reviews.freebsd.org/D25324 --- stand/efi/libefi/Makefile | 2 + stand/efi/loader/main.c | 15 +- stand/i386/gptzfsboot/Makefile | 19 +- stand/i386/libi386/Makefile | 2 + stand/i386/zfsboot/Makefile | 22 +- stand/i386/zfsboot/zfsboot.c | 1156 ++++++++---------------------- stand/libsa/zfs/Makefile.inc | 2 +- stand/libsa/zfs/libzfs.h | 79 +- stand/libsa/zfs/nvlist.c | 601 ++++++++++++++++ stand/libsa/zfs/zfs.c | 264 ++++++- stand/libsa/zfs/zfsimpl.c | 414 +++-------- stand/loader.mk | 1 + stand/userboot/userboot/Makefile | 1 + sys/cddl/boot/zfs/zfsimpl.h | 35 +- 14 files changed, 1432 insertions(+), 1181 deletions(-) create mode 100644 stand/libsa/zfs/nvlist.c diff --git a/stand/efi/libefi/Makefile b/stand/efi/libefi/Makefile index 6e5628047e17..8bbd421486ca 100644 --- a/stand/efi/libefi/Makefile +++ b/stand/efi/libefi/Makefile @@ -52,6 +52,8 @@ CFLAGS.efi_console.c+= -I${SRCTOP}/sys/teken CFLAGS.teken.c+= -I${SRCTOP}/sys/teken .if ${MK_LOADER_ZFS} != "no" CFLAGS+= -I${ZFSSRC} +CFLAGS+= -I${SYSDIR}/cddl/boot/zfs +CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/common CFLAGS+= -DEFI_ZFS_BOOT .endif diff --git a/stand/efi/loader/main.c b/stand/efi/loader/main.c index 14c320a09811..6ad626b2df39 
100644 --- a/stand/efi/loader/main.c +++ b/stand/efi/loader/main.c @@ -260,6 +260,8 @@ probe_zfs_currdev(uint64_t guid) { char *devname; struct zfs_devdesc currdev; + char *buf = NULL; + bool rv; currdev.dd.d_dev = &zfs_dev; currdev.dd.d_unit = 0; @@ -269,7 +271,18 @@ probe_zfs_currdev(uint64_t guid) devname = efi_fmtdev(&currdev); init_zfs_bootenv(devname); - return (sanity_check_currdev()); + rv = sanity_check_currdev(); + if (rv) { + buf = malloc(VDEV_PAD_SIZE); + if (buf != NULL) { + if (zfs_nextboot(&currdev, buf, VDEV_PAD_SIZE) == 0) { + printf("zfs nextboot: %s\n", buf); + set_currdev(buf); + } + free(buf); + } + } + return (rv); } #endif diff --git a/stand/i386/gptzfsboot/Makefile b/stand/i386/gptzfsboot/Makefile index fb5c801d8bcd..6aa7464e1503 100644 --- a/stand/i386/gptzfsboot/Makefile +++ b/stand/i386/gptzfsboot/Makefile @@ -4,7 +4,7 @@ .PATH: ${BOOTSRC}/i386/boot2 ${BOOTSRC}/i386/gptboot \ ${BOOTSRC}/i386/zfsboot ${BOOTSRC}/i386/common \ - ${SASRC} + ${BOOTSRC}/common FILES= gptzfsboot MAN= gptzfsboot.8 @@ -19,12 +19,16 @@ ORG2= 0x0 CFLAGS+=-DBOOTPROG=\"gptzfsboot\" \ -O1 \ - -DGPT -DZFS -DBOOT2 \ + -DBOOT2 \ + -DLOADER_GPT_SUPPORT \ + -DLOADER_MBR_SUPPORT \ + -DLOADER_ZFS_SUPPORT \ -DSIOPRT=${BOOT_COMCONSOLE_PORT} \ -DSIOFMT=${B2SIOFMT} \ -DSIOSPD=${BOOT_COMCONSOLE_SPEED} \ -I${LDRSRC} \ -I${BOOTSRC}/i386/common \ + -I${BOOTSRC}/i386/libi386 \ -I${ZFSSRC} \ -I${SYSDIR}/crypto/skein \ -I${SYSDIR}/cddl/boot/zfs \ @@ -60,15 +64,18 @@ gptldr.bin: gptldr.out gptldr.out: gptldr.o ${LD} ${LD_FLAGS} -e start --defsym ORG=${ORG1} -T ${LDSCRIPT} -o ${.TARGET} gptldr.o -CLEANFILES+= gptzfsboot.bin gptzfsboot.out zfsboot.o sio.o cons.o \ - drv.o gpt.o ${OPENCRYPTO_XTS} +OBJS= zfsboot.o sio.o cons.o bcache.o devopen.o disk.o part.o zfs_cmd.o +CLEANFILES+= gptzfsboot.bin gptzfsboot.out ${OBJS} ${OPENCRYPTO_XTS} + +# i386 standalone support library +LIBI386= ${BOOTOBJ}/i386/libi386/libi386.a gptzfsboot.bin: gptzfsboot.out ${OBJCOPY} -S -O binary gptzfsboot.out 
${.TARGET} -gptzfsboot.out: ${BTXCRT} zfsboot.o sio.o gpt.o drv.o cons.o \ +gptzfsboot.out: ${BTXCRT} ${OBJS} \ ${OPENCRYPTO_XTS} - ${LD} ${LD_FLAGS} --defsym ORG=${ORG2} -T ${LDSCRIPT} -o ${.TARGET} ${.ALLSRC} ${LIBSA32} + ${LD} ${LD_FLAGS} --defsym ORG=${ORG2} -T ${LDSCRIPT} -o ${.TARGET} ${.ALLSRC} ${LIBI386} ${LIBSA32} zfsboot.o: ${ZFSSRC}/zfsimpl.c diff --git a/stand/i386/libi386/Makefile b/stand/i386/libi386/Makefile index a4485b60a28e..779575bff071 100644 --- a/stand/i386/libi386/Makefile +++ b/stand/i386/libi386/Makefile @@ -37,6 +37,8 @@ CFLAGS+= -Dalloca=__builtin_alloca CFLAGS+= -I${BOOTSRC}/ficl -I${BOOTSRC}/ficl/i386 \ -I${LDRSRC} -I${BOOTSRC}/i386/common \ + -I${SYSDIR}/cddl/boot/zfs \ + -I${SYSDIR}/cddl/contrib/opensolaris/uts/common \ -I${SYSDIR}/contrib/dev/acpica/include # Handle FreeBSD specific %b and %D printf format specifiers diff --git a/stand/i386/zfsboot/Makefile b/stand/i386/zfsboot/Makefile index 80303cb8fde0..8c0527848478 100644 --- a/stand/i386/zfsboot/Makefile +++ b/stand/i386/zfsboot/Makefile @@ -2,7 +2,7 @@ .include -.PATH: ${BOOTSRC}/i386/boot2 ${BOOTSRC}/i386/common ${SASRC} +.PATH: ${BOOTSRC}/i386/boot2 ${BOOTSRC}/i386/common ${BOOTSRC}/common FILES= zfsboot MAN= zfsboot.8 @@ -17,13 +17,17 @@ ORG2= 0x2000 CFLAGS+=-DBOOTPROG=\"zfsboot\" \ -O1 \ - -DZFS -DBOOT2 \ + -DBOOT2 \ + -DLOADER_GPT_SUPPORT \ + -DLOADER_MBR_SUPPORT \ + -DLOADER_ZFS_SUPPORT \ + -DLOADER_UFS_SUPPORT \ -DSIOPRT=${BOOT_COMCONSOLE_PORT} \ -DSIOFMT=${B2SIOFMT} \ -DSIOSPD=${BOOT_COMCONSOLE_SPEED} \ -I${LDRSRC} \ -I${BOOTSRC}/i386/common \ - -I${BOOTSRC}/i386 \ + -I${BOOTSRC}/i386/libi386 \ -I${ZFSSRC} \ -I${SYSDIR}/crypto/skein \ -I${SYSDIR}/cddl/boot/zfs \ @@ -34,6 +38,8 @@ CFLAGS+=-DBOOTPROG=\"zfsboot\" \ -Wmissing-declarations -Wmissing-prototypes -Wnested-externs \ -Wpointer-arith -Wshadow -Wstrict-prototypes -Wwrite-strings +CFLAGS.part.c+= -DHAVE_MEMCPY -I${SRCTOP}/sys/contrib/zlib + CFLAGS.gcc+= --param max-inline-insns-single=100 LD_FLAGS+=${LD_FLAGS_BIN} 
@@ -51,14 +57,18 @@ zfsboot1: zfsldr.out zfsldr.out: zfsldr.o ${LD} ${LD_FLAGS} -e start --defsym ORG=${ORG1} -T ${LDSCRIPT} -o ${.TARGET} zfsldr.o +OBJS= zfsboot.o sio.o cons.o bcache.o devopen.o disk.o part.o zfs_cmd.o CLEANFILES+= zfsboot2 zfsboot.ld zfsboot.ldr zfsboot.bin zfsboot.out \ - zfsboot.o zfsboot.s zfsboot.s.tmp sio.o cons.o drv.o + ${OBJS} # We currently allow 256k bytes for zfsboot - in practice it could be # any size up to 3.5Mb but keeping it fixed size simplifies zfsldr. # BOOT2SIZE= 262144 +# i386 standalone support library +LIBI386= ${BOOTOBJ}/i386/libi386/libi386.a + zfsboot2: zfsboot.ld @set -- `ls -l ${.ALLSRC}`; x=$$((${BOOT2SIZE}-$$5)); \ echo "$$x bytes available"; test $$x -ge 0 @@ -74,8 +84,8 @@ zfsboot.ldr: zfsboot.bin: zfsboot.out ${OBJCOPY} -S -O binary zfsboot.out ${.TARGET} -zfsboot.out: ${BTXCRT} zfsboot.o sio.o drv.o cons.o - ${LD} ${LD_FLAGS} --defsym ORG=${ORG2} -T ${LDSCRIPT} -o ${.TARGET} ${.ALLSRC} ${LIBSA32} +zfsboot.out: ${BTXCRT} ${OBJS} + ${LD} ${LD_FLAGS} --defsym ORG=${ORG2} -T ${LDSCRIPT} -o ${.TARGET} ${.ALLSRC} ${LIBI386} ${LIBSA32} SRCS= zfsboot.c diff --git a/stand/i386/zfsboot/zfsboot.c b/stand/i386/zfsboot/zfsboot.c index a51c69267fbc..e387d4a47c9d 100644 --- a/stand/i386/zfsboot/zfsboot.c +++ b/stand/i386/zfsboot/zfsboot.c @@ -16,7 +16,7 @@ #include __FBSDID("$FreeBSD$"); -#include "stand.h" +#include #include #include @@ -35,15 +35,16 @@ __FBSDID("$FreeBSD$"); #include #include - +#include "bootstrap.h" +#include "libi386.h" #include #include "lib.h" #include "rbx.h" -#include "drv.h" -#include "edd.h" #include "cons.h" #include "bootargs.h" +#include "disk.h" +#include "part.h" #include "paths.h" #include "libzfs.h" @@ -61,13 +62,8 @@ __FBSDID("$FreeBSD$"); #define TYPE_MAXHARD TYPE_DA #define TYPE_FD 2 -#define DEV_GELIBOOT_BSIZE 4096 - extern uint32_t _end; -#ifdef GPT -static const uuid_t freebsd_zfs_uuid = GPT_ENT_TYPE_FREEBSD_ZFS; -#endif static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 
'S' */ static const unsigned char flags[NOPT] = { RBX_DUAL, @@ -107,785 +103,153 @@ static const struct string { static const unsigned char dev_maj[NDEV] = {30, 4, 2}; +static struct i386_devdesc *bdev; static char cmd[512]; static char cmddup[512]; static char kname[1024]; -static char rootname[256]; static int comspeed = SIOSPD; static struct bootinfo bootinfo; static uint32_t bootdev; static struct zfs_boot_args zfsargs; +#ifdef LOADER_GELI_SUPPORT +static struct geli_boot_args geliargs; +#endif -vm_offset_t high_heap_base; -uint32_t bios_basemem, bios_extmem, high_heap_size; +extern vm_offset_t high_heap_base; +extern uint32_t bios_basemem, bios_extmem, high_heap_size; -static struct bios_smap smap; - -/* - * The minimum amount of memory to reserve in bios_extmem for the heap. - */ -#define HEAP_MIN (64 * 1024 * 1024) - -static char *heap_next; -static char *heap_end; - -/* Buffers that must not span a 64k boundary. */ -#define READ_BUF_SIZE 8192 -struct dmadat { - char rdbuf[READ_BUF_SIZE]; /* for reading large things */ - char secbuf[READ_BUF_SIZE]; /* for MBR/disklabel */ -}; -static struct dmadat *dmadat; +static char *heap_top; +static char *heap_bottom; void exit(int); -void reboot(void); +static void i386_zfs_probe(void); static void load(void); static int parse_cmd(void); -static void bios_getmem(void); -int main(void); #ifdef LOADER_GELI_SUPPORT #include "geliboot.h" static char gelipw[GELI_PW_MAXLEN]; #endif -struct zfsdsk { - struct dsk dsk; -#ifdef LOADER_GELI_SUPPORT - struct geli_dev *gdev; +struct arch_switch archsw; /* MI/MD interface boundary */ +static char boot_devname[2 * ZFS_MAXNAMELEN + 8]; /* disk or pool:dataset */ + +struct devsw *devsw[] = { + &bioshd, +#if defined(LOADER_ZFS_SUPPORT) + &zfs_dev, #endif + NULL }; -#include "zfsimpl.c" - -/* - * Read from a dnode (which must be from a ZPL filesystem). 
- */ -static int -zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, - size_t size) -{ - const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus; - size_t n; - int rc; - - n = size; - if (*offp + n > zp->zp_size) - n = zp->zp_size - *offp; - - rc = dnode_read(spa, dnode, *offp, start, n); - if (rc) - return (-1); - *offp += n; - - return (n); -} - -/* - * Current ZFS pool - */ -static spa_t *spa; -static spa_t *primary_spa; -static vdev_t *primary_vdev; - -/* - * A wrapper for dskread that doesn't have to worry about whether the - * buffer pointer crosses a 64k boundary. - */ -static int -vdev_read(void *xvdev, void *priv, off_t off, void *buf, size_t bytes) -{ - char *p; - daddr_t lba, alignlba; - off_t diff; - unsigned int nb, alignnb; - struct zfsdsk *zdsk = priv; - - if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) - return (-1); - - p = buf; - lba = off / DEV_BSIZE; - lba += zdsk->dsk.start; - /* - * Align reads to 4k else 4k sector GELIs will not decrypt. - * Round LBA down to nearest multiple of DEV_GELIBOOT_BSIZE bytes. - */ - alignlba = rounddown2(off, DEV_GELIBOOT_BSIZE) / DEV_BSIZE; - /* - * The read must be aligned to DEV_GELIBOOT_BSIZE bytes relative to the - * start of the GELI partition, not the start of the actual disk. - */ - alignlba += zdsk->dsk.start; - diff = (lba - alignlba) * DEV_BSIZE; - - while (bytes > 0) { - nb = bytes / DEV_BSIZE; - /* - * Ensure that the read size plus the leading offset does not - * exceed the size of the read buffer. - */ - if (nb > (READ_BUF_SIZE - diff) / DEV_BSIZE) - nb = (READ_BUF_SIZE - diff) / DEV_BSIZE; - /* - * Round the number of blocks to read up to the nearest multiple - * of DEV_GELIBOOT_BSIZE. 
- */ - alignnb = roundup2(nb * DEV_BSIZE + diff, DEV_GELIBOOT_BSIZE) - / DEV_BSIZE; - - if (zdsk->dsk.size > 0 && alignlba + alignnb > - zdsk->dsk.size + zdsk->dsk.start) { - printf("Shortening read at %lld from %d to %lld\n", - alignlba, alignnb, - (zdsk->dsk.size + zdsk->dsk.start) - alignlba); - alignnb = (zdsk->dsk.size + zdsk->dsk.start) - alignlba; - } - - if (drvread(&zdsk->dsk, dmadat->rdbuf, alignlba, alignnb)) - return (-1); -#ifdef LOADER_GELI_SUPPORT - /* decrypt */ - if (zdsk->gdev != NULL) { - if (geli_read(zdsk->gdev, - ((alignlba - zdsk->dsk.start) * DEV_BSIZE), - dmadat->rdbuf, alignnb * DEV_BSIZE)) - return (-1); - } +struct fs_ops *file_system[] = { +#if defined(LOADER_ZFS_SUPPORT) + &zfs_fsops, #endif - memcpy(p, dmadat->rdbuf + diff, nb * DEV_BSIZE); - p += nb * DEV_BSIZE; - lba += nb; - alignlba += alignnb; - bytes -= nb * DEV_BSIZE; - /* Don't need the leading offset after the first block. */ - diff = 0; - } - - return (0); -} -/* Match the signature exactly due to signature madness */ -static int -vdev_read2(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) -{ - return (vdev_read(vdev, priv, off, buf, bytes)); -} - - -static int -vdev_write(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) -{ - char *p; - daddr_t lba; - unsigned int nb; - struct zfsdsk *zdsk = priv; - - if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1))) - return (-1); - - p = buf; - lba = off / DEV_BSIZE; - lba += zdsk->dsk.start; - while (bytes > 0) { - nb = bytes / DEV_BSIZE; - if (nb > READ_BUF_SIZE / DEV_BSIZE) - nb = READ_BUF_SIZE / DEV_BSIZE; - memcpy(dmadat->rdbuf, p, nb * DEV_BSIZE); - if (drvwrite(&zdsk->dsk, dmadat->rdbuf, lba, nb)) - return (-1); - p += nb * DEV_BSIZE; - lba += nb; - bytes -= nb * DEV_BSIZE; - } - - return (0); -} - -static int -xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte) -{ - if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) { - printf("Invalid format\n"); - return (-1); - } - 
return (0); -} - -/* - * Read Pad2 (formerly "Boot Block Header") area of the first - * vdev label of the given vdev. - */ -static int -vdev_read_pad2(vdev_t *vdev, char *buf, size_t size) -{ - blkptr_t bp; - char *tmp; - off_t off = offsetof(vdev_label_t, vl_pad2); - int rc; - - if (size > VDEV_PAD_SIZE) - size = VDEV_PAD_SIZE; - - tmp = malloc(VDEV_PAD_SIZE); - if (tmp == NULL) - return (ENOMEM); - - BP_ZERO(&bp); - BP_SET_LSIZE(&bp, VDEV_PAD_SIZE); - BP_SET_PSIZE(&bp, VDEV_PAD_SIZE); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - DVA_SET_OFFSET(BP_IDENTITY(&bp), off); - rc = vdev_read_phys(vdev, &bp, tmp, off, 0); - if (rc == 0) - memcpy(buf, tmp, size); - free(tmp); - return (rc); -} - -static int -vdev_clear_pad2(vdev_t *vdev) -{ - char *zeroes; - uint64_t *end; - off_t off = offsetof(vdev_label_t, vl_pad2); - int rc; - - zeroes = malloc(VDEV_PAD_SIZE); - if (zeroes == NULL) - return (ENOMEM); - - memset(zeroes, 0, VDEV_PAD_SIZE); - end = (uint64_t *)(zeroes + VDEV_PAD_SIZE); - /* ZIO_CHECKSUM_LABEL magic and pre-calcualted checksum for all zeros */ - end[-5] = 0x0210da7ab10c7a11; - end[-4] = 0x97f48f807f6e2a3f; - end[-3] = 0xaf909f1658aacefc; - end[-2] = 0xcbd1ea57ff6db48b; - end[-1] = 0x6ec692db0d465fab; - rc = vdev_write(vdev, vdev->v_read_priv, off, zeroes, VDEV_PAD_SIZE); - free(zeroes); - return (rc); -} - -static void -bios_getmem(void) -{ - uint64_t size; - - /* Parse system memory map */ - v86.ebx = 0; - do { - v86.ctl = V86_FLAGS; - v86.addr = 0x15; /* int 0x15 function 0xe820 */ - v86.eax = 0xe820; - v86.ecx = sizeof(struct bios_smap); - v86.edx = SMAP_SIG; - v86.es = VTOPSEG(&smap); - v86.edi = VTOPOFF(&smap); - v86int(); - if (V86_CY(v86.efl) || (v86.eax != SMAP_SIG)) - break; - /* look for a low-memory segment that's large enough */ - if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0) && - (smap.length >= (512 * 1024))) - bios_basemem = smap.length; - /* look for the first segment in 'extended' memory 
*/ - if ((smap.type == SMAP_TYPE_MEMORY) && - (smap.base == 0x100000)) { - bios_extmem = smap.length; - } - - /* - * Look for the largest segment in 'extended' memory beyond - * 1MB but below 4GB. - */ - if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base > 0x100000) && - (smap.base < 0x100000000ull)) { - size = smap.length; - - /* - * If this segment crosses the 4GB boundary, - * truncate it. - */ - if (smap.base + size > 0x100000000ull) - size = 0x100000000ull - smap.base; - - if (size > high_heap_size) { - high_heap_size = size; - high_heap_base = smap.base; - } - } - } while (v86.ebx != 0); - - /* Fall back to the old compatibility function for base memory */ - if (bios_basemem == 0) { - v86.ctl = 0; - v86.addr = 0x12; /* int 0x12 */ - v86int(); - - bios_basemem = (v86.eax & 0xffff) * 1024; - } - - /* - * Fall back through several compatibility functions for extended - * memory. - */ - if (bios_extmem == 0) { - v86.ctl = V86_FLAGS; - v86.addr = 0x15; /* int 0x15 function 0xe801 */ - v86.eax = 0xe801; - v86int(); - if (!V86_CY(v86.efl)) { - bios_extmem = ((v86.ecx & 0xffff) + - ((v86.edx & 0xffff) * 64)) * 1024; - } - } - if (bios_extmem == 0) { - v86.ctl = 0; - v86.addr = 0x15; /* int 0x15 function 0x88 */ - v86.eax = 0x8800; - v86int(); - bios_extmem = (v86.eax & 0xffff) * 1024; - } - - /* - * If we have extended memory and did not find a suitable heap - * region in the SMAP, use the last 3MB of 'extended' memory as a - * high heap candidate. 
- */ - if (bios_extmem >= HEAP_MIN && high_heap_size < HEAP_MIN) { - high_heap_size = HEAP_MIN; - high_heap_base = bios_extmem + 0x100000 - HEAP_MIN; - } -} - -/* - * Try to detect a device supported by the legacy int13 BIOS - */ -static int -int13probe(int drive) -{ - v86.ctl = V86_FLAGS; - v86.addr = 0x13; - v86.eax = 0x800; - v86.edx = drive; - v86int(); - - if (!V86_CY(v86.efl) && /* carry clear */ - ((v86.edx & 0xff) != (drive & DRV_MASK))) { /* unit # OK */ - if ((v86.ecx & 0x3f) == 0) { /* absurd sector size */ - return (0); /* skip device */ - } - return (1); - } - return (0); -} - -/* - * We call this when we find a ZFS vdev - ZFS consumes the dsk - * structure so we must make a new one. - */ -static struct zfsdsk * -copy_dsk(struct zfsdsk *zdsk) -{ - struct zfsdsk *newdsk; - - newdsk = malloc(sizeof(struct zfsdsk)); - *newdsk = *zdsk; - return (newdsk); -} - -/* - * Get disk size from GPT. - */ -static uint64_t -drvsize_gpt(struct dsk *dskp) -{ -#ifdef GPT - struct gpt_hdr hdr; - char *sec; - - sec = dmadat->secbuf; - if (drvread(dskp, sec, 1, 1)) - return (0); - - memcpy(&hdr, sec, sizeof(hdr)); - if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0 || - hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000 || - hdr.hdr_entsz < sizeof(struct gpt_ent) || - DEV_BSIZE % hdr.hdr_entsz != 0) { - return (0); - } - return (hdr.hdr_lba_alt + 1); -#else - return (0); +#if defined(LOADER_UFS_SUPPORT) + &ufs_fsops, #endif -} + NULL +}; -/* - * Get disk size from eax=0x800 and 0x4800. We need to probe both - * because 0x4800 may not be available and we would like to get more - * or less correct disk size - if it is possible at all. - * Note we do not really want to touch drv.c because that code is shared - * with boot2 and we can not afford to grow that code. 
- */ -static uint64_t -drvsize_ext(struct zfsdsk *zdsk) +caddr_t +ptov(uintptr_t x) { - struct dsk *dskp; - uint64_t size, tmp; - int cyl, hds, sec; - - dskp = &zdsk->dsk; - - /* Try to read disk size from GPT */ - size = drvsize_gpt(dskp); - if (size != 0) - return (size); - - v86.ctl = V86_FLAGS; - v86.addr = 0x13; - v86.eax = 0x800; - v86.edx = dskp->drive; - v86int(); - - /* Don't error out if we get bad sector number, try EDD as well */ - if (V86_CY(v86.efl) || /* carry set */ - (v86.edx & 0xff) <= (unsigned)(dskp->drive & 0x7f)) /* unit # bad */ - return (0); - cyl = ((v86.ecx & 0xc0) << 2) + ((v86.ecx & 0xff00) >> 8) + 1; - /* Convert max head # -> # of heads */ - hds = ((v86.edx & 0xff00) >> 8) + 1; - sec = v86.ecx & 0x3f; - - size = (uint64_t)cyl * hds * sec; - - /* Determine if we can use EDD with this device. */ - v86.ctl = V86_FLAGS; - v86.addr = 0x13; - v86.eax = 0x4100; - v86.edx = dskp->drive; - v86.ebx = 0x55aa; - v86int(); - if (V86_CY(v86.efl) || /* carry set */ - (v86.ebx & 0xffff) != 0xaa55 || /* signature */ - (v86.ecx & EDD_INTERFACE_FIXED_DISK) == 0) - return (size); - - tmp = drvsize(dskp); - if (tmp > size) - size = tmp; - - return (size); -} - -/* - * The "layered" ioctl to read disk/partition size. Unfortunately - * the zfsboot case is hardest, because we do not have full software - * stack available, so we need to do some manual work here. 
- */ -uint64_t -ldi_get_size(void *priv) -{ - struct zfsdsk *zdsk = priv; - uint64_t size = zdsk->dsk.size; - - if (zdsk->dsk.start == 0) - size = drvsize_ext(zdsk); - - return (size * DEV_BSIZE); -} - -static void -probe_drive(struct zfsdsk *zdsk) -{ -#ifdef GPT - struct gpt_hdr hdr; - struct gpt_ent *ent; - unsigned part, entries_per_sec; - daddr_t slba; -#endif -#if defined(GPT) || defined(LOADER_GELI_SUPPORT) - daddr_t elba; -#endif - - struct dos_partition *dp; - char *sec; - unsigned i; - -#ifdef LOADER_GELI_SUPPORT - /* - * Taste the disk, if it is GELI encrypted, decrypt it then dig out the - * partition table and probe each slice/partition in turn for a vdev or - * GELI encrypted vdev. - */ - elba = drvsize_ext(zdsk); - if (elba > 0) { - elba--; - } - zdsk->gdev = geli_taste(vdev_read, zdsk, elba, "disk%u:0:"); - if ((zdsk->gdev != NULL) && (geli_havekey(zdsk->gdev) == 0)) - geli_passphrase(zdsk->gdev, gelipw); -#endif /* LOADER_GELI_SUPPORT */ - - sec = dmadat->secbuf; - zdsk->dsk.start = 0; - -#ifdef GPT - /* - * First check for GPT. - */ - if (drvread(&zdsk->dsk, sec, 1, 1)) { - return; - } - memcpy(&hdr, sec, sizeof(hdr)); - if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0 || - hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000 || - hdr.hdr_entsz < sizeof(*ent) || DEV_BSIZE % hdr.hdr_entsz != 0) { - goto trymbr; - } - - /* - * Probe all GPT partitions for the presence of ZFS pools. We - * return the spa_t for the first we find (if requested). This - * will have the effect of booting from the first pool on the - * disk. 
- * - * If no vdev is found, GELI decrypting the device and try again - */ - entries_per_sec = DEV_BSIZE / hdr.hdr_entsz; - slba = hdr.hdr_lba_table; - elba = slba + hdr.hdr_entries / entries_per_sec; - while (slba < elba) { - zdsk->dsk.start = 0; - if (drvread(&zdsk->dsk, sec, slba, 1)) - return; - for (part = 0; part < entries_per_sec; part++) { - ent = (struct gpt_ent *)(sec + part * hdr.hdr_entsz); - if (memcmp(&ent->ent_type, &freebsd_zfs_uuid, - sizeof(uuid_t)) == 0) { - zdsk->dsk.start = ent->ent_lba_start; - zdsk->dsk.size = - ent->ent_lba_end - ent->ent_lba_start + 1; - zdsk->dsk.slice = part + 1; - zdsk->dsk.part = 255; - if (vdev_probe(vdev_read2, zdsk, NULL) == 0) { - /* - * This slice had a vdev. We need a new - * dsk structure now since the vdev now - * owns this one. - */ - zdsk = copy_dsk(zdsk); - } -#ifdef LOADER_GELI_SUPPORT - else if ((zdsk->gdev = geli_taste(vdev_read, - zdsk, ent->ent_lba_end - ent->ent_lba_start, - "disk%up%u:", zdsk->dsk.unit, - zdsk->dsk.slice)) != NULL) { - if (geli_havekey(zdsk->gdev) == 0 || - geli_passphrase(zdsk->gdev, gelipw) - == 0) { - /* - * This slice has GELI, - * check it for ZFS. - */ - if (vdev_probe(vdev_read2, - zdsk, NULL) == 0) { - /* - * This slice had a - * vdev. We need a new - * dsk structure now - * since the vdev now - * owns this one. 
- */ - zdsk = copy_dsk(zdsk); - } - break; - } - } -#endif /* LOADER_GELI_SUPPORT */ - } - } - slba++; - } - return; -trymbr: -#endif /* GPT */ - - if (drvread(&zdsk->dsk, sec, DOSBBSECTOR, 1)) - return; - dp = (void *)(sec + DOSPARTOFF); - - for (i = 0; i < NDOSPART; i++) { - if (!dp[i].dp_typ) - continue; - zdsk->dsk.start = dp[i].dp_start; - zdsk->dsk.size = dp[i].dp_size; - zdsk->dsk.slice = i + 1; - if (vdev_probe(vdev_read2, zdsk, NULL) == 0) { - zdsk = copy_dsk(zdsk); - } -#ifdef LOADER_GELI_SUPPORT - else if ((zdsk->gdev = geli_taste(vdev_read, zdsk, - dp[i].dp_size - dp[i].dp_start, "disk%us%u:")) != NULL) { - if (geli_havekey(zdsk->gdev) == 0 || - geli_passphrase(zdsk->gdev, gelipw) == 0) { - /* - * This slice has GELI, check it for ZFS. - */ - if (vdev_probe(vdev_read2, zdsk, NULL) == 0) { - /* - * This slice had a vdev. We need a new - * dsk structure now since the vdev now - * owns this one. - */ - zdsk = copy_dsk(zdsk); - } - break; - } - } -#endif /* LOADER_GELI_SUPPORT */ - } + return (PTOV(x)); } int main(void) { - dnode_phys_t dn; - off_t off; - struct zfsdsk *zdsk; - int autoboot, i; - int nextboot; - int rc; - - dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base); + unsigned i; + int auto_boot, fd, nextboot = 0; + struct disk_devdesc devdesc; bios_getmem(); if (high_heap_size > 0) { - heap_end = PTOV(high_heap_base + high_heap_size); - heap_next = PTOV(high_heap_base); + heap_top = PTOV(high_heap_base + high_heap_size); + heap_bottom = PTOV(high_heap_base); } else { - heap_next = (char *)dmadat + sizeof(*dmadat); - heap_end = (char *)PTOV(bios_basemem); + heap_bottom = (char *) + (roundup2(__base + (int32_t)&_end, 0x10000) - __base); + heap_top = (char *)PTOV(bios_basemem); } - setheap(heap_next, heap_end); + setheap(heap_bottom, heap_top); - zdsk = calloc(1, sizeof(struct zfsdsk)); - zdsk->dsk.drive = *(uint8_t *)PTOV(ARGS); - zdsk->dsk.type = zdsk->dsk.drive & DRV_HARD ? 
TYPE_AD : TYPE_FD; - zdsk->dsk.unit = zdsk->dsk.drive & DRV_MASK; - zdsk->dsk.slice = *(uint8_t *)PTOV(ARGS + 1) + 1; - zdsk->dsk.part = 0; - zdsk->dsk.start = 0; - zdsk->dsk.size = drvsize_ext(zdsk); + /* + * Initialise the block cache. Set the upper limit. + */ + bcache_init(32768, 512); + + archsw.arch_autoload = NULL; + archsw.arch_getdev = i386_getdev; + archsw.arch_copyin = NULL; + archsw.arch_copyout = NULL; + archsw.arch_readin = NULL; + archsw.arch_isainb = NULL; + archsw.arch_isaoutb = NULL; + archsw.arch_zfs_probe = i386_zfs_probe; bootinfo.bi_version = BOOTINFO_VERSION; bootinfo.bi_size = sizeof(bootinfo); bootinfo.bi_basemem = bios_basemem / 1024; bootinfo.bi_extmem = bios_extmem / 1024; bootinfo.bi_memsizes_valid++; - bootinfo.bi_bios_dev = zdsk->dsk.drive; + bootinfo.bi_bios_dev = *(uint8_t *)PTOV(ARGS); - bootdev = MAKEBOOTDEV(dev_maj[zdsk->dsk.type], - zdsk->dsk.slice, zdsk->dsk.unit, zdsk->dsk.part); + /* Set up fall back device name. */ + snprintf(boot_devname, sizeof (boot_devname), "disk%d:", + bd_bios2unit(bootinfo.bi_bios_dev)); - /* Process configuration file */ + for (i = 0; devsw[i] != NULL; i++) + if (devsw[i]->dv_init != NULL) + (devsw[i]->dv_init)(); - autoboot = 1; + disk_parsedev(&devdesc, boot_devname + 4, NULL); - zfs_init(); + bootdev = MAKEBOOTDEV(dev_maj[DEVT_DISK], devdesc.d_slice + 1, + devdesc.dd.d_unit, + devdesc.d_partition >= 0 ? devdesc.d_partition : 0xff); /* - * Probe the boot drive first - we will try to boot from whatever - * pool we find on that drive. + * zfs_fmtdev() can be called only after dv_init */ - probe_drive(zdsk); - - /* - * Probe the rest of the drives that the bios knows about. This - * will find any other available pools and it may fill in missing - * vdevs for the boot pool. 
- */ -#ifndef VIRTUALBOX - for (i = 0; i < *(unsigned char *)PTOV(BIOS_NUMDRIVES); i++) -#else - for (i = 0; i < MAXBDDEV; i++) -#endif - { - if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS)) - continue; - - if (!int13probe(i | DRV_HARD)) - break; - - zdsk = calloc(1, sizeof(struct zfsdsk)); - zdsk->dsk.drive = i | DRV_HARD; - zdsk->dsk.type = zdsk->dsk.drive & TYPE_AD; - zdsk->dsk.unit = i; - zdsk->dsk.slice = 0; - zdsk->dsk.part = 0; - zdsk->dsk.start = 0; - zdsk->dsk.size = drvsize_ext(zdsk); - probe_drive(zdsk); - } - - /* - * The first discovered pool, if any, is the pool. - */ - spa = spa_get_primary(); - if (!spa) { - printf("%s: No ZFS pools located, can't boot\n", BOOTPROG); - for (;;) - ; - } - - primary_spa = spa; - primary_vdev = spa_get_primary_vdev(spa); - - nextboot = 0; - rc = vdev_read_pad2(primary_vdev, cmd, sizeof(cmd)); - if (vdev_clear_pad2(primary_vdev)) - printf("failed to clear pad2 area of primary vdev\n"); - if (rc == 0) { - if (*cmd) { - /* - * We could find an old-style ZFS Boot Block header - * here. Simply ignore it. - */ - if (*(uint64_t *)cmd != 0x2f5b007b10c) { - /* - * Note that parse() is destructive to cmd[] - * and we also want to honor RBX_QUIET option - * that could be present in cmd[]. 
- */ - nextboot = 1; - memcpy(cmddup, cmd, sizeof(cmd)); - if (parse_cmd()) { - printf("failed to parse pad2 area of " - "primary vdev\n"); - reboot(); - } + if (bdev != NULL && bdev->dd.d_dev->dv_type == DEVT_ZFS) { + /* set up proper device name string for ZFS */ + strncpy(boot_devname, zfs_fmtdev(bdev), sizeof (boot_devname)); + if (zfs_nextboot(bdev, cmd, sizeof(cmd)) == 0) { + nextboot = 1; + memcpy(cmddup, cmd, sizeof(cmd)); + if (parse_cmd()) { if (!OPT_CHECK(RBX_QUIET)) - printf("zfs nextboot: %s\n", cmddup); + printf("failed to parse pad2 area\n"); + exit(0); } + if (!OPT_CHECK(RBX_QUIET)) + printf("zfs nextboot: %s\n", cmddup); /* Do not process this command twice */ *cmd = 0; } - } else - printf("failed to read pad2 area of primary vdev\n"); + } - /* Mount ZFS only if it's not already mounted via nextboot parsing. */ - if (zfsmount.spa == NULL && - (zfs_spa_init(spa) != 0 || zfs_mount(spa, 0, &zfsmount) != 0)) { - printf("%s: failed to mount default pool %s\n", - BOOTPROG, spa->spa_name); - autoboot = 0; - } else if (zfs_lookup(&zfsmount, PATH_CONFIG, &dn) == 0 || - zfs_lookup(&zfsmount, PATH_DOTCONFIG, &dn) == 0) { - off = 0; - zfs_read(spa, &dn, &off, cmd, sizeof(cmd)); + /* now make sure we have bdev on all cases */ + free(bdev); + i386_getdev((void **)&bdev, boot_devname, NULL); + + env_setenv("currdev", EV_VOLATILE, boot_devname, i386_setcurrdev, + env_nounset); + + /* Process configuration file */ + auto_boot = 1; + + fd = open(PATH_CONFIG, O_RDONLY); + if (fd == -1) + fd = open(PATH_DOTCONFIG, O_RDONLY); + + if (fd != -1) { + read(fd, cmd, sizeof (cmd)); + close(fd); } if (*cmd) { @@ -896,7 +260,7 @@ main(void) */ memcpy(cmddup, cmd, sizeof(cmd)); if (parse_cmd()) - autoboot = 0; + auto_boot = 0; if (!OPT_CHECK(RBX_QUIET)) printf("%s: %s\n", PATH_CONFIG, cmddup); /* Do not process this command twice */ @@ -904,10 +268,10 @@ main(void) } /* Do not risk waiting at the prompt forever. 
*/ - if (nextboot && !autoboot) - reboot(); + if (nextboot && !auto_boot) + exit(0); - if (autoboot && !*kname) { + if (auto_boot && !*kname) { /* * Iterate through the list of loader and kernel paths, * trying to load. If interrupted by a keypress, or in case of @@ -924,28 +288,17 @@ main(void) /* Present the user with the boot2 prompt. */ for (;;) { - if (!autoboot || !OPT_CHECK(RBX_QUIET)) { + if (!auto_boot || !OPT_CHECK(RBX_QUIET)) { printf("\nFreeBSD/x86 boot\n"); - if (zfs_rlookup(spa, zfsmount.rootobj, rootname) != 0) - printf("Default: %s/<0x%llx>:%s\n" - "boot: ", - spa->spa_name, zfsmount.rootobj, kname); - else if (rootname[0] != '\0') - printf("Default: %s/%s:%s\n" - "boot: ", - spa->spa_name, rootname, kname); - else - printf("Default: %s:%s\n" - "boot: ", - spa->spa_name, kname); + printf("Default: %s%s\nboot: ", boot_devname, kname); } if (ioctrl & IO_SERIAL) sio_flush(); - if (!autoboot || keyhit(5)) + if (!auto_boot || keyhit(5)) getstr(cmd, sizeof(cmd)); - else if (!autoboot || !OPT_CHECK(RBX_QUIET)) + else if (!auto_boot || !OPT_CHECK(RBX_QUIET)) putchar('\n'); - autoboot = 0; + auto_boot = 0; if (parse_cmd()) putchar('\a'); else @@ -960,12 +313,6 @@ exit(int x) __exit(x); } -void -reboot(void) -{ - __exit(0); -} - static void load(void) { @@ -976,155 +323,229 @@ load(void) static Elf32_Phdr ep[2]; static Elf32_Shdr es[2]; caddr_t p; - dnode_phys_t dn; - off_t off; uint32_t addr, x; - int fmt, i, j; + int fd, fmt, i, j; + ssize_t size; - if (zfs_lookup(&zfsmount, kname, &dn)) { + if ((fd = open(kname, O_RDONLY)) == -1) { printf("\nCan't find %s\n", kname); return; } - off = 0; - if (xfsread(&dn, &off, &hdr, sizeof(hdr))) + + size = sizeof(hdr); + if (read(fd, &hdr, sizeof (hdr)) != size) { + close(fd); return; - if (N_GETMAGIC(hdr.ex) == ZMAGIC) + } + if (N_GETMAGIC(hdr.ex) == ZMAGIC) { fmt = 0; - else if (IS_ELF(hdr.eh)) + } else if (IS_ELF(hdr.eh)) { fmt = 1; - else { + } else { printf("Invalid %s\n", "format"); + close(fd); return; } if (fmt 
== 0) { addr = hdr.ex.a_entry & 0xffffff; p = PTOV(addr); - off = PAGE_SIZE; - if (xfsread(&dn, &off, p, hdr.ex.a_text)) + lseek(fd, PAGE_SIZE, SEEK_SET); + size = hdr.ex.a_text; + if (read(fd, p, hdr.ex.a_text) != size) { + close(fd); return; + } p += roundup2(hdr.ex.a_text, PAGE_SIZE); - if (xfsread(&dn, &off, p, hdr.ex.a_data)) + size = hdr.ex.a_data; + if (read(fd, p, hdr.ex.a_data) != size) { + close(fd); return; + } p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms)); p += sizeof(hdr.ex.a_syms); if (hdr.ex.a_syms) { - if (xfsread(&dn, &off, p, hdr.ex.a_syms)) + size = hdr.ex.a_syms; + if (read(fd, p, hdr.ex.a_syms) != size) { + close(fd); return; + } p += hdr.ex.a_syms; - if (xfsread(&dn, &off, p, sizeof(int))) + size = sizeof (int); + if (read(fd, p, sizeof (int)) != size) { + close(fd); return; + } x = *(uint32_t *)p; p += sizeof(int); x -= sizeof(int); - if (xfsread(&dn, &off, p, x)) + size = x; + if (read(fd, p, x) != size) { + close(fd); return; + } p += x; } } else { - off = hdr.eh.e_phoff; + lseek(fd, hdr.eh.e_phoff, SEEK_SET); for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) { - if (xfsread(&dn, &off, ep + j, sizeof(ep[0]))) + size = sizeof (ep[0]); + if (read(fd, ep + j, sizeof (ep[0])) != size) { + close(fd); return; + } if (ep[j].p_type == PT_LOAD) j++; } for (i = 0; i < 2; i++) { p = PTOV(ep[i].p_paddr & 0xffffff); - off = ep[i].p_offset; - if (xfsread(&dn, &off, p, ep[i].p_filesz)) + lseek(fd, ep[i].p_offset, SEEK_SET); + size = ep[i].p_filesz; + if (read(fd, p, ep[i].p_filesz) != size) { + close(fd); return; + } } p += roundup2(ep[1].p_memsz, PAGE_SIZE); bootinfo.bi_symtab = VTOP(p); if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) { - off = hdr.eh.e_shoff + sizeof(es[0]) * - (hdr.eh.e_shstrndx + 1); - if (xfsread(&dn, &off, &es, sizeof(es))) + lseek(fd, hdr.eh.e_shoff + + sizeof (es[0]) * (hdr.eh.e_shstrndx + 1), + SEEK_SET); + size = sizeof(es); + if (read(fd, &es, 
sizeof (es)) != size) { + close(fd); return; + } for (i = 0; i < 2; i++) { memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size)); p += sizeof(es[i].sh_size); - off = es[i].sh_offset; - if (xfsread(&dn, &off, p, es[i].sh_size)) + lseek(fd, es[i].sh_offset, SEEK_SET); + size = es[i].sh_size; + if (read(fd, p, es[i].sh_size) != size) { + close(fd); return; + } p += es[i].sh_size; } } addr = hdr.eh.e_entry & 0xffffff; } + close(fd); + bootinfo.bi_esymtab = VTOP(p); bootinfo.bi_kernelname = VTOP(kname); - zfsargs.size = sizeof(zfsargs); - zfsargs.pool = zfsmount.spa->spa_guid; - zfsargs.root = zfsmount.rootobj; - zfsargs.primary_pool = primary_spa->spa_guid; #ifdef LOADER_GELI_SUPPORT explicit_bzero(gelipw, sizeof(gelipw)); - export_geli_boot_data(&zfsargs.gelidata); #endif - if (primary_vdev != NULL) - zfsargs.primary_vdev = primary_vdev->v_guid; - else - printf("failed to detect primary vdev\n"); - /* - * Note that the zfsargs struct is passed by value, not by pointer. - * Code in btxldr.S copies the values from the entry stack to a fixed - * location within loader(8) at startup due to the presence of - * KARGS_FLAGS_EXTARG. - */ - __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), - bootdev, - KARGS_FLAGS_ZFS | KARGS_FLAGS_EXTARG, - (uint32_t)spa->spa_guid, - (uint32_t)(spa->spa_guid >> 32), - VTOP(&bootinfo), - zfsargs); + + if (bdev->dd.d_dev->dv_type == DEVT_ZFS) { + zfsargs.size = sizeof(zfsargs); + zfsargs.pool = bdev->d_kind.zfs.pool_guid; + zfsargs.root = bdev->d_kind.zfs.root_guid; +#ifdef LOADER_GELI_SUPPORT + export_geli_boot_data(&zfsargs.gelidata); +#endif + /* + * Note that the zfsargs struct is passed by value, not by + * pointer. Code in btxldr.S copies the values from the entry + * stack to a fixed location within loader(8) at startup due + * to the presence of KARGS_FLAGS_EXTARG. 
+ */ + __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), + bootdev, + KARGS_FLAGS_ZFS | KARGS_FLAGS_EXTARG, + (uint32_t)bdev->d_kind.zfs.pool_guid, + (uint32_t)(bdev->d_kind.zfs.pool_guid >> 32), + VTOP(&bootinfo), + zfsargs); + } else { +#ifdef LOADER_GELI_SUPPORT + geliargs.size = sizeof(geliargs); + export_geli_boot_data(&geliargs.gelidata); +#endif + + /* + * Note that the geliargs struct is passed by value, not by + * pointer. Code in btxldr.S copies the values from the entry + * stack to a fixed location within loader(8) at startup due + * to the presence of the KARGS_FLAGS_EXTARG flag. + */ + __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK), + bootdev, +#ifdef LOADER_GELI_SUPPORT + KARGS_FLAGS_GELI | KARGS_FLAGS_EXTARG, 0, 0, + VTOP(&bootinfo), geliargs +#else + 0, 0, 0, VTOP(&bootinfo) +#endif + ); + } } static int -zfs_mount_ds(char *dsname) +mount_root(char *arg) { - uint64_t newroot; - spa_t *newspa; - char *q; + char *root; + struct i386_devdesc *ddesc; + uint8_t part; - q = strchr(dsname, '/'); - if (q) - *q++ = '\0'; - newspa = spa_find_by_name(dsname); - if (newspa == NULL) { - printf("\nCan't find ZFS pool %s\n", dsname); - return (-1); + if (asprintf(&root, "%s:", arg) < 0) + return (1); + + if (i386_getdev((void **)&ddesc, root, NULL)) { + free(root); + return (1); } - if (zfs_spa_init(newspa)) - return (-1); - - newroot = 0; - if (q) { - if (zfs_lookup_dataset(newspa, q, &newroot)) { - printf("\nCan't find dataset %s in ZFS pool %s\n", - q, newspa->spa_name); - return (-1); - } + /* we should have new device descriptor, free old and replace it. 
*/ + free(bdev); + bdev = ddesc; + if (bdev->dd.d_dev->dv_type == DEVT_DISK) { + if (bdev->d_kind.biosdisk.partition == -1) + part = 0xff; + else + part = bdev->d_kind.biosdisk.partition; + bootdev = MAKEBOOTDEV(dev_maj[bdev->dd.d_dev->dv_type], + bdev->d_kind.biosdisk.slice + 1, + bdev->dd.d_unit, part); + bootinfo.bi_bios_dev = bd_unit2bios(bdev); } - if (zfs_mount(newspa, newroot, &zfsmount)) { - printf("\nCan't mount ZFS dataset\n"); - return (-1); - } - spa = newspa; + strncpy(boot_devname, root, sizeof (boot_devname)); + setenv("currdev", root, 1); + free(root); return (0); } +static void +fs_list(char *arg) +{ + int fd; + struct dirent *d; + char line[80]; + + fd = open(arg, O_RDONLY); + if (fd < 0) + return; + pager_open(); + while ((d = readdirfd(fd)) != NULL) { + snprintf(line, sizeof(line), "%s\n", d->d_name); /* never write past line[] */ + if (pager_output(line)) + break; + } + pager_close(); + close(fd); +} + static int parse_cmd(void) { char *arg = cmd; char *ep, *p, *q; const char *cp; + char line[80]; int c, i, j; while ((c = *arg++)) { @@ -1173,13 +594,15 @@ parse_cmd(void) ioctrl &= ~IO_SERIAL; } } if (c == '?') { - dnode_phys_t dn; - - if (zfs_lookup(&zfsmount, arg, &dn) == 0) { - zap_list(spa, &dn); - } + printf("\n"); + if (*arg == '\0') + arg = (char *)"/"; + fs_list(arg); + zfs_list(arg); return (-1); } else { + char *ptr; + printf("\n"); arg--; /* @@ -1187,24 +610,39 @@ parse_cmd(void) * hope no-one wants to load /status as a kernel. */ if (strcmp(arg, "status") == 0) { - spa_all_status(); + pager_open(); + for (i = 0; devsw[i] != NULL; i++) { + if (devsw[i]->dv_print != NULL) { + if (devsw[i]->dv_print(1)) + break; + } else { + snprintf(line, sizeof(line), + "%s: (unknown)\n", + devsw[i]->dv_name); + if (pager_output(line)) + break; + } + } + pager_close(); return (-1); } /* * If there is "zfs:" prefix simply ignore it. */ - if (strncmp(arg, "zfs:", 4) == 0) - arg += 4; + ptr = arg; + if (strncmp(ptr, "zfs:", 4) == 0) + ptr += 4; /* * If there is a colon, switch pools.
*/ - q = strchr(arg, ':'); + q = strchr(ptr, ':'); if (q) { *q++ = '\0'; - if (zfs_mount_ds(arg) != 0) + if (mount_root(arg) != 0) { return (-1); + } arg = q; } if ((i = ep - arg)) { @@ -1217,3 +655,43 @@ parse_cmd(void) } return (0); } + +/* + * Probe all disks to discover ZFS pools. The idea is to walk all possible + * disk devices, however, we also need to identify possible boot pool. + * For boot pool detection we have boot disk passed us from BIOS, recorded + * in bootinfo.bi_bios_dev. + */ +static void +i386_zfs_probe(void) +{ + char devname[32]; + int boot_unit; + struct i386_devdesc dev; + uint64_t pool_guid = 0; + + dev.dd.d_dev = &bioshd; + /* Translate bios dev to our unit number. */ + boot_unit = bd_bios2unit(bootinfo.bi_bios_dev); + + /* + * Open all the disks we can find and see if we can reconstruct + * ZFS pools from them. + */ + for (dev.dd.d_unit = 0; bd_unit2bios(&dev) >= 0; dev.dd.d_unit++) { + snprintf(devname, sizeof (devname), "%s%d:", bioshd.dv_name, + dev.dd.d_unit); + /* If this is not boot disk, use generic probe. 
*/ + if (dev.dd.d_unit != boot_unit) + zfs_probe_dev(devname, NULL); + else + zfs_probe_dev(devname, &pool_guid); + + if (pool_guid != 0 && bdev == NULL) { + bdev = malloc(sizeof (struct i386_devdesc)); + bzero(bdev, sizeof (struct i386_devdesc)); + bdev->dd.d_dev = &zfs_dev; + bdev->d_kind.zfs.pool_guid = pool_guid; + } + } +} diff --git a/stand/libsa/zfs/Makefile.inc b/stand/libsa/zfs/Makefile.inc index 0c6632b438f3..5e4e2d455c83 100644 --- a/stand/libsa/zfs/Makefile.inc +++ b/stand/libsa/zfs/Makefile.inc @@ -1,7 +1,7 @@ # $FreeBSD$ .PATH: ${ZFSSRC} -SRCS+= zfs.c skein.c skein_block.c list.c +SRCS+= zfs.c nvlist.c skein.c skein_block.c list.c # Do not unroll skein loops, reduce code size CFLAGS+= -DSKEIN_LOOP=111 .PATH: ${SYSDIR}/crypto/skein diff --git a/stand/libsa/zfs/libzfs.h b/stand/libsa/zfs/libzfs.h index fef59e1bb13a..a5df0c1b8255 100644 --- a/stand/libsa/zfs/libzfs.h +++ b/stand/libsa/zfs/libzfs.h @@ -26,6 +26,12 @@ * $FreeBSD$ */ +#include + +#ifdef LOADER_GELI_SUPPORT +#include +#endif + #ifndef _BOOT_LIBZFS_H_ #define _BOOT_LIBZFS_H_ @@ -40,13 +46,80 @@ struct zfs_devdesc { uint64_t root_guid; }; -#ifdef LOADER_GELI_SUPPORT -#include -#endif +/* nvp implementation version */ +#define NV_VERSION 0 + +/* nvlist persistent unique name flags, stored in nvl_nvflags */ +#define NV_UNIQUE_NAME 0x1 +#define NV_UNIQUE_NAME_TYPE 0x2 + +#define NV_ALIGN4(x) (((x) + 3) & ~3) + +/* + * nvlist header. + * nvlist has 4 bytes header followed by version and flags, then nvpairs + * and the list is terminated by double zero. + */ +typedef struct { + char nvh_encoding; + char nvh_endian; + char nvh_reserved1; + char nvh_reserved2; +} nvs_header_t; + +typedef struct { + nvs_header_t nv_header; + size_t nv_asize; + size_t nv_size; + uint8_t *nv_data; + uint8_t *nv_idx; +} nvlist_t; + +/* + * nvpair header. 
+ * nvpair has encoded and decoded size + * name string (size and data) + * data type and number of elements + * data + */ +typedef struct { + unsigned encoded_size; + unsigned decoded_size; +} nvp_header_t; + +/* + * nvlist stream head. + */ +typedef struct { + unsigned nvl_version; + unsigned nvl_nvflag; + nvp_header_t nvl_pair; +} nvs_data_t; + +typedef struct { + unsigned nv_size; + uint8_t nv_data[]; /* NV_ALIGN4(string) */ +} nv_string_t; + +typedef struct { + unsigned nv_type; /* data_type_t */ + unsigned nv_nelem; /* number of elements */ + uint8_t nv_data[]; /* data stream */ +} nv_pair_data_t; + +nvlist_t *nvlist_create(int); +void nvlist_destroy(nvlist_t *); +nvlist_t *nvlist_import(const uint8_t *, char, char); +int nvlist_remove(nvlist_t *, const char *, data_type_t); +void nvlist_print(nvlist_t *, unsigned int); +int nvlist_find(const nvlist_t *, const char *, data_type_t, + int *, void *, int *); +int nvlist_next(nvlist_t *); int zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path); char *zfs_fmtdev(void *vdev); +int zfs_nextboot(void *vdev, char *buf, size_t size); int zfs_probe_dev(const char *devname, uint64_t *pool_guid); int zfs_list(const char *name); uint64_t ldi_get_size(void *); diff --git a/stand/libsa/zfs/nvlist.c b/stand/libsa/zfs/nvlist.c new file mode 100644 index 000000000000..910d25499401 --- /dev/null +++ b/stand/libsa/zfs/nvlist.c @@ -0,0 +1,601 @@ +/*- + * Copyright 2020 Toomas Soome + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include "libzfs.h" + +typedef struct xdr { + int (*xdr_getint)(const struct xdr *, const void *, int *); +} xdr_t; + +static int xdr_int(const xdr_t *, const void *, int *); +static int mem_int(const xdr_t *, const void *, int *); +static void nvlist_decode_nvlist(const xdr_t *, nvlist_t *); +static int nvlist_size(const xdr_t *, const uint8_t *); + +/* + * transform data from network to host. + */ +xdr_t ntoh = { + .xdr_getint = xdr_int +}; + +/* + * transform data from host to host. + */ +xdr_t native = { + .xdr_getint = mem_int +}; + +/* + * transform data from host to network. 
+ */ +xdr_t hton = { + .xdr_getint = xdr_int +}; + +static int +xdr_short(const xdr_t *xdr, const uint8_t *buf, short *ip) +{ + int i, rv; + + rv = xdr->xdr_getint(xdr, buf, &i); + *ip = i; + return (rv); +} + +static int +xdr_u_short(const xdr_t *xdr, const uint8_t *buf, unsigned short *ip) +{ + unsigned u; + int rv; + + rv = xdr->xdr_getint(xdr, buf, &u); + *ip = u; + return (rv); +} + +static int +xdr_int(const xdr_t *xdr __unused, const void *buf, int *ip) +{ + *ip = be32dec(buf); + return (sizeof(int)); +} + +static int +xdr_u_int(const xdr_t *xdr __unused, const void *buf, unsigned *ip) +{ + *ip = be32dec(buf); + return (sizeof(unsigned)); +} + +static int +xdr_string(const xdr_t *xdr, const void *buf, nv_string_t *s) +{ + int size; + + size = xdr->xdr_getint(xdr, buf, &s->nv_size); + size = NV_ALIGN4(size + s->nv_size); + return (size); +} + +static int +xdr_int64(const xdr_t *xdr, const uint8_t *buf, int64_t *lp) +{ + int hi, rv; + unsigned lo; + + rv = xdr->xdr_getint(xdr, buf, &hi); + rv += xdr->xdr_getint(xdr, buf + rv, &lo); + *lp = (((int64_t)hi) << 32) | lo; + return (rv); +} + +static int +xdr_uint64(const xdr_t *xdr, const uint8_t *buf, uint64_t *lp) +{ + unsigned hi, lo; + int rv; + + rv = xdr->xdr_getint(xdr, buf, &hi); + rv += xdr->xdr_getint(xdr, buf + rv, &lo); + *lp = (((int64_t)hi) << 32) | lo; + return (rv); +} + +static int +xdr_char(const xdr_t *xdr, const uint8_t *buf, char *cp) +{ + int i, rv; + + rv = xdr->xdr_getint(xdr, buf, &i); + *cp = i; + return (rv); +} + +/* + * read native data. + */ +static int +mem_int(const xdr_t *xdr, const void *buf, int *i) +{ + *i = *(int *)buf; + return (sizeof(int)); +} + +void +nvlist_destroy(nvlist_t *nvl) +{ + if (nvl != NULL) { + /* Free data if it was allocated by us. 
*/ + if (nvl->nv_asize > 0) + free(nvl->nv_data); + } + free(nvl); +} + +char * +nvstring_get(nv_string_t *nvs) +{ + char *s; + + s = malloc(nvs->nv_size + 1); + if (s != NULL) { + bcopy(nvs->nv_data, s, nvs->nv_size); + s[nvs->nv_size] = '\0'; + } + return (s); +} + +/* + * Create empty nvlist. + * The nvlist is terminated by 2x zeros (8 bytes). + */ +nvlist_t * +nvlist_create(int flag) +{ + nvlist_t *nvl; + nvs_data_t *nvs; + + nvl = calloc(1, sizeof(*nvl)); + if (nvl == NULL) + return (nvl); + + nvl->nv_header.nvh_encoding = NV_ENCODE_XDR; + nvl->nv_header.nvh_endian = _BYTE_ORDER == _LITTLE_ENDIAN; + + nvl->nv_asize = nvl->nv_size = sizeof(*nvs); + nvs = calloc(1, nvl->nv_asize); + if (nvs == NULL) { + free(nvl); + return (NULL); + } + /* data in nvlist is byte stream */ + nvl->nv_data = (uint8_t *)nvs; + + nvs->nvl_version = NV_VERSION; + nvs->nvl_nvflag = flag; + return (nvl); +} + +static void +nvlist_nvp_decode(const xdr_t *xdr, nvlist_t *nvl, nvp_header_t *nvph) +{ + nv_string_t *nv_string; + nv_pair_data_t *nvp_data; + nvlist_t nvlist; + + nv_string = (nv_string_t *)nvl->nv_idx; + nvl->nv_idx += xdr_string(xdr, &nv_string->nv_size, nv_string); + nvp_data = (nv_pair_data_t *)nvl->nv_idx; + + nvl->nv_idx += xdr_u_int(xdr, &nvp_data->nv_type, &nvp_data->nv_type); + nvl->nv_idx += xdr_u_int(xdr, &nvp_data->nv_nelem, &nvp_data->nv_nelem); + + switch (nvp_data->nv_type) { + case DATA_TYPE_NVLIST: + case DATA_TYPE_NVLIST_ARRAY: + bzero(&nvlist, sizeof (nvlist)); + nvlist.nv_data = &nvp_data->nv_data[0]; + nvlist.nv_idx = nvlist.nv_data; + for (int i = 0; i < nvp_data->nv_nelem; i++) { + nvlist.nv_asize = + nvlist_size(xdr, nvlist.nv_data); + nvlist_decode_nvlist(xdr, &nvlist); + nvl->nv_idx = nvlist.nv_idx; + nvlist.nv_data = nvlist.nv_idx; + } + break; + + case DATA_TYPE_BOOLEAN: + /* BOOLEAN does not take value space */ + break; + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + nvl->nv_idx += xdr_char(xdr, &nvp_data->nv_data[0], + (char 
*)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_INT16: + nvl->nv_idx += xdr_short(xdr, &nvp_data->nv_data[0], + (short *)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_UINT16: + nvl->nv_idx += xdr_u_short(xdr, &nvp_data->nv_data[0], + (unsigned short *)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_INT32: + nvl->nv_idx += xdr_int(xdr, &nvp_data->nv_data[0], + (int *)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_UINT32: + nvl->nv_idx += xdr_u_int(xdr, &nvp_data->nv_data[0], + (unsigned *)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_INT64: + nvl->nv_idx += xdr_int64(xdr, &nvp_data->nv_data[0], + (int64_t *)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_UINT64: + nvl->nv_idx += xdr_uint64(xdr, &nvp_data->nv_data[0], + (uint64_t *)&nvp_data->nv_data[0]); + break; + + case DATA_TYPE_STRING: + nv_string = (nv_string_t *)&nvp_data->nv_data[0]; + nvl->nv_idx += xdr_string(xdr, &nvp_data->nv_data[0], + nv_string); + + break; + } +} + +static void +nvlist_decode_nvlist(const xdr_t *xdr, nvlist_t *nvl) +{ + nvp_header_t *nvph; + nvs_data_t *nvs = (nvs_data_t *)nvl->nv_data; + + nvl->nv_idx = nvl->nv_data; + nvl->nv_idx += xdr->xdr_getint(xdr, (const uint8_t *)&nvs->nvl_version, + &nvs->nvl_version); + nvl->nv_idx += xdr->xdr_getint(xdr, (const uint8_t *)&nvs->nvl_nvflag, + &nvs->nvl_nvflag); + + nvph = &nvs->nvl_pair; + nvl->nv_idx += xdr->xdr_getint(xdr, + (const uint8_t *)&nvph->encoded_size, &nvph->encoded_size); + nvl->nv_idx += xdr->xdr_getint(xdr, + (const uint8_t *)&nvph->decoded_size, &nvph->decoded_size); + + while (nvph->encoded_size && nvph->decoded_size) { + nvlist_nvp_decode(xdr, nvl, nvph); + + nvph = (nvp_header_t *)(nvl->nv_idx); + nvl->nv_idx += xdr->xdr_getint(xdr, &nvph->encoded_size, + &nvph->encoded_size); + nvl->nv_idx += xdr->xdr_getint(xdr, &nvph->decoded_size, + &nvph->decoded_size); + } +} + +static int +nvlist_size(const xdr_t *xdr, const uint8_t *stream) +{ + const uint8_t *p, *pair; 
+ unsigned encoded_size, decoded_size; + + p = stream; + p += 2 * sizeof(unsigned); + + pair = p; + p += xdr->xdr_getint(xdr, p, &encoded_size); + p += xdr->xdr_getint(xdr, p, &decoded_size); + while (encoded_size && decoded_size) { + p = pair + encoded_size; + pair = p; + p += xdr->xdr_getint(xdr, p, &encoded_size); + p += xdr->xdr_getint(xdr, p, &decoded_size); + } + return (p - stream); +} + +/* + * Import nvlist from byte stream. + * Determine the stream size and allocate private copy. + * Then translate the data. + */ +nvlist_t * +nvlist_import(const uint8_t *stream, char encoding, char endian) +{ + nvlist_t *nvl; + + if (encoding != NV_ENCODE_XDR) + return (NULL); + + nvl = malloc(sizeof(*nvl)); + if (nvl == NULL) + return (nvl); + + nvl->nv_asize = nvl->nv_size = nvlist_size(&ntoh, stream); + nvl->nv_data = malloc(nvl->nv_asize); + if (nvl->nv_data == NULL) { + free(nvl); + return (NULL); + } + nvl->nv_idx = nvl->nv_data; + bcopy(stream, nvl->nv_data, nvl->nv_asize); + + nvlist_decode_nvlist(&ntoh, nvl); + nvl->nv_idx = nvl->nv_data; + return (nvl); +} + +/* + * remove pair from this nvlist. + */ +int +nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) +{ + uint8_t *head, *tail; + nvs_data_t *data; + nvp_header_t *nvp; + nv_string_t *nvp_name; + nv_pair_data_t *nvp_data; + size_t size; + + if (nvl == NULL || nvl->nv_data == NULL || name == NULL) + return (EINVAL); + + head = nvl->nv_data; + data = (nvs_data_t *)head; + nvp = &data->nvl_pair; /* first pair in nvlist */ + head = (uint8_t *)nvp; + + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp_name = (nv_string_t *)(head + sizeof(*nvp)); + + nvp_data = (nv_pair_data_t *) + NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + + nvp_name->nv_size); + + if (memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 && + nvp_data->nv_type == type) { + /* + * set tail to point to next nvpair and size + * is the length of the tail. 
+ */ + tail = head + nvp->encoded_size; + size = nvl->nv_data + nvl->nv_size - tail; + + /* adjust the size of the nvlist. */ + nvl->nv_size -= nvp->encoded_size; + bcopy(tail, head, size); + return (0); + } + /* Not our pair, skip to next. */ + head = head + nvp->encoded_size; + nvp = (nvp_header_t *)head; + } + return (ENOENT); +} + +int +nvlist_find(const nvlist_t *nvl, const char *name, data_type_t type, + int *elementsp, void *valuep, int *sizep) +{ + nvs_data_t *data; + nvp_header_t *nvp; + nv_string_t *nvp_name; + nv_pair_data_t *nvp_data; + nvlist_t *nvlist; + + if (nvl == NULL || nvl->nv_data == NULL || name == NULL) + return (EINVAL); + + data = (nvs_data_t *)nvl->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ + + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp_name = (nv_string_t *)((uint8_t *)nvp + sizeof(*nvp)); + + nvp_data = (nv_pair_data_t *) + NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + + nvp_name->nv_size); + + if (strlen(name) == nvp_name->nv_size && /* whole name, not a prefix */ + memcmp(nvp_name->nv_data, name, nvp_name->nv_size) == 0 && + nvp_data->nv_type == type) { + if (elementsp != NULL) + *elementsp = nvp_data->nv_nelem; + switch (nvp_data->nv_type) { + case DATA_TYPE_UINT64: + *(uint64_t *)valuep = + *(uint64_t *)nvp_data->nv_data; + return (0); + case DATA_TYPE_STRING: + nvp_name = (nv_string_t *)nvp_data->nv_data; + if (sizep != NULL) + *sizep = nvp_name->nv_size; + *(const uint8_t **)valuep = + &nvp_name->nv_data[0]; + return (0); + case DATA_TYPE_NVLIST: + case DATA_TYPE_NVLIST_ARRAY: + nvlist = malloc(sizeof(*nvlist)); + if (nvlist != NULL) { + nvlist->nv_header = nvl->nv_header; + nvlist->nv_asize = 0; + nvlist->nv_size = 0; + nvlist->nv_idx = NULL; + nvlist->nv_data = &nvp_data->nv_data[0]; + *(nvlist_t **)valuep = nvlist; + return (0); + } + return (ENOMEM); + } + return (EIO); + } + /* Not our pair, skip to next. */ + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); + } + return (ENOENT); +} + +/* + * Return the next nvlist in an nvlist array.
+ */ +int +nvlist_next(nvlist_t *nvl) +{ + nvs_data_t *data; + nvp_header_t *nvp; + + if (nvl == NULL || nvl->nv_data == NULL || nvl->nv_asize != 0) + return (EINVAL); + + data = (nvs_data_t *)nvl->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ + + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); + } + nvl->nv_data = (uint8_t *)nvp + sizeof(*nvp); + return (0); +} + +void +nvlist_print(nvlist_t *nvl, unsigned int indent) +{ + static const char *typenames[] = { + "DATA_TYPE_UNKNOWN", + "DATA_TYPE_BOOLEAN", + "DATA_TYPE_BYTE", + "DATA_TYPE_INT16", + "DATA_TYPE_UINT16", + "DATA_TYPE_INT32", + "DATA_TYPE_UINT32", + "DATA_TYPE_INT64", + "DATA_TYPE_UINT64", + "DATA_TYPE_STRING", + "DATA_TYPE_BYTE_ARRAY", + "DATA_TYPE_INT16_ARRAY", + "DATA_TYPE_UINT16_ARRAY", + "DATA_TYPE_INT32_ARRAY", + "DATA_TYPE_UINT32_ARRAY", + "DATA_TYPE_INT64_ARRAY", + "DATA_TYPE_UINT64_ARRAY", + "DATA_TYPE_STRING_ARRAY", + "DATA_TYPE_HRTIME", + "DATA_TYPE_NVLIST", + "DATA_TYPE_NVLIST_ARRAY", + "DATA_TYPE_BOOLEAN_VALUE", + "DATA_TYPE_INT8", + "DATA_TYPE_UINT8", + "DATA_TYPE_BOOLEAN_ARRAY", + "DATA_TYPE_INT8_ARRAY", + "DATA_TYPE_UINT8_ARRAY" + }; + nvs_data_t *data; + nvp_header_t *nvp; + nv_string_t *nvp_name; + nv_pair_data_t *nvp_data; + nvlist_t nvlist; + int i, j; + + data = (nvs_data_t *)nvl->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { + nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp)); + nvp_data = (nv_pair_data_t *) + NV_ALIGN4((uintptr_t)&nvp_name->nv_data[0] + + nvp_name->nv_size); + + for (int i = 0; i < indent; i++) + printf(" "); + + printf("%s [%d] %.*s", typenames[nvp_data->nv_type], + nvp_data->nv_nelem, nvp_name->nv_size, nvp_name->nv_data); + + switch (nvp_data->nv_type) { + case DATA_TYPE_UINT64: { + uint64_t val; + + val = *(uint64_t *)nvp_data->nv_data; + printf(" = 0x%jx\n", (uintmax_t)val); + break; + } + + 
case DATA_TYPE_STRING: { + nvp_name = (nv_string_t *)&nvp_data->nv_data[0]; + printf(" = \"%.*s\"\n", nvp_name->nv_size, + nvp_name->nv_data ); + break; + } + + case DATA_TYPE_NVLIST: + printf("\n"); + nvlist.nv_data = &nvp_data->nv_data[0]; + nvlist_print(&nvlist, indent + 2); + break; + + case DATA_TYPE_NVLIST_ARRAY: + nvlist.nv_data = &nvp_data->nv_data[0]; + for (j = 0; j < nvp_data->nv_nelem; j++) { + data = (nvs_data_t *)nvlist.nv_data; + printf("[%d]\n", j); + nvlist_print(&nvlist, indent + 2); + if (j != nvp_data->nv_nelem - 1) { + for (i = 0; i < indent; i++) + printf(" "); + printf("%s %.*s", + typenames[nvp_data->nv_type], + nvp_name->nv_size, + nvp_name->nv_data); + } + nvlist.nv_data = (uint8_t *)data + + nvlist_size(&native, nvlist.nv_data); + } + break; + + default: + printf("\n"); + } + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); + } + printf("%*s\n", indent + 13, "End of nvlist"); +} diff --git a/stand/libsa/zfs/zfs.c b/stand/libsa/zfs/zfs.c index d94072d80628..fd7b6cbfdeba 100644 --- a/stand/libsa/zfs/zfs.c +++ b/stand/libsa/zfs/zfs.c @@ -482,6 +482,215 @@ vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes) return (ret); } +static int +vdev_write(vdev_t *vdev __unused, void *priv, off_t offset, void *buf, + size_t bytes) +{ + int fd, ret; + size_t head, tail, total_size, full_sec_size; + unsigned secsz, do_tail_write; + off_t start_sec; + ssize_t res; + char *outbuf, *bouncebuf; + + fd = (uintptr_t)priv; + outbuf = (char *) buf; + bouncebuf = NULL; + + ret = ioctl(fd, DIOCGSECTORSIZE, &secsz); + if (ret != 0) + return (ret); + + start_sec = offset / secsz; + head = offset % secsz; + total_size = roundup2(head + bytes, secsz); + tail = total_size - (head + bytes); + do_tail_write = ((tail > 0) && (head + bytes > secsz)); + full_sec_size = total_size; + if (head > 0) + full_sec_size -= secsz; + if (do_tail_write) + full_sec_size -= secsz; + + /* Partial sector write requires a bounce buffer. 
*/ + if ((head > 0) || do_tail_write || bytes < secsz) { + bouncebuf = malloc(secsz); + if (bouncebuf == NULL) { + printf("vdev_write: out of memory\n"); + return (ENOMEM); + } + } + + if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) { + ret = errno; + goto error; + } + + /* Partial data for first sector */ + if (head > 0) { + res = read(fd, bouncebuf, secsz); + if (res != secsz) { + ret = EIO; + goto error; + } + memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes)); + (void) lseek(fd, -secsz, SEEK_CUR); + res = write(fd, bouncebuf, secsz); + if (res != secsz) { + ret = EIO; + goto error; + } + outbuf += min(secsz - head, bytes); + } + + /* + * Full data write to sectors. + * Note, there is still corner case where we write + * to sector boundary, but less than sector size, e.g. write 512B + * to 4k sector. + */ + if (full_sec_size > 0) { + if (bytes < full_sec_size) { + res = read(fd, bouncebuf, secsz); + if (res != secsz) { + ret = EIO; + goto error; + } + memcpy(bouncebuf, outbuf, bytes); + (void) lseek(fd, -secsz, SEEK_CUR); + res = write(fd, bouncebuf, secsz); + if (res != secsz) { + ret = EIO; + goto error; + } + } else { + res = write(fd, outbuf, full_sec_size); + if (res != full_sec_size) { + ret = EIO; + goto error; + } + outbuf += full_sec_size; + } + } + + /* Partial data write to last sector */ + if (do_tail_write) { + res = read(fd, bouncebuf, secsz); + if (res != secsz) { + ret = EIO; + goto error; + } + memcpy(bouncebuf, outbuf, secsz - tail); + (void) lseek(fd, -secsz, SEEK_CUR); + res = write(fd, bouncebuf, secsz); + if (res != secsz) { + ret = EIO; + goto error; + } + } + + ret = 0; +error: + free(bouncebuf); + return (ret); +} + +static void +vdev_clear_pad2(vdev_t *vdev) +{ + vdev_t *kid; + vdev_boot_envblock_t *be; + off_t off = offsetof(vdev_label_t, vl_be); + zio_checksum_info_t *ci; + zio_cksum_t cksum; + + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + if (kid->v_state != VDEV_STATE_HEALTHY) + continue; + 
vdev_clear_pad2(kid); + } + + if (!STAILQ_EMPTY(&vdev->v_children)) + return; + + be = calloc(1, sizeof (*be)); + if (be == NULL) { + printf("failed to clear be area: out of memory\n"); + return; + } + + ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; + be->vbe_zbt.zec_magic = ZEC_MAGIC; + zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off); + ci->ci_func[0](be, sizeof (*be), NULL, &cksum); + be->vbe_zbt.zec_cksum = cksum; + + if (vdev_write(vdev, vdev->v_read_priv, off, be, VDEV_PAD_SIZE)) { + printf("failed to clear be area of primary vdev: %d\n", + errno); + } + free(be); +} + +/* + * Read the next boot command from pad2. + * If any instance of pad2 is set to empty string, or the returned string + * values are not the same, we consider next boot not to be set. The result is heap-allocated; the caller frees it. + */ +static char * +vdev_read_pad2(vdev_t *vdev) +{ + vdev_t *kid; + char *tmp, *result = NULL; + vdev_boot_envblock_t *be; + off_t off = offsetof(vdev_label_t, vl_be); + + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + if (kid->v_state != VDEV_STATE_HEALTHY) + continue; + tmp = vdev_read_pad2(kid); + if (tmp == NULL) + continue; + + /* The next boot is not set, we are done. */ + if (*tmp == '\0') { + free(result); + return (tmp); + } + if (result == NULL) { + result = tmp; + continue; + } + /* Are the next boot strings different? */ + if (strcmp(result, tmp) != 0) { + free(tmp); + *result = '\0'; + break; + } + free(tmp); + } + if (result != NULL) + return (result); + + be = malloc(sizeof (*be)); + if (be == NULL) + return (NULL); + + if (vdev_read(vdev, vdev->v_read_priv, off, be, sizeof (*be)) == 0) { + switch (be->vbe_version) { + case VB_RAW: + case VB_NVLIST: + result = strdup(be->vbe_bootenv); + break; + default: + /* Backward compatibility with initial nextboot feature.
*/ + result = strdup((char *)be); + } + } + free(be); + return (result); +} + static int zfs_dev_init(void) { @@ -558,7 +767,7 @@ zfs_probe_partition(void *arg, const char *partname, strncpy(devname, ppa->devname, strlen(ppa->devname) - 1); devname[strlen(ppa->devname) - 1] = '\0'; sprintf(devname, "%s%s:", devname, partname); - pa.fd = open(devname, O_RDONLY); + pa.fd = open(devname, O_RDWR); if (pa.fd == -1) return (0); ret = zfs_probe(pa.fd, ppa->pool_guid); @@ -580,6 +789,57 @@ zfs_probe_partition(void *arg, const char *partname, return (0); } +int +zfs_nextboot(void *vdev, char *buf, size_t size) +{ + struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev; + spa_t *spa; + vdev_t *vd; + char *result = NULL; + + if (dev->dd.d_dev->dv_type != DEVT_ZFS) + return (1); + + if (dev->pool_guid == 0) + spa = STAILQ_FIRST(&zfs_pools); + else + spa = spa_find_by_guid(dev->pool_guid); + + if (spa == NULL) { + printf("ZFS: can't find pool by guid\n"); + return (1); + } + + STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) { + char *tmp = vdev_read_pad2(vd); + + /* Continue on error. */ + if (tmp == NULL) + continue; + /* Nextboot is not set.
*/ + if (*tmp == '\0') { + free(result); + free(tmp); + return (1); + } + if (result == NULL) { + result = tmp; + continue; + } + free(tmp); + } + if (result == NULL) + return (1); + + STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) { + vdev_clear_pad2(vd); + } + + strlcpy(buf, result, size); + free(result); + return (0); +} + int zfs_probe_dev(const char *devname, uint64_t *pool_guid) { @@ -591,7 +851,7 @@ zfs_probe_dev(const char *devname, uint64_t *pool_guid) if (pool_guid) *pool_guid = 0; - pa.fd = open(devname, O_RDONLY); + pa.fd = open(devname, O_RDWR); if (pa.fd == -1) return (ENXIO); /* diff --git a/stand/libsa/zfs/zfsimpl.c b/stand/libsa/zfs/zfsimpl.c index 8b1c9224049e..ad746148e38d 100644 --- a/stand/libsa/zfs/zfsimpl.c +++ b/stand/libsa/zfs/zfsimpl.c @@ -170,284 +170,48 @@ zfs_init(void) } static int -xdr_int(const unsigned char **xdr, int *ip) +nvlist_check_features_for_read(nvlist_t *nvl) { - *ip = be32dec(*xdr); - (*xdr) += 4; - return (0); -} - -static int -xdr_u_int(const unsigned char **xdr, u_int *ip) -{ - *ip = be32dec(*xdr); - (*xdr) += 4; - return (0); -} - -static int -xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) -{ - u_int hi, lo; - - xdr_u_int(xdr, &hi); - xdr_u_int(xdr, &lo); - *lp = (((uint64_t)hi) << 32) | lo; - return (0); -} - -static int -nvlist_find(const unsigned char *nvlist, const char *name, int type, - int *elementsp, void *valuep, int *sizep) -{ - const unsigned char *p, *pair; - int junk; - int encoded_size, decoded_size; - - p = nvlist; - xdr_int(&p, &junk); - xdr_int(&p, &junk); - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - while (encoded_size && decoded_size) { - int namelen, pairtype, elements; - const char *pairname; - - xdr_int(&p, &namelen); - pairname = (const char *)p; - p += roundup(namelen, 4); - xdr_int(&p, &pairtype); - - if (memcmp(name, pairname, namelen) == 0 && type == pairtype) { - xdr_int(&p, &elements); - if (elementsp) - *elementsp = elements; - if
(type == DATA_TYPE_UINT64) { - xdr_uint64_t(&p, (uint64_t *)valuep); - return (0); - } else if (type == DATA_TYPE_STRING) { - int len; - xdr_int(&p, &len); - if (sizep != NULL) - *sizep = len; - (*(const char **)valuep) = (const char *)p; - return (0); - } else if (type == DATA_TYPE_NVLIST || - type == DATA_TYPE_NVLIST_ARRAY) { - (*(const unsigned char **)valuep) = - (const unsigned char *)p; - return (0); - } else { - return (EIO); - } - } else { - /* - * Not the pair we are looking for, skip to the - * next one. - */ - p = pair + encoded_size; - } - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - } - - return (EIO); -} - -static int -nvlist_check_features_for_read(const unsigned char *nvlist) -{ - const unsigned char *p, *pair; - int junk; - int encoded_size, decoded_size; + nvlist_t *features = NULL; + nvs_data_t *data; + nvp_header_t *nvp; + nv_string_t *nvp_name; int rc; - rc = 0; + rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ, + DATA_TYPE_NVLIST, NULL, &features, NULL); + if (rc != 0) + return (0); /* No feature list: nothing to verify. */ - p = nvlist; - xdr_int(&p, &junk); - xdr_int(&p, &junk); + data = (nvs_data_t *)features->nv_data; + nvp = &data->nvl_pair; /* first pair in nvlist */ - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - while (encoded_size && decoded_size) { - int namelen, pairtype; - const char *pairname; + while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { int i, found; + nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp)); found = 0; - xdr_int(&p, &namelen); - pairname = (const char *)p; - p += roundup(namelen, 4); - xdr_int(&p, &pairtype); - for (i = 0; features_for_read[i] != NULL; i++) { - if (memcmp(pairname, features_for_read[i], - namelen) == 0) { + if (memcmp(nvp_name->nv_data, features_for_read[i], + nvp_name->nv_size) == 0) { found = 1; break; } } if (!found) { - printf("ZFS: unsupported feature: %s\n", pairname); + printf("ZFS: unsupported feature: %.*s\n", + nvp_name->nv_size, nvp_name->nv_data);
rc = EIO; } - - p = pair + encoded_size; - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); + nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); } + nvlist_destroy(features); return (rc); } -/* - * Return the next nvlist in an nvlist array. - */ -static const unsigned char * -nvlist_next(const unsigned char *nvlist) -{ - const unsigned char *p, *pair; - int junk; - int encoded_size, decoded_size; - - p = nvlist; - xdr_int(&p, &junk); - xdr_int(&p, &junk); - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - while (encoded_size && decoded_size) { - p = pair + encoded_size; - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - } - - return (p); -} - -#ifdef TEST - -static const unsigned char * -nvlist_print(const unsigned char *nvlist, unsigned int indent) -{ - static const char *typenames[] = { - "DATA_TYPE_UNKNOWN", - "DATA_TYPE_BOOLEAN", - "DATA_TYPE_BYTE", - "DATA_TYPE_INT16", - "DATA_TYPE_UINT16", - "DATA_TYPE_INT32", - "DATA_TYPE_UINT32", - "DATA_TYPE_INT64", - "DATA_TYPE_UINT64", - "DATA_TYPE_STRING", - "DATA_TYPE_BYTE_ARRAY", - "DATA_TYPE_INT16_ARRAY", - "DATA_TYPE_UINT16_ARRAY", - "DATA_TYPE_INT32_ARRAY", - "DATA_TYPE_UINT32_ARRAY", - "DATA_TYPE_INT64_ARRAY", - "DATA_TYPE_UINT64_ARRAY", - "DATA_TYPE_STRING_ARRAY", - "DATA_TYPE_HRTIME", - "DATA_TYPE_NVLIST", - "DATA_TYPE_NVLIST_ARRAY", - "DATA_TYPE_BOOLEAN_VALUE", - "DATA_TYPE_INT8", - "DATA_TYPE_UINT8", - "DATA_TYPE_BOOLEAN_ARRAY", - "DATA_TYPE_INT8_ARRAY", - "DATA_TYPE_UINT8_ARRAY" - }; - - unsigned int i, j; - const unsigned char *p, *pair; - int junk; - int encoded_size, decoded_size; - - p = nvlist; - xdr_int(&p, &junk); - xdr_int(&p, &junk); - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - while (encoded_size && decoded_size) { - int namelen, pairtype, elements; - const char *pairname; - - xdr_int(&p, &namelen); - pairname = (const char *)p; - p += roundup(namelen, 4); - xdr_int(&p, &pairtype); - -
for (i = 0; i < indent; i++) - printf(" "); - printf("%s %.*s", typenames[pairtype], namelen, pairname); - - xdr_int(&p, &elements); - switch (pairtype) { - case DATA_TYPE_UINT64: { - uint64_t val; - xdr_uint64_t(&p, &val); - printf(" = 0x%jx\n", (uintmax_t)val); - break; - } - - case DATA_TYPE_STRING: { - int len; - xdr_int(&p, &len); - printf(" = \"%.*s\"\n", len, p); - break; - } - - case DATA_TYPE_NVLIST: - printf("\n"); - nvlist_print(p, indent + 1); - break; - - case DATA_TYPE_NVLIST_ARRAY: - for (j = 0; j < elements; j++) { - printf("[%d]\n", j); - p = nvlist_print(p, indent + 1); - if (j != elements - 1) { - for (i = 0; i < indent; i++) - printf(" "); - printf("%s %.*s", typenames[pairtype], - namelen, pairname); - } - } - break; - - default: - printf("\n"); - } - - p = pair + encoded_size; - - pair = p; - xdr_int(&p, &encoded_size); - xdr_int(&p, &decoded_size); - } - - return (p); -} - -#endif - static int vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t size) @@ -1082,7 +846,7 @@ vdev_create(uint64_t guid, vdev_read_t *_read) } static void -vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist) +vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist) { uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; uint64_t is_log; @@ -1117,7 +881,7 @@ vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist) } static int -vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp) +vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp) { uint64_t id, ashift, asize, nparity; const char *path; @@ -1128,8 +892,8 @@ vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp) if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id, NULL) || - nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, - NULL, &type, &len)) { + nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, + &type, &len)) { return (ENOENT); } @@ -1306,10 +1070,10 @@ 
vdev_insert(vdev_t *top_vdev, vdev_t *vdev) } static int -vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist) +vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist) { vdev_t *top_vdev, *vdev; - const unsigned char *kids; + nvlist_t *kids = NULL; int rc, nkids; /* Get top vdev. */ @@ -1332,8 +1096,10 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist) rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, NULL, &guid, NULL); - if (rc != 0) + if (rc != 0) { + nvlist_destroy(kids); return (rc); + } rc = vdev_init(guid, kids, &vdev); if (rc != 0) return (rc); @@ -1342,7 +1108,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist) vdev->v_top = top_vdev; vdev_insert(top_vdev, vdev); - kids = nvlist_next(kids); + rc = nvlist_next(kids); } } else { /* @@ -1351,15 +1117,17 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist) */ rc = 0; } + nvlist_destroy(kids); return (rc); } static int -vdev_init_from_label(spa_t *spa, const unsigned char *nvlist) +vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist) { uint64_t pool_guid, top_guid; - const unsigned char *vdevs; + nvlist_t *vdevs; + int rc; if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, NULL, &pool_guid, NULL) || @@ -1371,7 +1139,9 @@ vdev_init_from_label(spa_t *spa, const unsigned char *nvlist) return (ENOENT); } - return (vdev_from_nvlist(spa, top_guid, vdevs)); + rc = vdev_from_nvlist(spa, top_guid, vdevs); + nvlist_destroy(vdevs); + return (rc); } static void @@ -1420,10 +1190,10 @@ vdev_set_state(vdev_t *vdev) } static int -vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist) +vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist) { vdev_t *vdev; - const unsigned char *kids; + nvlist_t *kids = NULL; int rc, nkids; /* Update top vdev. 
*/ @@ -1447,20 +1217,21 @@ vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist) if (vdev != NULL) vdev_set_initial_state(vdev, kids); - kids = nvlist_next(kids); + rc = nvlist_next(kids); } } else { rc = 0; } + nvlist_destroy(kids); return (rc); } static int -vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist) +vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist) { uint64_t pool_guid, vdev_children; - const unsigned char *vdevs, *kids; + nvlist_t *vdevs = NULL, *kids = NULL; int rc, nkids; if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, @@ -1474,13 +1245,16 @@ vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist) } /* Wrong guid?! */ - if (spa->spa_guid != pool_guid) + if (spa->spa_guid != pool_guid) { + nvlist_destroy(vdevs); return (EINVAL); + } spa->spa_root_vdev->v_nchildren = vdev_children; rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL); + nvlist_destroy(vdevs); /* * MOS config has at least one child for root vdev. @@ -1506,8 +1280,9 @@ vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist) rc = vdev_update_from_nvlist(guid, kids); if (rc != 0) break; - kids = nvlist_next(kids); + nvlist_next(kids); } + nvlist_destroy(kids); /* * Re-evaluate top-level vdev state. 
@@ -1819,26 +1594,20 @@ vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset, return (vdev_read_phys(vd, &bp, buf, off, size)); } -static unsigned char * +static nvlist_t * vdev_label_read_config(vdev_t *vd, uint64_t txg) { vdev_phys_t *label; uint64_t best_txg = 0; uint64_t label_txg = 0; uint64_t asize; - unsigned char *nvl; - size_t nvl_size; + nvlist_t *nvl = NULL, *tmp; int error; label = malloc(sizeof (vdev_phys_t)); if (label == NULL) return (NULL); - nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4; - nvl = malloc(nvl_size); - if (nvl == NULL) - goto done; - for (int l = 0; l < VDEV_LABELS; l++) { const unsigned char *nvlist; @@ -1847,35 +1616,40 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) sizeof (vdev_phys_t))) continue; - if (label->vp_nvlist[0] != NV_ENCODE_XDR) + nvlist = (const unsigned char *) label->vp_nvlist; + tmp = nvlist_import(nvlist + 4, nvlist[0], nvlist[1]); + if (tmp == NULL) continue; - nvlist = (const unsigned char *) label->vp_nvlist + 4; - error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, + error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, NULL, &label_txg, NULL); if (error != 0 || label_txg == 0) { - memcpy(nvl, nvlist, nvl_size); + nvlist_destroy(nvl); + nvl = tmp; goto done; } if (label_txg <= txg && label_txg > best_txg) { best_txg = label_txg; - memcpy(nvl, nvlist, nvl_size); + nvlist_destroy(nvl); + nvl = tmp; + tmp = NULL; /* * Use asize from pool config. We need this * because we can get bad value from BIOS. 
*/ - if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, + if (nvlist_find(nvl, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) { vd->v_psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; } } + nvlist_destroy(tmp); } if (best_txg == 0) { - free(nvl); + nvlist_destroy(nvl); nvl = NULL; } done: @@ -1914,12 +1688,11 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) vdev_t vtmp; spa_t *spa; vdev_t *vdev; - unsigned char *nvlist; + nvlist_t *nvl; uint64_t val; uint64_t guid, vdev_children; uint64_t pool_txg, pool_guid; const char *pool_name; - const unsigned char *features; int rc, namelen; /* @@ -1936,54 +1709,53 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) if (vtmp.v_psize < SPA_MINDEVSIZE) return (EIO); - nvlist = vdev_label_read_config(&vtmp, UINT64_MAX); - if (nvlist == NULL) + nvl = vdev_label_read_config(&vtmp, UINT64_MAX); + if (nvl == NULL) return (EIO); - if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, + if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, NULL, &val, NULL) != 0) { - free(nvlist); + nvlist_destroy(nvl); return (EIO); } if (!SPA_VERSION_IS_SUPPORTED(val)) { printf("ZFS: unsupported ZFS version %u (should be %u)\n", (unsigned)val, (unsigned)SPA_VERSION); - free(nvlist); + nvlist_destroy(nvl); return (EIO); } /* Check ZFS features for read */ - if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, - DATA_TYPE_NVLIST, NULL, &features, NULL) == 0 && - nvlist_check_features_for_read(features) != 0) { - free(nvlist); + rc = nvlist_check_features_for_read(nvl); + if (rc != 0) { + nvlist_destroy(nvl); return (EIO); } - if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, + if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, NULL, &val, NULL) != 0) { - free(nvlist); + nvlist_destroy(nvl); return (EIO); } if (val == POOL_STATE_DESTROYED) { /* We don't boot only from destroyed pools. 
*/ - free(nvlist); + nvlist_destroy(nvl); return (EIO); } - if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, + if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, NULL, &pool_txg, NULL) != 0 || - nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, + nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, NULL, &pool_guid, NULL) != 0 || - nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, + nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, NULL, &pool_name, &namelen) != 0) { /* * Cache and spare devices end up here - just ignore * them. */ - free(nvlist); + nvlist_destroy(nvl); return (EIO); } @@ -1994,11 +1766,11 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) if (spa == NULL) { char *name; - nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, + nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64, NULL, &vdev_children, NULL); name = malloc(namelen + 1); if (name == NULL) { - free(nvlist); + nvlist_destroy(nvl); return (ENOMEM); } bcopy(pool_name, name, namelen); @@ -2006,7 +1778,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) spa = spa_create(pool_guid, name); free(name); if (spa == NULL) { - free(nvlist); + nvlist_destroy(nvl); return (ENOMEM); } spa->spa_root_vdev->v_nchildren = vdev_children; @@ -2020,20 +1792,20 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) * be some kind of alias (overlapping slices, dangerously dedicated * disks etc). */ - if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, + if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, NULL, &guid, NULL) != 0) { - free(nvlist); + nvlist_destroy(nvl); return (EIO); } vdev = vdev_find(guid); /* Has this vdev already been inited? 
*/ if (vdev && vdev->v_phys_read) { - free(nvlist); + nvlist_destroy(nvl); return (EIO); } - rc = vdev_init_from_label(spa, nvlist); - free(nvlist); + rc = vdev_init_from_label(spa, nvl); + nvlist_destroy(nvl); if (rc != 0) return (rc); @@ -2211,6 +1983,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); else if (size != BP_GET_PSIZE(bp)) bcopy(pbuf, buf, BP_GET_PSIZE(bp)); + } else { + printf("zio_read error: %d\n", error); } if (buf != pbuf) free(pbuf); @@ -3307,7 +3081,7 @@ check_mos_features(const spa_t *spa) } static int -load_nvlist(spa_t *spa, uint64_t obj, unsigned char **value) +load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dnode_phys_t dir; size_t size; @@ -3336,7 +3110,8 @@ load_nvlist(spa_t *spa, uint64_t obj, unsigned char **value) nv = NULL; return (rc); } - *value = nv; + *value = nvlist_import(nv + 4, nv[0], nv[1]); + free(nv); return (rc); } @@ -3345,7 +3120,7 @@ zfs_spa_init(spa_t *spa) { dnode_phys_t dir; uint64_t config_object; - unsigned char *nvlist; + nvlist_t *nvlist; int rc; if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { @@ -3383,13 +3158,12 @@ zfs_spa_init(spa_t *spa) rc = load_nvlist(spa, config_object, &nvlist); if (rc != 0) return (rc); - /* * Update vdevs from MOS config. Note, we do skip encoding bytes * here. See also vdev_label_read_config(). 
*/ - rc = vdev_init_from_nvlist(spa, nvlist + 4); - free(nvlist); + rc = vdev_init_from_nvlist(spa, nvlist); + nvlist_destroy(nvlist); return (rc); } diff --git a/stand/loader.mk b/stand/loader.mk index 97172db542e0..64033f11720f 100644 --- a/stand/loader.mk +++ b/stand/loader.mk @@ -136,6 +136,7 @@ CFLAGS+= -DLOADER_MBR_SUPPORT CFLAGS+= -DLOADER_ZFS_SUPPORT CFLAGS+= -I${ZFSSRC} CFLAGS+= -I${SYSDIR}/cddl/boot/zfs +CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/common SRCS+= zfs_cmd.c .endif diff --git a/stand/userboot/userboot/Makefile b/stand/userboot/userboot/Makefile index d8e71b9781d7..7a5462bf2723 100644 --- a/stand/userboot/userboot/Makefile +++ b/stand/userboot/userboot/Makefile @@ -34,6 +34,7 @@ SRCS+= vers.c CFLAGS+= -Wall CFLAGS+= -I${BOOTSRC}/userboot +CFLAGS+= -I${SYSDIR}/cddl/contrib/opensolaris/uts/common CWARNFLAGS.main.c += -Wno-implicit-function-declaration LDFLAGS+= -nostdlib -Wl,-Bsymbolic diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index 65b5bcfbf90d..9ad244d083b9 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -56,9 +56,16 @@ * Copyright 2013 by Saso Kiselkov. All rights reserved. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2020 by Delphix. All rights reserved. 
*/ +#include +#include +#include + +#ifndef _ZFSIMPL_H_ +#define _ZFSIMPL_H_ + #define MAXNAMELEN 256 #define _NOTE(s) @@ -493,7 +500,7 @@ typedef struct zio_gbh { #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_PAD_SIZE (8 << 10) -/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +/* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -519,9 +526,29 @@ typedef struct vdev_phys { zio_eck_t vp_zbt; } vdev_phys_t; +typedef enum vbe_vers { + /* The bootenv file is stored as ascii text in the envblock */ + VB_RAW = 0, + + /* + * The bootenv file is converted to an nvlist and then packed into the + * envblock. + */ + VB_NVLIST = 1 +} vbe_vers_t; + +typedef struct vdev_boot_envblock { + uint64_t vbe_version; + char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - + sizeof (zio_eck_t)]; + zio_eck_t vbe_zbt; +} vdev_boot_envblock_t; + +CTASSERT(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE); + typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ @@ -1811,3 +1838,5 @@ typedef struct zio { } zio_t; static void decode_embedded_bp_compressed(const blkptr_t *, void *); + +#endif /* _ZFSIMPL_H_ */