From e41fab8d404a80323255521bc3201053ea36688c Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 6 Apr 2017 18:17:29 +0000 Subject: [PATCH] loader: zfs reader should check all labels The current zfs reader is only checking first label from each device, however, we do have 4 labels on device and we should check all 4 to be protected against disk failures and incomplete label updates. The difficulty is about the fact that 2 label copies are in front of the pool data, and 2 are at the end, which means, we have to know the size of the pool data area. Since we have now the mechanism from common/disk.c to use the partition information, it does help us in this task; however, there are still some corner cases. Namely, if the pool is created without partition, directly on the disk, and firmware will give us the wrong size for the disk, we only can check the first two label copies. Reviewed by: allanjude Differential Revision: https://reviews.freebsd.org/D10203 --- sys/boot/efi/boot1/zfs_module.c | 9 ++ sys/boot/efi/loader/main.c | 11 ++ sys/boot/i386/common/drv.h | 2 +- sys/boot/i386/loader/main.c | 11 ++ sys/boot/i386/zfsboot/zfsboot.c | 23 +++- sys/boot/zfs/libzfs.h | 1 + sys/boot/zfs/zfsimpl.c | 222 +++++++++++++++++++------------- 7 files changed, 184 insertions(+), 95 deletions(-) diff --git a/sys/boot/efi/boot1/zfs_module.c b/sys/boot/efi/boot1/zfs_module.c index 78d635cde76..363ae342a82 100644 --- a/sys/boot/efi/boot1/zfs_module.c +++ b/sys/boot/efi/boot1/zfs_module.c @@ -40,6 +40,15 @@ static dev_info_t *devices; +uint64_t +ldi_get_size(void *priv) +{ + dev_info_t *devinfo = priv; + + return (devinfo->dev->Media->BlockSize * + (devinfo->dev->Media->LastBlock + 1)); +} + static int vdev_read(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes) { diff --git a/sys/boot/efi/loader/main.c b/sys/boot/efi/loader/main.c index 81f4d1b1e80..f40c71253b1 100644 --- a/sys/boot/efi/loader/main.c +++ b/sys/boot/efi/loader/main.c @@ -28,6 +28,7 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include #include @@ -838,4 +839,14 @@ efi_zfs_probe(void) } } } + +uint64_t +ldi_get_size(void *priv) +{ + int fd = (uintptr_t) priv; + uint64_t size; + + ioctl(fd, DIOCGMEDIASIZE, &size); + return (size); +} #endif diff --git a/sys/boot/i386/common/drv.h b/sys/boot/i386/common/drv.h index e437cb83bf9..c0995df45ea 100644 --- a/sys/boot/i386/common/drv.h +++ b/sys/boot/i386/common/drv.h @@ -36,7 +36,7 @@ struct dsk { unsigned int slice; int part; daddr_t start; - int init; + uint64_t size; }; int drvread(struct dsk *dskp, void *buf, daddr_t lba, unsigned nblk); diff --git a/sys/boot/i386/loader/main.c b/sys/boot/i386/loader/main.c index 7fd4cef81ad..81bc2ff55dd 100644 --- a/sys/boot/i386/loader/main.c +++ b/sys/boot/i386/loader/main.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -463,4 +464,14 @@ i386_zfs_probe(void) zfs_probe_dev(devname, NULL); } } + +uint64_t +ldi_get_size(void *priv) +{ + int fd = (uintptr_t) priv; + uint64_t size; + + ioctl(fd, DIOCGMEDIASIZE, &size); + return (size); +} #endif diff --git a/sys/boot/i386/zfsboot/zfsboot.c b/sys/boot/i386/zfsboot/zfsboot.c index fe73f97ef4e..e76e58fc80e 100644 --- a/sys/boot/i386/zfsboot/zfsboot.c +++ b/sys/boot/i386/zfsboot/zfsboot.c @@ -468,6 +468,23 @@ copy_dsk(struct dsk *dsk) return (newdsk); } +/* + * The "layered" ioctl to read disk/partition size. Unfortunately + * the zfsboot case is hardest, because we do not have full software + * stack available, so we need to do some manual work here. + */ +uint64_t +ldi_get_size(void *priv) +{ + struct dsk *dskp = priv; + uint64_t size = dskp->size; + + if (dskp->start == 0) + size = drvsize(dskp); + + return (size * DEV_BSIZE); +} + static void probe_drive(struct dsk *dsk) { @@ -549,6 +566,7 @@ probe_drive(struct dsk *dsk) if (memcmp(&ent->ent_type, &freebsd_zfs_uuid, sizeof(uuid_t)) == 0) { dsk->start = ent->ent_lba_start; + dsk->size = ent->ent_lba_end - ent->ent_lba_start + 1; dsk->slice = part + 1; dsk->part = 255; if (vdev_probe(vdev_read, dsk, NULL) == 0) { @@ -593,6 +611,7 @@ probe_drive(struct dsk *dsk) if (!dp[i].dp_typ) continue; dsk->start = dp[i].dp_start; + dsk->size = dp[i].dp_size; dsk->slice = i + 1; if (vdev_probe(vdev_read, dsk, NULL) == 0) { dsk = copy_dsk(dsk); @@ -648,7 +667,7 @@ main(void) dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1; dsk->part = 0; dsk->start = 0; - dsk->init = 0; + dsk->size = 0; bootinfo.bi_version = BOOTINFO_VERSION; bootinfo.bi_size = sizeof(bootinfo); @@ -699,7 +718,7 @@ main(void) dsk->slice = 0; dsk->part = 0; dsk->start = 0; - dsk->init = 0; + dsk->size = 0; probe_drive(dsk); } diff --git a/sys/boot/zfs/libzfs.h b/sys/boot/zfs/libzfs.h index 1b304998867..d1b4674679e 100644 --- a/sys/boot/zfs/libzfs.h +++ b/sys/boot/zfs/libzfs.h @@ -81,6 +81,7 @@ int zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, char *zfs_fmtdev(void *vdev); int zfs_probe_dev(const char *devname, uint64_t *pool_guid); int zfs_list(const char *name); +uint64_t ldi_get_size(void *); void init_zfs_bootenv(char *currdev); int zfs_bootenv(const char *name); int zfs_belist_add(const char *name, uint64_t __unused); diff --git a/sys/boot/zfs/zfsimpl.c b/sys/boot/zfs/zfsimpl.c index a436fef8b53..0c2c1f86f3d 100644 --- a/sys/boot/zfs/zfsimpl.c +++ b/sys/boot/zfs/zfsimpl.c @@ -68,7 +68,7 @@ static const char *features_for_read[] = { */ static spa_list_t zfs_pools; -static const dnode_phys_t *dnode_cache_obj = NULL; +static const dnode_phys_t *dnode_cache_obj; static uint64_t dnode_cache_bn; static char *dnode_cache_buf; static char *zap_scratch; @@ -523,12 +523,11 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, int nkids, i, is_new; uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; - if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, - DATA_TYPE_UINT64, 0, &guid) - || nvlist_find(nvlist, ZPOOL_CONFIG_ID, - DATA_TYPE_UINT64, 0, &id) - || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, - DATA_TYPE_STRING, 0, &type)) { + if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, + NULL, &guid) + || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) + || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, + NULL, &type)) { printf("ZFS: can't find vdev details\n"); return (ENOENT); } @@ -546,15 +545,15 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; - nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0, + nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, &is_offline); - nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0, + nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, &is_removed); - nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0, + nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, &is_faulted); - nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0, + nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL, &is_degraded); - nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0, + nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL, &isnt_present); vdev = vdev_find(guid); @@ -573,17 +572,19 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, vdev->v_id = id; vdev->v_top = pvdev != NULL ? pvdev : vdev; if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, - DATA_TYPE_UINT64, 0, &ashift) == 0) + DATA_TYPE_UINT64, NULL, &ashift) == 0) { vdev->v_ashift = ashift; - else + } else { vdev->v_ashift = 0; + } if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, - DATA_TYPE_UINT64, 0, &nparity) == 0) + DATA_TYPE_UINT64, NULL, &nparity) == 0) { vdev->v_nparity = nparity; - else + } else { vdev->v_nparity = 0; + } if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, - DATA_TYPE_STRING, 0, &path) == 0) { + DATA_TYPE_STRING, NULL, &path) == 0) { if (strncmp(path, "/dev/", 5) == 0) path += 5; vdev->v_name = strdup(path); @@ -625,8 +626,8 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, vdev->v_state = VDEV_STATE_CANT_OPEN; } - rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, - DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); + rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, + &nkids, &kids); /* * Its ok if we don't have any kids. */ @@ -744,12 +745,17 @@ spa_get_primary_vdev(const spa_t *spa) #endif static spa_t * -spa_create(uint64_t guid) +spa_create(uint64_t guid, const char *name) { spa_t *spa; - spa = malloc(sizeof(spa_t)); + if ((spa = malloc(sizeof(spa_t))) == NULL) + return (NULL); memset(spa, 0, sizeof(spa_t)); + if ((spa->spa_name = strdup(name)) == NULL) { + free(spa); + return (NULL); + } STAILQ_INIT(&spa->spa_vdevs); spa->spa_guid = guid; STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); @@ -905,24 +911,39 @@ spa_all_status(void) return (ret); } +uint64_t +vdev_label_offset(uint64_t psize, int l, uint64_t offset) +{ + uint64_t label_offset; + + if (l < VDEV_LABELS / 2) + label_offset = 0; + else + label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t); + + return (offset + l * sizeof (vdev_label_t) + label_offset); +} + static int vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) { vdev_t vtmp; vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; + vdev_phys_t *tmp_label = zfs_alloc(sizeof(vdev_phys_t)); spa_t *spa; vdev_t *vdev, *top_vdev, *pool_vdev; off_t off; blkptr_t bp; - const unsigned char *nvlist; + const unsigned char *nvlist = NULL; uint64_t val; uint64_t guid; + uint64_t best_txg = 0; uint64_t pool_txg, pool_guid; - uint64_t is_log; + uint64_t psize; const char *pool_name; const unsigned char *vdevs; const unsigned char *features; - int i, rc, is_newer; + int i, l, rc, is_newer; char *upbuf; const struct uberblock *up; @@ -933,26 +954,47 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) memset(&vtmp, 0, sizeof(vtmp)); vtmp.v_phys_read = _read; vtmp.v_read_priv = read_priv; - off = offsetof(vdev_label_t, vl_vdev_phys); - BP_ZERO(&bp); - BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - DVA_SET_OFFSET(BP_IDENTITY(&bp), off); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0)) - return (EIO); + psize = P2ALIGN(ldi_get_size(read_priv), + (uint64_t)sizeof (vdev_label_t)); - if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) { - return (EIO); + for (l = 0; l < VDEV_LABELS; l++) { + off = vdev_label_offset(psize, l, + offsetof(vdev_label_t, vl_vdev_phys)); + + BP_ZERO(&bp); + BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); + BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + DVA_SET_OFFSET(BP_IDENTITY(&bp), off); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + + if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0)) + continue; + + if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR) + continue; + + nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4; + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, + DATA_TYPE_UINT64, NULL, &pool_txg) != 0) + continue; + + if (best_txg <= pool_txg) { + best_txg = pool_txg; + memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t)); + } } + zfs_free(tmp_label, sizeof (vdev_phys_t)); + + if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) + return (EIO); + nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; - if (nvlist_find(nvlist, - ZPOOL_CONFIG_VERSION, - DATA_TYPE_UINT64, 0, &val)) { + if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, + NULL, &val) != 0) { return (EIO); } @@ -963,15 +1005,14 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) } /* Check ZFS features for read */ - if (nvlist_find(nvlist, - ZPOOL_CONFIG_FEATURES_FOR_READ, - DATA_TYPE_NVLIST, 0, &features) == 0 - && nvlist_check_features_for_read(features) != 0) + if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ, + DATA_TYPE_NVLIST, NULL, &features) == 0 && + nvlist_check_features_for_read(features) != 0) { return (EIO); + } - if (nvlist_find(nvlist, - ZPOOL_CONFIG_POOL_STATE, - DATA_TYPE_UINT64, 0, &val)) { + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, + NULL, &val) != 0) { return (EIO); } @@ -980,15 +1021,12 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) return (EIO); } - if (nvlist_find(nvlist, - ZPOOL_CONFIG_POOL_TXG, - DATA_TYPE_UINT64, 0, &pool_txg) - || nvlist_find(nvlist, - ZPOOL_CONFIG_POOL_GUID, - DATA_TYPE_UINT64, 0, &pool_guid) - || nvlist_find(nvlist, - ZPOOL_CONFIG_POOL_NAME, - DATA_TYPE_STRING, 0, &pool_name)) { + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, + NULL, &pool_txg) != 0 || + nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, + NULL, &pool_guid) != 0 || + nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, + NULL, &pool_name) != 0) { /* * Cache and spare devices end up here - just ignore * them. @@ -997,25 +1035,26 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) return (EIO); } - is_log = 0; - (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0, - &is_log); - if (is_log) + if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, + NULL, &val) == 0 && val != 0) { return (EIO); + } /* * Create the pool if this is the first time we've seen it. */ spa = spa_find_by_guid(pool_guid); - if (!spa) { - spa = spa_create(pool_guid); - spa->spa_name = strdup(pool_name); + if (spa == NULL) { + spa = spa_create(pool_guid, pool_name); + if (spa == NULL) + return (ENOMEM); } if (pool_txg > spa->spa_txg) { spa->spa_txg = pool_txg; is_newer = 1; - } else + } else { is_newer = 0; + } /* * Get the vdev tree and create our in-core copy of it. @@ -1023,23 +1062,21 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) * be some kind of alias (overlapping slices, dangerously dedicated * disks etc). */ - if (nvlist_find(nvlist, - ZPOOL_CONFIG_GUID, - DATA_TYPE_UINT64, 0, &guid)) { + if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, + NULL, &guid) != 0) { return (EIO); } vdev = vdev_find(guid); if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */ return (EIO); - if (nvlist_find(nvlist, - ZPOOL_CONFIG_VDEV_TREE, - DATA_TYPE_NVLIST, 0, &vdevs)) { + if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, + NULL, &vdevs)) { return (EIO); } rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); - if (rc) + if (rc != 0) return (rc); /* @@ -1079,36 +1116,37 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) */ upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev)); up = (const struct uberblock *)upbuf; - for (i = 0; - i < VDEV_UBERBLOCK_COUNT(vdev); - i++) { - off = VDEV_UBERBLOCK_OFFSET(vdev, i); - BP_ZERO(&bp); - DVA_SET_OFFSET(&bp.blk_dva[0], off); - BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); - BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + for (l = 0; l < VDEV_LABELS; l++) { + for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) { + off = vdev_label_offset(psize, l, + VDEV_UBERBLOCK_OFFSET(vdev, i)); + BP_ZERO(&bp); + DVA_SET_OFFSET(&bp.blk_dva[0], off); + BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); + BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - if (vdev_read_phys(vdev, &bp, upbuf, off, 0)) - continue; + if (vdev_read_phys(vdev, &bp, upbuf, off, 0)) + continue; - if (up->ub_magic != UBERBLOCK_MAGIC) - continue; - if (up->ub_txg < spa->spa_txg) - continue; - if (up->ub_txg > spa->spa_uberblock.ub_txg) { - spa->spa_uberblock = *up; - } else if (up->ub_txg == spa->spa_uberblock.ub_txg) { - if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp) + if (up->ub_magic != UBERBLOCK_MAGIC) + continue; + if (up->ub_txg < spa->spa_txg) + continue; + if (up->ub_txg > spa->spa_uberblock.ub_txg || + (up->ub_txg == spa->spa_uberblock.ub_txg && + up->ub_timestamp > + spa->spa_uberblock.ub_timestamp)) { spa->spa_uberblock = *up; + } } } zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev)); vdev->spa = spa; - if (spap) + if (spap != NULL) *spap = spa; return (0); }