libsa/zfs: further improve handling of stale labels

Fix two problems with 6dd0803ffd.  First problem is that situation when
newer label was read before stale one, was handled differently to reverse
order case.  Second problem is that vdev_free() would free the fully
initialized leaf vdev that carried stale label.  In a case when vdev
carries a stale label, but is still referenced by a different label with
new a configuration, we don't want to free it, but rather insert it into
the new configuration.

o Provide a helper function nvlist_find_vdev_guid() that checks presence
  of certain GUID in a label.
o In top level vdev store the GUID of vdev used to instantiate top vdev.
o Cover all possible cases in the block in vdev_probe() where we encounter
  a known configuration.  Make the diagnostic print more informative and
  looking same regardless of probe order.  Make this whole block easier to
  read reducing one level of indentation for a price of a single comparison
  at runtime.

Reviewed by:		mav, imp
Differential Revision:	https://reviews.freebsd.org/D51913
Fixes:			6dd0803ffd
This commit is contained in:
Gleb Smirnoff
2025-08-20 07:52:20 -07:00
parent 9dbd8ee116
commit a1572e202a
2 changed files with 115 additions and 23 deletions
+109 -18
View File
@@ -833,6 +833,14 @@ vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
return (kid->v_read(kid, bp, buf, offset, bytes));
}
/*
* List of vdevs that were fully initialized from their own label, but later a
* newer label was found that obsoleted the stale label, freeing its
* configuration tree. We keep those vdevs around, since a new configuration
* may include them.
*/
static vdev_list_t orphans = STAILQ_HEAD_INITIALIZER(orphans);
static vdev_t *
vdev_find(vdev_list_t *list, uint64_t guid)
{
@@ -854,6 +862,11 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
vdev_t *vdev;
vdev_indirect_config_t *vic;
if ((vdev = vdev_find(&orphans, guid))) {
STAILQ_REMOVE(&orphans, vdev, vdev, v_childlink);
return (vdev);
}
vdev = calloc(1, sizeof(vdev_t));
if (vdev != NULL) {
STAILQ_INIT(&vdev->v_children);
@@ -1101,8 +1114,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
}
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
const nvlist_t *nvlist)
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t label_guid,
uint64_t txg, const nvlist_t *nvlist)
{
vdev_t *top_vdev, *vdev;
nvlist_t **kids = NULL;
@@ -1116,6 +1129,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
return (rc);
top_vdev->v_spa = spa;
top_vdev->v_top = top_vdev;
top_vdev->v_label = label_guid;
top_vdev->v_txg = txg;
(void )vdev_insert(spa->spa_root_vdev, top_vdev);
}
@@ -1160,12 +1174,14 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
static int
vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
{
uint64_t top_guid, txg;
uint64_t top_guid, label_guid, txg;
nvlist_t *vdevs;
int rc;
if (nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
NULL, &top_guid, NULL) ||
nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &label_guid, NULL) != 0 ||
nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
NULL, &txg, NULL) != 0 ||
nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
@@ -1174,7 +1190,7 @@ vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
return (ENOENT);
}
rc = vdev_from_nvlist(spa, top_guid, txg, vdevs);
rc = vdev_from_nvlist(spa, top_guid, label_guid, txg, vdevs);
nvlist_destroy(vdevs);
return (rc);
}
@@ -1271,6 +1287,9 @@ vdev_free(struct vdev *vdev)
STAILQ_FOREACH_SAFE(kid, &vdev->v_children, v_childlink, safe)
vdev_free(kid);
if (vdev->v_phys_read != NULL)
STAILQ_INSERT_HEAD(&orphans, vdev, v_childlink);
else
free(vdev);
}
@@ -1323,7 +1342,7 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
* XXXGL: how can this happen?
*/
if (vdev == NULL)
rc = vdev_from_nvlist(spa, guid, 0, kids[i]);
rc = vdev_from_nvlist(spa, guid, 0, 0, kids[i]);
else
rc = vdev_update_from_nvlist(spa->spa_root_vdev, guid,
kids[i]);
@@ -1344,6 +1363,53 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
return (rc);
}
static bool
nvlist_find_child_guid(const nvlist_t *nvlist, uint64_t guid)
{
nvlist_t **kids = NULL;
int nkids, i;
bool rv = false;
if (nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
&nkids, &kids, NULL) != 0)
nkids = 0;
for (i = 0; i < nkids; i++) {
uint64_t kid_guid;
if (nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
NULL, &kid_guid, NULL) != 0)
break;
if (kid_guid == guid)
rv = true;
else
rv = nvlist_find_child_guid(kids[i], guid);
if (rv)
break;
}
for (i = 0; i < nkids; i++)
nvlist_destroy(kids[i]);
free(kids);
return (rv);
}
static bool
nvlist_find_vdev_guid(const nvlist_t *nvlist, uint64_t guid)
{
nvlist_t *vdevs;
bool rv;
if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL,
&vdevs, NULL) != 0)
return (false);
rv = nvlist_find_child_guid(vdevs, guid);
nvlist_destroy(vdevs);
return (rv);
}
static spa_t *
spa_find_by_guid(uint64_t guid)
{
@@ -2012,7 +2078,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
{
vdev_t vtmp;
spa_t *spa;
vdev_t *vdev;
vdev_t *vdev, *top;
nvlist_t *nvl;
uint64_t val;
uint64_t guid, pool_guid, top_guid, txg;
@@ -2109,20 +2175,45 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
nvlist_destroy(nvl);
return (ENOMEM);
}
} else {
struct vdev *kid;
}
STAILQ_FOREACH(kid, &spa->spa_root_vdev->v_children,
v_childlink)
if (kid->v_guid == top_guid && kid->v_txg < txg) {
printf("ZFS: pool %s vdev %s ignoring stale "
"label from txg 0x%jx, using 0x%jx@0x%jx\n",
spa->spa_name, kid->v_name,
kid->v_txg, guid, txg);
STAILQ_REMOVE(&spa->spa_root_vdev->v_children,
kid, vdev, v_childlink);
vdev_free(kid);
/*
* Check if configuration is already known. If configuration is known
* and txg numbers don't match, we got 2x2 scenarios here. First, is
* the label being read right now _newer_ than the one read before.
* Second, is the vdev that provided the stale label _present_ in the
* newer configuration. If neither is true, we completely ignore the
* label.
*/
STAILQ_FOREACH(top, &spa->spa_root_vdev->v_children, v_childlink)
if (top->v_guid == top_guid) {
bool newer, present;
if (top->v_txg == txg)
break;
newer = (top->v_txg < txg);
present = newer ?
nvlist_find_vdev_guid(nvl, top->v_label) :
(vdev_find(&top->v_children, guid) != NULL);
printf("ZFS: pool %s vdev %s %s stale label from "
"0x%jx@0x%jx, %s 0x%jx@0x%jx\n",
spa->spa_name, top->v_name,
present ? "using" : "ignoring",
newer ? top->v_label : guid,
newer ? top->v_txg : txg,
present ? "referred by" : "using",
newer ? guid : top->v_label,
newer ? txg : top->v_txg);
if (newer) {
STAILQ_REMOVE(&spa->spa_root_vdev->v_children,
top, vdev, v_childlink);
vdev_free(top);
break;
} else if (present) {
break;
} else {
nvlist_destroy(nvl);
return (EIO);
}
}
+2 -1
View File
@@ -2024,7 +2024,8 @@ typedef struct vdev {
vdev_list_t v_children; /* children of this vdev */
const char *v_name; /* vdev name */
uint64_t v_guid; /* vdev guid */
uint64_t v_txg; /* most recent transaction */
uint64_t v_label; /* label instantiated from (top vdev) */
uint64_t v_txg; /* most recent transaction (top vdev) */
uint64_t v_id; /* index in parent */
uint64_t v_psize; /* physical device capacity */
int v_ashift; /* offset to block shift */