diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c
index d68272bea731..58819118b92e 100644
--- a/cmd/zed/agents/zfs_retire.c
+++ b/cmd/zed/agents/zfs_retire.c
@@ -395,13 +395,15 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 	    &state);
 
 	/*
-	 * If this is a resource notifying us of device removal then simply
-	 * check for an available spare and continue unless the device is a
-	 * l2arc vdev, in which case we just offline it.
+	 * If this is a resource notifying us of device removal or a device
+	 * that can't be opened (UNAVAIL), then check for an available spare
+	 * and continue unless the device is a l2arc vdev, in which case we
+	 * just offline it.
 	 */
 	if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
 	    (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
-	    (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED))) {
+	    (state == VDEV_STATE_REMOVED || state == VDEV_STATE_FAULTED ||
+	    state == VDEV_STATE_CANT_OPEN))) {
 		const char *devtype;
 		char *devname;
 		boolean_t skip_removal = B_FALSE;
diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h
index bcb98af40067..f7eeddd21692 100644
--- a/include/sys/dsl_scan.h
+++ b/include/sys/dsl_scan.h
@@ -76,9 +76,11 @@ typedef struct dsl_scan_phys {
 typedef enum dsl_scan_flags {
 	DSF_VISIT_DS_AGAIN = 1<<0,
 	DSF_SCRUB_PAUSED = 1<<1,
+	DSF_SORTED_SCAN = 1<<2,	/* scan is using sorted (sequential) method */
 } dsl_scan_flags_t;
 
-#define	DSL_SCAN_FLAGS_MASK	(DSF_VISIT_DS_AGAIN)
+#define	DSL_SCAN_FLAGS_MASK \
+	(DSF_VISIT_DS_AGAIN | DSF_SCRUB_PAUSED | DSF_SORTED_SCAN)
 
 typedef struct dsl_errorscrub_phys {
 	uint64_t dep_func; /* pool_scan_func_t */
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 87422658cf23..05cc8062ed54 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -953,6 +953,8 @@ typedef struct zpool_load_policy {
 	"org.openzfs:raidz_expand_end_time"
 #define	VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED \
 	"org.openzfs:raidz_expand_bytes_copied"
+#define	VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_TO_COPY \
+	"org.openzfs:raidz_expand_bytes_to_copy"
 
 /* vdev metaslab allocation bias */
 #define	VDEV_ALLOC_BIAS_LOG		"log"
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index df8c2aed4045..820fb818fb2f 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -129,6 +129,13 @@ typedef struct vdev_raidz_expand {
 	uint64_t vre_start_time;
 	uint64_t vre_end_time;
 	uint64_t vre_bytes_copied;
+	/*
+	 * Total bytes to copy, captured at expansion start to prevent
+	 * progress from exceeding 100% when new data is written during
+	 * expansion. This field is 0 for backward compatibility with older
+	 * pools that didn't capture this value at expansion start.
+	 */
+	uint64_t vre_bytes_to_copy;
 } vdev_raidz_expand_t;
 
 typedef struct vdev_raidz {
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index ae36161dd1b6..9d704d961a45 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -628,6 +628,11 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 	memcpy(&scn->scn_phys_cached, &scn->scn_phys,
 	    sizeof (scn->scn_phys));
 
+	/* Restore scan method from persisted flags */
+	if (scn->scn_phys.scn_flags & DSF_SORTED_SCAN) {
+		scn->scn_is_sorted = B_TRUE;
+	}
+
 	/* reload the queue into the in-core state */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		zap_cursor_t zc;
@@ -1132,6 +1137,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 	if (scn->scn_is_sorted) {
 		scan_io_queues_destroy(scn);
 		scn->scn_is_sorted = B_FALSE;
+		scn->scn_phys.scn_flags &= ~DSF_SORTED_SCAN;
 
 		if (scn->scn_taskq != NULL) {
 			taskq_destroy(scn->scn_taskq);
@@ -4499,6 +4505,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		 */
 		if (!zfs_scan_legacy) {
 			scn->scn_is_sorted = B_TRUE;
+			scn->scn_phys.scn_flags |= DSF_SORTED_SCAN;
 			if (scn->scn_last_checkpoint == 0)
 				scn->scn_last_checkpoint = ddi_get_lbolt();
 		}
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 520ddd692bda..5b4d5744a8cf 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -5148,6 +5148,7 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
 	vdrz->vn_vre.vre_end_time = 0;
 	vdrz->vn_vre.vre_state = DSS_SCANNING;
 	vdrz->vn_vre.vre_bytes_copied = 0;
+	vdrz->vn_vre.vre_bytes_to_copy = raidvd->vdev_stat.vs_alloc;
 
 	uint64_t state = vdrz->vn_vre.vre_state;
 	VERIFY0(zap_update(spa->spa_meta_objset,
@@ -5159,6 +5160,11 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
 	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
 	    sizeof (start_time), 1, &start_time, tx));
 
+	uint64_t bytes_to_copy = vdrz->vn_vre.vre_bytes_to_copy;
+	VERIFY0(zap_update(spa->spa_meta_objset,
+	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_TO_COPY,
+	    sizeof (bytes_to_copy), 1, &bytes_to_copy, tx));
+
 	(void) zap_remove(spa->spa_meta_objset, raidvd->vdev_top_zap,
 	    VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
 	(void) zap_remove(spa->spa_meta_objset,
@@ -5180,6 +5186,7 @@ vdev_raidz_load(vdev_t *vd)
 	uint64_t start_time = 0;
 	uint64_t end_time = 0;
 	uint64_t bytes_copied = 0;
+	uint64_t bytes_to_copy = 0;
 
 	if (vd->vdev_top_zap != 0) {
 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
@@ -5205,6 +5212,12 @@ vdev_raidz_load(vdev_t *vd)
 		    sizeof (bytes_copied), 1, &bytes_copied);
 		if (err != 0 && err != ENOENT)
 			return (err);
+
+		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_TO_COPY,
+		    sizeof (bytes_to_copy), 1, &bytes_to_copy);
+		if (err != 0 && err != ENOENT)
+			return (err);
 	}
 
 	/*
@@ -5216,6 +5229,7 @@ vdev_raidz_load(vdev_t *vd)
 	vdrz->vn_vre.vre_start_time = start_time;
 	vdrz->vn_vre.vre_end_time = end_time;
 	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
+	vdrz->vn_vre.vre_bytes_to_copy = bytes_to_copy;
 
 	return (0);
 }
@@ -5250,7 +5264,14 @@ spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
 	pres->pres_expanding_vdev = vre->vre_vdev_id;
 
 	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
-	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
+	/*
+	 * Use the persisted bytes_to_copy value if available (captured at
+	 * expansion start) to prevent progress from exceeding 100% when new
+	 * data is written during expansion. Fall back to current vs_alloc
+	 * for backward compatibility with older pools.
+	 */
+	pres->pres_to_reflow = vre->vre_bytes_to_copy != 0 ?
+	    vre->vre_bytes_to_copy : vd->vdev_stat.vs_alloc;
 
 	mutex_enter(&vre->vre_lock);
 	pres->pres_reflowed = vre->vre_bytes_copied;
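
Illustrative sketch only, not part of the patch above: it shows how a consumer of spa_raidz_expand_get_stats() (for example, the zpool status reporting path) could turn pres_reflowed and pres_to_reflow into a completion percentage. The helper name, the clamp, and the sample numbers are assumptions for this example; the point is that a to_reflow value captured at expansion start keeps the reported figure from drifting past 100% when new writes land during the reflow.

/* Hypothetical example, not OpenZFS code. */
#include <stdint.h>
#include <stdio.h>

static unsigned int
raidz_expand_pct_done(uint64_t reflowed, uint64_t to_reflow)
{
	if (to_reflow == 0)
		return (0);	/* nothing recorded to copy yet */
	if (reflowed >= to_reflow)
		return (100);	/* clamp: never report more than 100% */
	return ((unsigned int)(reflowed * 100 / to_reflow));
}

int
main(void)
{
	/* Example numbers only: 300 GiB reflowed out of 1 TiB to copy. */
	uint64_t reflowed = 300ULL << 30;
	uint64_t to_reflow = 1ULL << 40;

	printf("expansion %u%% done\n",
	    raidz_expand_pct_done(reflowed, to_reflow));
	return (0);
}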