diff --git a/include/sys/arc.h b/include/sys/arc.h index 37674eff4f66..2b3668c60868 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -354,8 +354,6 @@ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); void l2arc_init(void); void l2arc_fini(void); -void l2arc_start(void); -void l2arc_stop(void); void l2arc_spa_rebuild_start(spa_t *spa); void l2arc_spa_rebuild_stop(spa_t *spa); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index b55d5da3378c..2c6d79922ab8 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -41,6 +41,27 @@ extern "C" { #endif +/* + * We can feed L2ARC from two states of ARC buffers, mru and mfu, + * and each of the state has two types: data and metadata. + */ +#define L2ARC_FEED_TYPES 4 + +/* + * L2ARC state and statistics for persistent marker management. + */ +typedef struct l2arc_info { + arc_buf_hdr_t **l2arc_markers[L2ARC_FEED_TYPES]; + uint64_t l2arc_total_writes; /* total writes for reset */ + uint64_t l2arc_total_capacity; /* total L2ARC capacity */ + uint64_t l2arc_smallest_capacity; /* smallest device capacity */ + /* + * Per-device thread coordination for sublist processing + */ + boolean_t *l2arc_sublist_busy[L2ARC_FEED_TYPES]; + kmutex_t l2arc_sublist_lock; /* protects busy flags */ +} l2arc_info_t; + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -421,6 +442,19 @@ typedef struct l2arc_dev { */ zfs_refcount_t l2ad_lb_count; boolean_t l2ad_trim_all; /* TRIM whole device */ + /* + * DWPD tracking with daily reset + */ + uint64_t l2ad_dwpd_writes; /* 24h bytes written */ + uint64_t l2ad_dwpd_start; /* 24h period start */ + uint64_t l2ad_dwpd_accumulated; /* Accumulated */ + /* + * Per-device feed thread for parallel L2ARC writes + */ + kthread_t *l2ad_feed_thread; /* feed thread handle */ + boolean_t l2ad_thread_exit; /* signal thread exit */ + kmutex_t l2ad_feed_thr_lock; /* thread sleep/wake */ + kcondvar_t l2ad_feed_cv; /* thread wakeup cv */ } l2arc_dev_t; /* diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 62b062984d36..8d8e565ccf88 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -283,6 +284,7 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ boolean_t spa_aux_sync_uber; /* need to sync aux uber */ + l2arc_info_t spa_l2arc_info; /* L2ARC state and stats */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index a2faec4e18c4..5dea7f806df0 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -109,8 +109,7 @@ Seconds between L2ARC writing. . .It Sy l2arc_headroom Ns = Ns Sy 8 Pq u64 How far through the ARC lists to search for L2ARC cacheable content, -expressed as a multiplier of -.Sy l2arc_write_max . +expressed as a multiplier of the effective write size. ARC persistence across reboots can be achieved with persistent L2ARC by setting this parameter to .Sy 0 , @@ -125,6 +124,19 @@ A value of .Sy 100 disables this feature. . +.It Sy l2arc_dwpd_limit Ns = Ns Sy 100 Pq uint +Drive Writes Per Day limit for L2ARC devices to protect SSD endurance, +specified as a percentage where 100 equals 1.0 DWPD. +A value of 100 means each L2ARC device can write its own capacity once per day. 
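+For example, a 1 TiB cache device at the default value of 100 is budgeted
+roughly 1 TiB of L2ARC writes per 24 hours.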
+Lower values support fractional DWPD +(50 = 0.5 DWPD, 30 = 0.3 DWPD for QLC SSDs). +Higher values allow more writes (300 = 3.0 DWPD). +The effective write rate is always bounded by +.Sy l2arc_write_max . +A value of 0 disables DWPD rate limiting entirely. +DWPD limiting only applies after the initial fill pass completes and when +total L2ARC capacity meets the persist threshold. +. .It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether buffers present on special vdevs are eligible for caching into L2ARC. @@ -179,9 +191,8 @@ can render it slow or unusable. This parameter limits L2ARC writes and rebuilds to achieve the target. . .It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq u64 -Trims ahead of the current write size -.Pq Sy l2arc_write_max -on L2ARC devices by this percentage of write size if we have filled the device. +Trims ahead of the current write size on L2ARC devices by this percentage +of write size if we have filled the device. If set to .Sy 100 we TRIM twice the space required to accommodate upcoming writes. @@ -216,13 +227,12 @@ to enable caching/reading prefetches to/from L2ARC. .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . -.It Sy l2arc_write_boost Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 -Cold L2ARC devices will have -.Sy l2arc_write_max -increased by this amount while they remain cold. -. .It Sy l2arc_write_max Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 -Max write bytes per interval. +Maximum write rate in bytes per second for each L2ARC device. +Used directly during initial fill, when DWPD limiting is disabled, +or for non-persistent L2ARC. +When DWPD limiting is active, writes are capped by this rate. +Total L2ARC throughput scales with the number of cache devices in a pool. . .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Rebuild the L2ARC when importing a pool (persistent L2ARC). diff --git a/module/zfs/arc.c b/module/zfs/arc.c index d73678801273..b5be8c3ed0d8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -821,6 +821,8 @@ typedef struct arc_async_flush { */ #define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ +#define L2ARC_MIN_WRITE_SIZE (1 * 1024 * 1024) /* minimal write rate */ +#define L2ARC_BURST_SIZE_MAX (50 * 1024 * 1024) /* max burst size */ #define L2ARC_HEADROOM 8 /* num of writes */ /* @@ -832,14 +834,17 @@ typedef struct arc_async_flush { #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* - * We can feed L2ARC from two states of ARC buffers, mru and mfu, - * and each of the state has two types: data and metadata. + * Min L2ARC capacity to enable persistent markers, adaptive intervals, and + * DWPD rate limiting. Markers reset after capacity/8 writes. With this + * threshold (arc_c_max/2), minimum progress per cycle is: + * (arc_c_max/2)/8 = arc_c_max/16 (~6% of ARC). Below this, marker + * overhead isn't justified by the limited progress made. 
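+ * For example, with arc_c_max = 64 GiB, markers engage once total L2ARC
+ * capacity reaches 32 GiB, guaranteeing at least 4 GiB of progress per cycle.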
*/ -#define L2ARC_FEED_TYPES 4 +#define L2ARC_PERSIST_THRESHOLD (arc_c_max / 2) /* L2ARC Performance Tunables */ static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +static uint64_t l2arc_dwpd_limit = 100; /* 100 = 1.0 DWPD */ static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ @@ -855,7 +860,6 @@ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ -static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -874,6 +878,7 @@ typedef struct l2arc_data_free { abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; + l2arc_dev_t *l2df_dev; /* L2ARC device that owns this ABD */ list_node_t l2df_list_node; } l2arc_data_free_t; @@ -891,10 +896,6 @@ typedef enum arc_ovf_level { ARC_OVF_SEVERE /* ARC is severely overflowed. */ } arc_ovf_level_t; -static kmutex_t l2arc_feed_thr_lock; -static kcondvar_t l2arc_feed_thr_cv; -static uint8_t l2arc_thread_exit; - static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; @@ -926,9 +927,10 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static void l2arc_do_free_on_write(void); +static void l2arc_do_free_on_write(l2arc_dev_t *dev); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); +static uint64_t l2arc_get_write_rate(l2arc_dev_t *dev); static void arc_prune_async(uint64_t adjust); @@ -2938,13 +2940,15 @@ arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) } static void -l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) +l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type, + l2arc_dev_t *dev) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; + df->l2df_dev = dev; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); @@ -2973,10 +2977,17 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) arc_space_return(size, ARC_SPACE_DATA); } + /* + * L2HDR must exist since we're freeing an L2ARC-related ABD. + */ + ASSERT(HDR_HAS_L2HDR(hdr)); + if (free_rdata) { - l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); + l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type, + hdr->b_l2hdr.b_dev); } else { - l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type, + hdr->b_l2hdr.b_dev); } } @@ -3654,7 +3665,13 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); + boolean_t l1hdr_destroyed = B_FALSE; + /* + * If L2_WRITING, destroy L1HDR before L2HDR (under mutex) so + * arc_hdr_free_abd() can properly defer ABDs. Otherwise, destroy + * L1HDR outside mutex to minimize contention. 
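+	 * ABDs deferred under the mutex land on the per-device
+	 * free-on-write list and are released later from l2arc_write_done().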
+ */ if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -3672,9 +3689,26 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { + if (HDR_L2_WRITING(hdr)) { + l1hdr_destroyed = B_TRUE; - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); + + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl( + hdr->b_l1hdr.b_buf); + + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_FALSE); + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); + } + } arc_hdr_l2hdr_destroy(hdr); } @@ -3683,26 +3717,22 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } - /* - * The header's identify can only be safely discarded once it is no - * longer discoverable. This requires removing it from the hash table - * and the l2arc header list. After this point the hash lock can not - * be used to protect the header. - */ - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); + if (!l1hdr_destroyed) { + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); - if (HDR_HAS_L1HDR(hdr)) { - arc_cksum_free(hdr); + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); - while (hdr->b_l1hdr.b_buf != NULL) - arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); - if (hdr->b_l1hdr.b_pabd != NULL) - arc_hdr_free_abd(hdr, B_FALSE); + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_FALSE); - if (HDR_HAS_RABD(hdr)) - arc_hdr_free_abd(hdr, B_TRUE); + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); + } } ASSERT0P(hdr->b_hash_next); @@ -6648,9 +6678,12 @@ arc_release(arc_buf_t *buf, const void *tag) ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); /* - * Do we have more than one buf? + * Do we have more than one buf? Or L2_WRITING with unshared data? + * Single-buf L2_WRITING with shared data can reuse the header since + * L2ARC uses its own transformed copy. */ - if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { + if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf) || + (HDR_L2_WRITING(hdr) && !ARC_BUF_SHARED(buf))) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); @@ -6658,6 +6691,8 @@ arc_release(arc_buf_t *buf, const void *tag) boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); + boolean_t single_buf_l2writing = (hdr->b_l1hdr.b_buf == buf && + ARC_BUF_LAST(buf) && HDR_L2_WRITING(hdr)); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); @@ -6667,49 +6702,61 @@ arc_release(arc_buf_t *buf, const void *tag) /* * Pull the buffer off of this hdr and find the last buffer * in the hdr's buffer list. + * + * For single_buf_l2writing, remove the buffer first to ensure + * evictable space accounting sees consistent state. */ - VERIFY3S(remove_reference(hdr, tag), >, 0); - arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); - ASSERT3P(lastbuf, !=, NULL); + arc_buf_t *lastbuf; + if (single_buf_l2writing) { + (void) arc_buf_remove(hdr, buf); + } else { + VERIFY3S(remove_reference(hdr, tag), >, 0); + lastbuf = arc_buf_remove(hdr, buf); + ASSERT3P(lastbuf, !=, NULL); + } /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. 
*/ - if (ARC_BUF_SHARED(buf)) { - ASSERT(!arc_buf_is_shared(lastbuf)); + if (!single_buf_l2writing) { + if (ARC_BUF_SHARED(buf)) { + ASSERT(!arc_buf_is_shared(lastbuf)); - /* - * First, sever the block sharing relationship between - * buf and the arc_buf_hdr_t. - */ - arc_unshare_buf(hdr, buf); + /* + * First, sever the block sharing relationship + * between buf and the arc_buf_hdr_t. + */ + arc_unshare_buf(hdr, buf); - /* - * Now we need to recreate the hdr's b_pabd. Since we - * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pabd and copy the - * data from buf into it. - */ - if (arc_can_share(hdr, lastbuf)) { - arc_share_buf(hdr, lastbuf); - } else { - arc_hdr_alloc_abd(hdr, 0); - abd_copy_from_buf(hdr->b_l1hdr.b_pabd, - buf->b_data, psize); + /* + * Now we need to recreate the hdr's b_pabd. + * Since we have lastbuf handy, we try to share + * with it, but if we can't then we allocate a + * new b_pabd and copy the data from buf into it + */ + if (arc_can_share(hdr, lastbuf)) { + arc_share_buf(hdr, lastbuf); + } else { + arc_hdr_alloc_abd(hdr, 0); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, + buf->b_data, psize); + } + } else if (HDR_SHARED_DATA(hdr)) { + /* + * Uncompressed shared buffers are always at the + * end of the list. Compressed buffers don't + * have the same requirements. This makes it + * hard to simply assert that the lastbuf is + * shared so we rely on the hdr's compression + * flags to determine if we have a compressed, + * shared buffer. + */ + ASSERT(arc_buf_is_shared(lastbuf) || + arc_hdr_get_compress(hdr) != + ZIO_COMPRESS_OFF); + ASSERT(!arc_buf_is_shared(buf)); } - } else if (HDR_SHARED_DATA(hdr)) { - /* - * Uncompressed shared buffers are always at the end - * of the list. Compressed buffers don't have the - * same requirements. This makes it hard to - * simply assert that the lastbuf is shared so - * we rely on the hdr's compression flags to determine - * if we have a compressed, shared buffer. - */ - ASSERT(arc_buf_is_shared(lastbuf) || - arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); - ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); @@ -6739,6 +6786,12 @@ arc_release(arc_buf_t *buf, const void *tag) (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); + + if (single_buf_l2writing) { + mutex_enter(hash_lock); + VERIFY3S(remove_reference(hdr, tag), ==, 0); + mutex_exit(hash_lock); + } } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ @@ -8184,8 +8237,9 @@ arc_fini(void) * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. + * Pass NULL to free all ABDs regardless of device. 
*/ - l2arc_do_free_on_write(); + l2arc_do_free_on_write(NULL); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may @@ -8330,7 +8384,7 @@ arc_fini(void) * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval - * l2arc_write_boost extra write bytes during device warmup + * l2arc_dwpd_limit device write endurance limit (100 = 1.0 DWPD) * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC @@ -8347,7 +8401,6 @@ arc_fini(void) * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write - * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. @@ -8468,24 +8521,23 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) } static uint64_t -l2arc_write_size(l2arc_dev_t *dev) +l2arc_write_size(l2arc_dev_t *dev, clock_t *interval) { uint64_t size; - - /* - * Make sure our globals have meaningful values in case the user - * altered them. - */ - size = l2arc_write_max; - if (size == 0) { - cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " - "resetting it to the default (%d)", L2ARC_WRITE_SIZE); - size = l2arc_write_max = L2ARC_WRITE_SIZE; + uint64_t write_rate = l2arc_get_write_rate(dev); + + if (write_rate > L2ARC_BURST_SIZE_MAX && + dev->l2ad_spa->spa_l2arc_info.l2arc_total_capacity >= + L2ARC_PERSIST_THRESHOLD) { + /* Calculate interval to achieve desired rate with burst cap */ + uint64_t feeds_per_sec = write_rate / L2ARC_BURST_SIZE_MAX; + *interval = hz / feeds_per_sec; + size = L2ARC_BURST_SIZE_MAX; + } else { + *interval = hz; /* 1 second default */ + size = write_rate; } - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - /* We need to add in the worst case scenario of log block overhead. */ size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { @@ -8510,115 +8562,26 @@ l2arc_write_size(l2arc_dev_t *dev) } -static clock_t -l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) -{ - clock_t interval, next, now; - - /* - * If the ARC lists are busy, increase our write rate; if the - * lists are stale, idle back. This is achieved by checking - * how much we previously wrote - if it was more than half of - * what we wanted, schedule the next write much sooner. - */ - if (l2arc_feed_again && wrote > (wanted / 2)) - interval = (hz * l2arc_feed_min_ms) / 1000; - else - interval = hz * l2arc_feed_secs; - - now = ddi_get_lbolt(); - next = MAX(now, MIN(now + interval, began + interval)); - - return (next); -} - -static boolean_t -l2arc_dev_invalid(const l2arc_dev_t *dev) -{ - /* - * We want to skip devices that are being rebuilt, trimmed, - * removed, or belong to a spa that is being exported. - */ - return (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev) || - dev->l2ad_rebuild || dev->l2ad_trim_all || - dev->l2ad_spa == NULL || dev->l2ad_spa->spa_is_exporting); -} - -/* - * Cycle through L2ARC devices. This is how L2ARC load balances. - * If a device is returned, this also returns holding the spa config lock. - */ -static l2arc_dev_t * -l2arc_dev_get_next(void) -{ - l2arc_dev_t *first, *next = NULL; - - /* - * Lock out the removal of spas (spa_namespace_lock), then removal - * of cache devices (l2arc_dev_mtx). 
Once a device has been selected, - * both locks will be dropped and a spa config lock held instead. - */ - spa_namespace_enter(FTAG); - mutex_enter(&l2arc_dev_mtx); - - /* if there are no vdevs, there is nothing to do */ - if (l2arc_ndev == 0) - goto out; - - first = NULL; - next = l2arc_dev_last; - do { - /* loop around the list looking for a non-faulted vdev */ - if (next == NULL) { - next = list_head(l2arc_dev_list); - } else { - next = list_next(l2arc_dev_list, next); - if (next == NULL) - next = list_head(l2arc_dev_list); - } - - /* if we have come back to the start, bail out */ - if (first == NULL) - first = next; - else if (next == first) - break; - - ASSERT3P(next, !=, NULL); - } while (l2arc_dev_invalid(next)); - - /* if we were unable to find any usable vdevs, return NULL */ - if (l2arc_dev_invalid(next)) - next = NULL; - - l2arc_dev_last = next; - -out: - mutex_exit(&l2arc_dev_mtx); - - /* - * Grab the config lock to prevent the 'next' device from being - * removed while we are writing to it. - */ - if (next != NULL) - spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); - spa_namespace_exit(FTAG); - - return (next); -} - /* * Free buffers that were tagged for destruction. */ static void -l2arc_do_free_on_write(void) +l2arc_do_free_on_write(l2arc_dev_t *dev) { - l2arc_data_free_t *df; + l2arc_data_free_t *df, *df_next; + boolean_t all = (dev == NULL); mutex_enter(&l2arc_free_on_write_mtx); - while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { - ASSERT3P(df->l2df_abd, !=, NULL); - abd_free(df->l2df_abd); - kmem_free(df, sizeof (l2arc_data_free_t)); + df = list_head(l2arc_free_on_write); + while (df != NULL) { + df_next = list_next(l2arc_free_on_write, df); + if (all || df->l2df_dev == dev) { + list_remove(l2arc_free_on_write, df); + ASSERT3P(df->l2df_abd, !=, NULL); + abd_free(df->l2df_abd); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + df = df_next; } mutex_exit(&l2arc_free_on_write_mtx); } @@ -8806,7 +8769,7 @@ l2arc_write_done(zio_t *zio) ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); - l2arc_do_free_on_write(); + l2arc_do_free_on_write(dev); kmem_free(cb, sizeof (l2arc_write_callback_t)); } @@ -9044,48 +9007,139 @@ l2arc_read_done(zio_t *zio) } /* - * This is the list priority from which the L2ARC will search for pages to - * cache. This is used within loops (0..3) to cycle through lists in the - * desired order. This order can have a significant effect on cache - * performance. + * Get the multilist for the given list number (0..3) to cycle through + * lists in the desired order. This order can have a significant effect + * on cache performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by - * the data lists. This function returns a locked list, and also returns - * the lock pointer. + * the data lists. 
*/ -static multilist_sublist_t * -l2arc_sublist_lock(int list_num) +static multilist_t * +l2arc_get_list(int list_num) { - multilist_t *ml = NULL; - unsigned int idx; - ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: - ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - break; + return (&arc_mfu->arcs_list[ARC_BUFC_METADATA]); case 1: - ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - break; + return (&arc_mru->arcs_list[ARC_BUFC_METADATA]); case 2: - ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - break; + return (&arc_mfu->arcs_list[ARC_BUFC_DATA]); case 3: - ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; - break; + return (&arc_mru->arcs_list[ARC_BUFC_DATA]); default: return (NULL); } +} - /* - * Return a randomly-selected sublist. This is acceptable - * because the caller feeds only a little bit of data for each - * call (8MB). Subsequent calls will result in different - * sublists being selected. - */ - idx = multilist_get_random_index(ml); - return (multilist_sublist_lock_idx(ml, idx)); + +/* + * Lock a specific sublist within the given list number. + */ +static multilist_sublist_t * +l2arc_sublist_lock(int list_num, int sublist_idx) +{ + multilist_t *ml = l2arc_get_list(list_num); + if (ml == NULL) + return (NULL); + + return (multilist_sublist_lock_idx(ml, sublist_idx)); +} + +/* + * Check if a pool has any L2ARC devices. + */ +static boolean_t +l2arc_pool_has_devices(spa_t *target_spa) +{ + l2arc_dev_t *dev; + + ASSERT(MUTEX_HELD(&l2arc_dev_mtx)); + + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_spa == target_spa) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * Initialize pool-based markers for l2arc position saving. + */ +static void +l2arc_pool_markers_init(spa_t *spa) +{ + mutex_init(&spa->spa_l2arc_info.l2arc_sublist_lock, NULL, + MUTEX_DEFAULT, NULL); + + for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { + multilist_t *ml = l2arc_get_list(pass); + if (ml == NULL) + continue; + + int num_sublists = multilist_get_num_sublists(ml); + + spa->spa_l2arc_info.l2arc_markers[pass] = + arc_state_alloc_markers(num_sublists); + spa->spa_l2arc_info.l2arc_sublist_busy[pass] = + kmem_zalloc(num_sublists * sizeof (boolean_t), KM_SLEEP); + + for (int i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = + multilist_sublist_lock_idx(ml, i); + multilist_sublist_insert_tail(mls, + spa->spa_l2arc_info.l2arc_markers[pass][i]); + multilist_sublist_unlock(mls); + } + } +} + +/* + * Free all allocated pool-based markers. 
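+ * Called under l2arc_dev_mtx when the last L2ARC device of the pool is
+ * removed.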
+ */ +static void +l2arc_pool_markers_fini(spa_t *spa) +{ + for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { + if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL) + continue; + + multilist_t *ml = l2arc_get_list(pass); + if (ml == NULL) + continue; + + int num_sublists = multilist_get_num_sublists(ml); + + for (int i = 0; i < num_sublists; i++) { + ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i], + !=, NULL); + multilist_sublist_t *mls = + multilist_sublist_lock_idx(ml, i); + ASSERT(multilist_link_active( + &spa->spa_l2arc_info.l2arc_markers[pass][i]-> + b_l1hdr.b_arc_node)); + multilist_sublist_remove(mls, + spa->spa_l2arc_info.l2arc_markers[pass][i]); + multilist_sublist_unlock(mls); + } + + arc_state_free_markers(spa->spa_l2arc_info.l2arc_markers[pass], + num_sublists); + spa->spa_l2arc_info.l2arc_markers[pass] = NULL; + + /* Free sublist busy flags for this pass */ + ASSERT3P(spa->spa_l2arc_info.l2arc_sublist_busy[pass], !=, + NULL); + kmem_free(spa->spa_l2arc_info.l2arc_sublist_busy[pass], + num_sublists * sizeof (boolean_t)); + spa->spa_l2arc_info.l2arc_sublist_busy[pass] = NULL; + } + + mutex_destroy(&spa->spa_l2arc_info.l2arc_sublist_lock); } /* @@ -9113,6 +9167,67 @@ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) } } +/* + * Calculate DWPD rate limit for L2ARC device. + */ +static uint64_t +l2arc_dwpd_rate_limit(l2arc_dev_t *dev) +{ + uint64_t device_size = dev->l2ad_end - dev->l2ad_start; + uint64_t daily_budget = (device_size * l2arc_dwpd_limit) / 100; + uint64_t now = gethrestime_sec(); + + /* Reset every 24 hours */ + if ((now - dev->l2ad_dwpd_start) >= 24 * 3600) { + /* Save unused budget from previous period (max 1 day) */ + dev->l2ad_dwpd_accumulated = MIN(daily_budget, + daily_budget - dev->l2ad_dwpd_writes); + dev->l2ad_dwpd_writes = 0; + dev->l2ad_dwpd_start = now; + } + + uint64_t elapsed = now - dev->l2ad_dwpd_start; + uint64_t dwpd_budget = daily_budget / (24 * 3600); + uint64_t expected_writes = elapsed * dwpd_budget; + + uint64_t available_budget = dwpd_budget + dev->l2ad_dwpd_accumulated; + if (expected_writes > dev->l2ad_dwpd_writes) { + /* Add unused budget from current period */ + available_budget += expected_writes - dev->l2ad_dwpd_writes; + } + + return (available_budget); +} + +/* + * Get write rate based on device state and DWPD configuration. + */ +static uint64_t +l2arc_get_write_rate(l2arc_dev_t *dev) +{ + uint64_t write_max = l2arc_write_max; + spa_t *spa = dev->l2ad_spa; + + /* + * Make sure l2arc_write_max is valid in case user altered it. + */ + if (write_max == 0) { + cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " + "resetting it to the default (%d)", L2ARC_WRITE_SIZE); + write_max = l2arc_write_max = L2ARC_WRITE_SIZE; + } + + /* Apply DWPD rate limit for persistent marker configurations */ + if (!dev->l2ad_first && l2arc_dwpd_limit > 0 && + spa->spa_l2arc_info.l2arc_total_capacity >= + L2ARC_PERSIST_THRESHOLD) { + uint64_t dwpd_rate = l2arc_dwpd_rate_limit(dev); + return (MIN(dwpd_rate, write_max)); + } + + return (write_max); +} + /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. @@ -9322,6 +9437,13 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; + /* + * Reset DWPD counters - first pass writes are free, start + * fresh 24h budget period now that device is full. 
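+		 * With l2ad_first now cleared, l2arc_get_write_rate() starts
+		 * applying the DWPD budget (when enabled) to later writes.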
+ */ + dev->l2ad_dwpd_writes = 0; + dev->l2ad_dwpd_start = gethrestime_sec(); + dev->l2ad_dwpd_accumulated = 0; goto top; } @@ -9457,6 +9579,245 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, return (ret); } +/* + * Write buffers from a single sublist to L2ARC. + * Handles locking, marker determination, and buffer processing. + * Returns B_TRUE if target size reached, B_FALSE otherwise. + */ +static boolean_t +l2arc_write_sublist(spa_t *spa, l2arc_dev_t *dev, int pass, int sublist_idx, + uint64_t target_sz, uint64_t *write_asize, uint64_t *write_psize, + zio_t **pio, l2arc_write_callback_t **cb, arc_buf_hdr_t *head, + uint64_t *consumed, uint64_t sublist_headroom, boolean_t save_position) +{ + multilist_sublist_t *mls; + arc_buf_hdr_t *hdr, *prev_hdr; + arc_buf_hdr_t *persistent_marker, *local_marker; + boolean_t full = B_FALSE; + boolean_t scan_from_head = B_FALSE; + uint64_t guid = spa_load_guid(spa); + + mls = l2arc_sublist_lock(pass, sublist_idx); + ASSERT3P(mls, !=, NULL); + + persistent_marker = spa->spa_l2arc_info. + l2arc_markers[pass][sublist_idx]; + + if (save_position && persistent_marker == multilist_sublist_head(mls)) { + multilist_sublist_unlock(mls); + return (B_FALSE); + } + + local_marker = arc_state_alloc_marker(); + + if (save_position) { + hdr = multilist_sublist_prev(mls, persistent_marker); + ASSERT3P(hdr, !=, NULL); + scan_from_head = B_FALSE; + } else { + if (arc_warm) { + hdr = multilist_sublist_tail(mls); + scan_from_head = B_FALSE; + } else { + hdr = multilist_sublist_head(mls); + scan_from_head = B_TRUE; + } + ASSERT3P(hdr, !=, NULL); + } + + prev_hdr = hdr; + + while (hdr != NULL) { + kmutex_t *hash_lock; + abd_t *to_write = NULL; + prev_hdr = hdr; + + hash_lock = HDR_LOCK(hdr); + if (!mutex_tryenter(hash_lock)) { +skip: + /* Skip this buffer rather than waiting. */ + if (scan_from_head) + hdr = multilist_sublist_next(mls, hdr); + else + hdr = multilist_sublist_prev(mls, hdr); + continue; + } + + if (l2arc_headroom != 0 && + *consumed + HDR_GET_LSIZE(hdr) > + MAX(sublist_headroom, HDR_GET_LSIZE(hdr))) { + /* + * Searched too far in this sublist. + */ + mutex_exit(hash_lock); + break; + } + + *consumed += HDR_GET_LSIZE(hdr); + + if (!l2arc_write_eligible(guid, hdr)) { + mutex_exit(hash_lock); + goto skip; + } + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3U(arc_hdr_size(hdr), >, 0); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (*write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + /* + * We should not sleep with sublist lock held or it + * may block ARC eviction. Insert a marker to save + * the position and drop the lock. + */ + if (scan_from_head) + multilist_sublist_insert_after(mls, hdr, local_marker); + else + multilist_sublist_insert_before(mls, hdr, local_marker); + multilist_sublist_unlock(mls); + + /* + * If this header has b_rabd, we can use this since it + * must always match the data exactly as it exists on + * disk. 
Otherwise, the L2ARC can normally use the + * hdr's data, but if we're sharing data between the + * hdr and one of its bufs, L2ARC needs its own copy of + * the data so that the ZIO below can't race with the + * buf consumer. To ensure that this copy will be + * available for the lifetime of the ZIO and be cleaned + * up afterwards, we add it to the l2arc_free_on_write + * queue. If we need to apply any transforms to the + * data (compression, encryption) we will also need the + * extra buffer. + */ + if (HDR_HAS_RABD(hdr) && psize == asize) { + to_write = hdr->b_crypt_hdr.b_rabd; + } else if ((HDR_COMPRESSION_ENABLED(hdr) || + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && + !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && + psize == asize) { + to_write = hdr->b_l1hdr.b_pabd; + } else { + int ret; + arc_buf_contents_t type = arc_buf_type(hdr); + + ret = l2arc_apply_transforms(spa, hdr, asize, + &to_write); + if (ret != 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); + mutex_exit(hash_lock); + goto next; + } + + l2arc_free_abd_on_write(to_write, asize, type, dev); + } + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_hits = 0; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; + /* l2arc_hdr_arcstats_update() expects a valid asize */ + HDR_SET_L2SIZE(hdr, asize); + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + mutex_enter(&dev->l2ad_mtx); + if (*pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(&dev->l2ad_buflist, head); + } + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + + boolean_t commit = l2arc_log_blk_insert(dev, hdr); + mutex_exit(hash_lock); + + if (*pio == NULL) { + *cb = kmem_alloc(sizeof (l2arc_write_callback_t), + KM_SLEEP); + (*cb)->l2wcb_dev = dev; + (*cb)->l2wcb_head = head; + list_create(&(*cb)->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); + *pio = zio_root(spa, l2arc_write_done, *cb, + ZIO_FLAG_CANFAIL); + } + + zio_t *wzio = zio_write_phys(*pio, dev->l2ad_vdev, + dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, + NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + zio_nowait(wzio); + + *write_psize += psize; + *write_asize += asize; + dev->l2ad_hand += asize; + + if (commit) { + /* l2ad_hand will be adjusted inside. */ + *write_asize += l2arc_log_blk_commit(dev, *pio, *cb); + } + +next: + multilist_sublist_lock(mls); + if (scan_from_head) + hdr = multilist_sublist_next(mls, local_marker); + else + hdr = multilist_sublist_prev(mls, local_marker); + multilist_sublist_remove(mls, local_marker); + } + + /* + * Position persistent marker for next iteration. In case of + * save_position, validate that prev_hdr still belongs to the current + * sublist. The sublist lock is dropped during L2ARC write I/O, allowing + * ARC eviction to potentially free prev_hdr. If freed, we can't do much + * except to reset the marker. 
+ */ + multilist_sublist_remove(mls, persistent_marker); + if (save_position && + multilist_link_active(&prev_hdr->b_l1hdr.b_arc_node)) { + multilist_sublist_insert_before(mls, prev_hdr, + persistent_marker); + } else { + multilist_sublist_insert_tail(mls, persistent_marker); + } + + multilist_sublist_unlock(mls); + + arc_state_free_marker(local_marker); + + return (full); +} + static void l2arc_blk_fetch_done(zio_t *zio) { @@ -9468,6 +9829,46 @@ l2arc_blk_fetch_done(zio_t *zio) kmem_free(cb, sizeof (l2arc_read_callback_t)); } +/* + * Reset all L2ARC markers to tail position for the given spa. + */ +static void +l2arc_reset_all_markers(spa_t *spa) +{ + ASSERT(spa->spa_l2arc_info.l2arc_markers != NULL); + ASSERT(MUTEX_HELD(&spa->spa_l2arc_info.l2arc_sublist_lock)); + + for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { + if (spa->spa_l2arc_info.l2arc_markers[pass] == NULL) + continue; + + multilist_t *ml = l2arc_get_list(pass); + int num_sublists = multilist_get_num_sublists(ml); + + for (int i = 0; i < num_sublists; i++) { + ASSERT3P(spa->spa_l2arc_info.l2arc_markers[pass][i], + !=, NULL); + multilist_sublist_t *mls = + multilist_sublist_lock_idx(ml, i); + + /* Remove from current position */ + ASSERT(multilist_link_active(&spa->spa_l2arc_info. + l2arc_markers[pass][i]->b_l1hdr.b_arc_node)); + multilist_sublist_remove(mls, spa->spa_l2arc_info. + l2arc_markers[pass][i]); + + /* Insert at tail (like initialization) */ + multilist_sublist_insert_tail(mls, + spa->spa_l2arc_info.l2arc_markers[pass][i]); + + multilist_sublist_unlock(mls); + } + } + + /* Reset write counter */ + spa->spa_l2arc_info.l2arc_total_writes = 0; +} + /* * Find and write ARC buffers to the L2ARC device. * @@ -9483,12 +9884,11 @@ l2arc_blk_fetch_done(zio_t *zio) static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *head, *marker; + arc_buf_hdr_t *head; uint64_t write_asize, write_psize, headroom; - boolean_t full, from_head = !arc_warm; + boolean_t full; l2arc_write_callback_t *cb = NULL; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); + zio_t *pio; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -9498,7 +9898,27 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); - marker = arc_state_alloc_marker(); + + /* + * Determine L2ARC implementation based on total pool L2ARC capacity + * vs ARC size. Use persistent markers for pools with significant + * L2ARC investment, otherwise use simple HEAD/TAIL scanning. + */ + boolean_t save_position = + (spa->spa_l2arc_info.l2arc_total_capacity >= + L2ARC_PERSIST_THRESHOLD); + + /* + * Check if markers need reset based on smallest device threshold. + * Reset when cumulative writes exceed 1/8th of smallest device. + * Must be protected since multiple device threads may check/update. + */ + mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock); + if (save_position && spa->spa_l2arc_info.l2arc_total_writes >= + spa->spa_l2arc_info.l2arc_smallest_capacity / 8) { + l2arc_reset_all_markers(spa); + } + mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock); /* * Copy buffers for L2ARC writing. 
@@ -9518,202 +9938,73 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - uint64_t passed_sz = 0; headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; - /* - * Until the ARC is warm and starts to evict, read from the - * head of the ARC lists rather than the tail. - */ - multilist_sublist_t *mls = l2arc_sublist_lock(pass); - ASSERT3P(mls, !=, NULL); - if (from_head) - hdr = multilist_sublist_head(mls); - else - hdr = multilist_sublist_tail(mls); - - while (hdr != NULL) { - kmutex_t *hash_lock; - abd_t *to_write = NULL; + multilist_t *ml = l2arc_get_list(pass); + ASSERT3P(ml, !=, NULL); + int num_sublists = multilist_get_num_sublists(ml); + int current_sublist = multilist_get_random_index(ml); + uint64_t consumed_headroom = 0; - hash_lock = HDR_LOCK(hdr); - if (!mutex_tryenter(hash_lock)) { -skip: - /* Skip this buffer rather than waiting. */ - if (from_head) - hdr = multilist_sublist_next(mls, hdr); - else - hdr = multilist_sublist_prev(mls, hdr); - continue; - } + int processed_sublists = 0; + while (processed_sublists < num_sublists && !full) { + uint64_t sublist_headroom; - passed_sz += HDR_GET_LSIZE(hdr); - if (l2arc_headroom != 0 && passed_sz > headroom) { - /* - * Searched too far. - */ - mutex_exit(hash_lock); + if (consumed_headroom >= headroom) break; - } - - if (!l2arc_write_eligible(guid, hdr)) { - mutex_exit(hash_lock); - goto skip; - } - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3U(arc_hdr_size(hdr), >, 0); - ASSERT(hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); - uint64_t psize = HDR_GET_PSIZE(hdr); - uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, - psize); + sublist_headroom = (headroom - consumed_headroom) / + (num_sublists - processed_sublists); - /* - * If the allocated size of this buffer plus the max - * size for the pending log block exceeds the evicted - * target size, terminate writing buffers for this run. - */ - if (write_asize + asize + - sizeof (l2arc_log_blk_phys_t) > target_sz) { - full = B_TRUE; - mutex_exit(hash_lock); + if (sublist_headroom == 0) break; - } /* - * We should not sleep with sublist lock held or it - * may block ARC eviction. Insert a marker to save - * the position and drop the lock. + * Check if sublist is busy (being processed by another + * L2ARC device thread). If so, skip to next sublist. */ - if (from_head) { - multilist_sublist_insert_after(mls, hdr, - marker); - } else { - multilist_sublist_insert_before(mls, hdr, - marker); + mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock); + if (spa->spa_l2arc_info.l2arc_sublist_busy[pass] + [current_sublist]) { + mutex_exit(&spa->spa_l2arc_info. + l2arc_sublist_lock); + current_sublist = (current_sublist + 1) % + num_sublists; + processed_sublists++; + continue; } - multilist_sublist_unlock(mls); + /* Mark sublist as busy */ + spa->spa_l2arc_info.l2arc_sublist_busy[pass] + [current_sublist] = B_TRUE; + mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock); /* - * If this header has b_rabd, we can use this since it - * must always match the data exactly as it exists on - * disk. Otherwise, the L2ARC can normally use the - * hdr's data, but if we're sharing data between the - * hdr and one of its bufs, L2ARC needs its own copy of - * the data so that the ZIO below can't race with the - * buf consumer. 
To ensure that this copy will be - * available for the lifetime of the ZIO and be cleaned - * up afterwards, we add it to the l2arc_free_on_write - * queue. If we need to apply any transforms to the - * data (compression, encryption) we will also need the - * extra buffer. + * Write buffers from this sublist to L2ARC. + * Function handles locking, marker management, and + * buffer processing internally. */ - if (HDR_HAS_RABD(hdr) && psize == asize) { - to_write = hdr->b_crypt_hdr.b_rabd; - } else if ((HDR_COMPRESSION_ENABLED(hdr) || - HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && - !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && - psize == asize) { - to_write = hdr->b_l1hdr.b_pabd; - } else { - int ret; - arc_buf_contents_t type = arc_buf_type(hdr); - - ret = l2arc_apply_transforms(spa, hdr, asize, - &to_write); - if (ret != 0) { - arc_hdr_clear_flags(hdr, - ARC_FLAG_L2CACHE); - mutex_exit(hash_lock); - goto next; - } - - l2arc_free_abd_on_write(to_write, asize, type); - } - - hdr->b_l2hdr.b_dev = dev; - hdr->b_l2hdr.b_daddr = dev->l2ad_hand; - hdr->b_l2hdr.b_hits = 0; - hdr->b_l2hdr.b_arcs_state = - hdr->b_l1hdr.b_state->arcs_state; - /* l2arc_hdr_arcstats_update() expects a valid asize */ - HDR_SET_L2SIZE(hdr, asize); - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | - ARC_FLAG_L2_WRITING); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - l2arc_hdr_arcstats_increment(hdr); - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - - mutex_enter(&dev->l2ad_mtx); - if (pio == NULL) { - /* - * Insert a dummy header on the buflist so - * l2arc_write_done() can find where the - * write buffers begin without searching. - */ - list_insert_head(&dev->l2ad_buflist, head); - } - list_insert_head(&dev->l2ad_buflist, hdr); - mutex_exit(&dev->l2ad_mtx); - - boolean_t commit = l2arc_log_blk_insert(dev, hdr); - mutex_exit(hash_lock); - - if (pio == NULL) { - cb = kmem_alloc( - sizeof (l2arc_write_callback_t), KM_SLEEP); - cb->l2wcb_dev = dev; - cb->l2wcb_head = head; - list_create(&cb->l2wcb_abd_list, - sizeof (l2arc_lb_abd_buf_t), - offsetof(l2arc_lb_abd_buf_t, node)); - pio = zio_root(spa, l2arc_write_done, cb, - ZIO_FLAG_CANFAIL); - } - - wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, asize, to_write, - ZIO_CHECKSUM_OFF, NULL, hdr, - ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_CANFAIL, B_FALSE); - - DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, - zio_t *, wzio); - zio_nowait(wzio); - - write_psize += psize; - write_asize += asize; - dev->l2ad_hand += asize; - - if (commit) { - /* l2ad_hand will be adjusted inside. */ - write_asize += - l2arc_log_blk_commit(dev, pio, cb); - } - -next: - multilist_sublist_lock(mls); - if (from_head) - hdr = multilist_sublist_next(mls, marker); - else - hdr = multilist_sublist_prev(mls, marker); - multilist_sublist_remove(mls, marker); + full = l2arc_write_sublist(spa, dev, pass, + current_sublist, target_sz, &write_asize, + &write_psize, &pio, &cb, head, + &consumed_headroom, sublist_headroom, + save_position); + + /* Clear busy flag for this sublist */ + mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock); + spa->spa_l2arc_info.l2arc_sublist_busy[pass] + [current_sublist] = B_FALSE; + mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock); + + current_sublist = (current_sublist + 1) % num_sublists; + processed_sublists++; } - multilist_sublist_unlock(mls); - if (full == B_TRUE) break; } - arc_state_free_marker(marker); - /* No buffers selected for writing? 
*/ if (pio == NULL) { ASSERT0(write_psize); @@ -9741,6 +10032,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; + /* + * Update cumulative write tracking for marker reset logic. + * Protected for multi-device thread access. + */ + mutex_enter(&spa->spa_l2arc_info.l2arc_sublist_lock); + spa->spa_l2arc_info.l2arc_total_writes += write_asize; + mutex_exit(&spa->spa_l2arc_info.l2arc_sublist_lock); + + /* Track writes for DWPD rate limiting */ + dev->l2ad_dwpd_writes += write_asize; + /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block @@ -9761,58 +10063,56 @@ l2arc_hdr_limit_reached(void) } /* - * This thread feeds the L2ARC at regular intervals. This is the beating - * heart of the L2ARC. + * Per-device L2ARC feed thread. Each L2ARC device has its own thread + * to allow parallel writes to multiple devices. */ static __attribute__((noreturn)) void -l2arc_feed_thread(void *unused) +l2arc_feed_thread(void *arg) { - (void) unused; + l2arc_dev_t *dev = arg; callb_cpr_t cpr; - l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; - CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + ASSERT3P(dev, !=, NULL); + + CALLB_CPR_INIT(&cpr, &dev->l2ad_feed_thr_lock, callb_generic_cpr, FTAG); - mutex_enter(&l2arc_feed_thr_lock); + mutex_enter(&dev->l2ad_feed_thr_lock); cookie = spl_fstrans_mark(); - while (l2arc_thread_exit == 0) { + while (dev->l2ad_thread_exit == B_FALSE) { CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_idle(&l2arc_feed_thr_cv, - &l2arc_feed_thr_lock, next); - CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + (void) cv_timedwait_idle(&dev->l2ad_feed_cv, + &dev->l2ad_feed_thr_lock, next); + CALLB_CPR_SAFE_END(&cpr, &dev->l2ad_feed_thr_lock); next = ddi_get_lbolt() + hz; /* - * Quick check for L2ARC devices. + * Check if thread should exit. */ - mutex_enter(&l2arc_dev_mtx); - if (l2arc_ndev == 0) { - mutex_exit(&l2arc_dev_mtx); - continue; - } - mutex_exit(&l2arc_dev_mtx); - begin = ddi_get_lbolt(); + if (dev->l2ad_thread_exit) + break; /* - * This selects the next l2arc device to write to, and in - * doing so the next spa to feed from: dev->l2ad_spa. This - * will return NULL if there are now no l2arc devices or if - * they are all faulted. - * - * If a device is returned, its spa's config lock is also - * held to prevent device removal. l2arc_dev_get_next() - * will grab and release l2arc_dev_mtx. + * Check if device is still valid. If not, thread should exit. */ - if ((dev = l2arc_dev_get_next()) == NULL) - continue; + if (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev)) + break; + begin = ddi_get_lbolt(); + /* + * Try to acquire the spa config lock. If we can't get it, + * skip this iteration as removal might be in progress. + * The feed thread will exit naturally when it wakes up and + * sees l2ad_thread_exit is set. + */ spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); + if (!spa_config_tryenter(spa, SCL_L2ARC, dev, RW_READER)) + continue; /* * If the pool is read-only then force the feed thread to @@ -9835,29 +10135,61 @@ l2arc_feed_thread(void *unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(dev); + /* + * Check if using adaptive intervals (persistent markers). 
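+		 * Smaller configurations keep using the legacy interval logic
+		 * below, based on how much was written versus requested.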
+ */ + boolean_t use_adaptive_interval = + (spa->spa_l2arc_info.l2arc_total_capacity >= + L2ARC_PERSIST_THRESHOLD); + + clock_t interval; + boolean_t was_first = dev->l2ad_first; + size = l2arc_write_size(dev, &interval); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); + /* + * If first pass just completed during evict, the write size + * was calculated without DWPD limiting. Recalculate now that + * DWPD is active to avoid writing with unlimited budget. + * Only applies to large devices where DWPD is active. + */ + if (was_first && !dev->l2ad_first && l2arc_dwpd_limit > 0 && + spa->spa_l2arc_info.l2arc_total_capacity >= + L2ARC_PERSIST_THRESHOLD) { + size = l2arc_write_size(dev, &interval); + } + /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* - * Calculate interval between writes. + * If smaller device, use legacy approach based on data written */ - next = l2arc_write_interval(begin, size, wrote); + if (!use_adaptive_interval) { + if (l2arc_feed_again && wrote > (size / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + } + + /* + * Calculate next feed time. + */ + clock_t now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, begin + interval)); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); - l2arc_thread_exit = 0; - cv_broadcast(&l2arc_feed_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + dev->l2ad_feed_thread = NULL; + cv_broadcast(&dev->l2ad_feed_cv); + CALLB_CPR_EXIT(&cpr); /* drops dev->l2ad_feed_thr_lock */ thread_exit(); } @@ -9967,6 +10299,30 @@ l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) } } + +/* + * Recalculate smallest L2ARC device capacity for the given spa. + * Must be called under l2arc_dev_mtx. + */ +static void +l2arc_update_smallest_capacity(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&l2arc_dev_mtx)); + l2arc_dev_t *dev; + uint64_t smallest = UINT64_MAX; + + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_spa == spa) { + uint64_t cap = dev->l2ad_end - dev->l2ad_start; + if (cap < smallest) + smallest = cap; + } + } + + spa->spa_l2arc_info.l2arc_smallest_capacity = smallest; +} + /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. 
@@ -9996,6 +10352,9 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; + adddev->l2ad_dwpd_writes = 0; + adddev->l2ad_dwpd_start = gethrestime_sec(); + adddev->l2ad_dwpd_accumulated = 0; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); @@ -10016,6 +10375,14 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + + /* + * Initialize per-device thread fields + */ + adddev->l2ad_thread_exit = B_FALSE; + mutex_init(&adddev->l2ad_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&adddev->l2ad_feed_cv, NULL, CV_DEFAULT, NULL); + zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); @@ -10032,8 +10399,37 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) * Add device to global list */ mutex_enter(&l2arc_dev_mtx); + + /* + * Initialize pool-based position saving markers if this is the first + * L2ARC device for this pool + */ + if (!l2arc_pool_has_devices(spa)) { + l2arc_pool_markers_init(spa); + } + list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); + spa->spa_l2arc_info.l2arc_total_capacity += (adddev->l2ad_end - + adddev->l2ad_start); + l2arc_update_smallest_capacity(spa); + + /* + * Create per-device feed thread only if spa is writable. + * The thread name includes the spa name and device number + * for easy identification. + */ + if (spa_writeable(spa)) { + char thread_name[MAXNAMELEN]; + snprintf(thread_name, sizeof (thread_name), "l2arc_%s_%llu", + spa_name(spa), (u_longlong_t)vd->vdev_id); + adddev->l2ad_feed_thread = thread_create_named(thread_name, + NULL, 0, l2arc_feed_thread, adddev, 0, &p0, TS_RUN, + minclsyspri); + } else { + adddev->l2ad_feed_thread = NULL; + } + mutex_exit(&l2arc_dev_mtx); } @@ -10090,6 +10486,8 @@ l2arc_device_teardown(void *arg) ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); + mutex_destroy(&remdev->l2ad_feed_thr_lock); + cv_destroy(&remdev->l2ad_feed_cv); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); @@ -10144,6 +10542,21 @@ l2arc_remove_vdev(vdev_t *vd) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); + + /* + * Signal per-device feed thread to exit and wait for it. + * Thread only exists if pool was imported read-write. 
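+	 * The exiting thread clears l2ad_feed_thread and broadcasts
+	 * l2ad_feed_cv, terminating the wait below.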
+ */
+	if (remdev->l2ad_feed_thread != NULL) {
+		mutex_enter(&remdev->l2ad_feed_thr_lock);
+		remdev->l2ad_thread_exit = B_TRUE;
+		cv_signal(&remdev->l2ad_feed_cv);
+		while (remdev->l2ad_feed_thread != NULL)
+			cv_wait(&remdev->l2ad_feed_cv,
+			    &remdev->l2ad_feed_thr_lock);
+		mutex_exit(&remdev->l2ad_feed_thr_lock);
+	}
+
 	rva->rva_async = asynchronous;
 
 	/*
@@ -10152,8 +10565,18 @@ l2arc_remove_vdev(vdev_t *vd)
 	ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC);
 	mutex_enter(&l2arc_dev_mtx);
 	list_remove(l2arc_dev_list, remdev);
-	l2arc_dev_last = NULL;	/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
+	spa->spa_l2arc_info.l2arc_total_capacity -=
+	    (remdev->l2ad_end - remdev->l2ad_start);
+	l2arc_update_smallest_capacity(spa);
+
+	/*
+	 * Clean up pool-based markers if this was the last L2ARC device
+	 * for this pool.
+	 */
+	if (!l2arc_pool_has_devices(spa)) {
+		l2arc_pool_markers_fini(spa);
+	}
 
 	/* During a pool export spa & vdev will no longer be valid */
 	if (asynchronous) {
@@ -10176,11 +10599,8 @@ l2arc_remove_vdev(vdev_t *vd)
 void
 l2arc_init(void)
 {
-	l2arc_thread_exit = 0;
 	l2arc_ndev = 0;
 
-	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -10197,8 +10617,6 @@ l2arc_init(void)
 void
 l2arc_fini(void)
 {
-	mutex_destroy(&l2arc_feed_thr_lock);
-	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_rebuild_thr_lock);
 	cv_destroy(&l2arc_rebuild_thr_cv);
 	mutex_destroy(&l2arc_dev_mtx);
@@ -10208,29 +10626,6 @@ l2arc_fini(void)
 	list_destroy(l2arc_free_on_write);
 }
 
-void
-l2arc_start(void)
-{
-	if (!(spa_mode_global & SPA_MODE_WRITE))
-		return;
-
-	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
-	    TS_RUN, defclsyspri);
-}
-
-void
-l2arc_stop(void)
-{
-	if (!(spa_mode_global & SPA_MODE_WRITE))
-		return;
-
-	mutex_enter(&l2arc_feed_thr_lock);
-	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
-	l2arc_thread_exit = 1;
-	while (l2arc_thread_exit != 0)
-		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
-	mutex_exit(&l2arc_feed_thr_lock);
-}
 
 /*
  * Punches out rebuild threads for the L2ARC devices in a spa. This should
@@ -11236,8 +11631,8 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
 	"Max write bytes per interval");
 
-ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
-	"Extra write bytes during device warmup");
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, dwpd_limit, U64, ZMOD_RW,
+	"L2ARC device endurance limit as percentage (100 = 1.0 DWPD)");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
 	"Number of max device writes to precache");
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index aee170e9ea51..bbcbaafaad34 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2668,7 +2668,6 @@ spa_init(spa_mode_t mode)
 	zpool_prop_init();
 	zpool_feature_init();
 	vdev_prop_init();
-	l2arc_start();
 	scan_init();
 	qat_init();
 	spa_import_progress_init();
@@ -2678,8 +2677,6 @@ spa_init(spa_mode_t mode)
 void
 spa_fini(void)
 {
-	l2arc_stop();
-
 	spa_evict_all();
 
 	vdev_file_fini();
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 9b4c4154c056..e217a8cae915 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -1128,7 +1128,8 @@ tags = ['functional', 'log_spacemap']
 
 [tests/functional/l2arc]
 tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos',
-    'persist_l2arc_001_pos', 'persist_l2arc_002_pos',
+    'l2arc_dwpd_ratelimit_pos', 'l2arc_dwpd_reimport_pos', 'l2arc_multidev_scaling_pos',
+    'l2arc_multidev_throughput_pos', 'persist_l2arc_001_pos', 'persist_l2arc_002_pos',
     'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos']
 tags = ['functional', 'l2arc']
 
diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg
index 54b50c9dba77..4886a821879a 100644
--- a/tests/zfs-tests/include/tunables.cfg
+++ b/tests/zfs-tests/include/tunables.cfg
@@ -46,6 +46,7 @@ INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size
 INITIALIZE_VALUE initialize_value zfs_initialize_value
 KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export
 LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit
+L2ARC_DWPD_LIMIT l2arc.dwpd_limit l2arc_dwpd_limit
 L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly
 L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch
 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index a4b3e7376093..d9ba6c5b85e2 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -1657,6 +1657,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/l2arc/l2arc_arcstats_pos.ksh \
 	functional/l2arc/l2arc_l2miss_pos.ksh \
 	functional/l2arc/l2arc_mfuonly_pos.ksh \
+	functional/l2arc/l2arc_dwpd_ratelimit_pos.ksh \
+	functional/l2arc/l2arc_dwpd_reimport_pos.ksh \
+	functional/l2arc/l2arc_multidev_scaling_pos.ksh \
+	functional/l2arc/l2arc_multidev_throughput_pos.ksh \
 	functional/l2arc/persist_l2arc_001_pos.ksh \
 	functional/l2arc/persist_l2arc_002_pos.ksh \
 	functional/l2arc/persist_l2arc_003_neg.ksh \
diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh
index e8e9ed5ddc02..877cbe5edbb0 100755
--- a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh
@@ -55,12 +55,15 @@ function cleanup
 	log_must set_tunable32 L2ARC_WRITE_MAX $write_max
 	log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $dwpd_limit
 }
 
 log_onexit cleanup
 
 typeset write_max=$(get_tunable L2ARC_WRITE_MAX)
 typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset dwpd_limit=$(get_tunable L2ARC_DWPD_LIMIT)
 
 log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_DWPD_LIMIT 0
 typeset VDEV="$VDIR/vdev.disk"
 typeset VDEV_SZ=$(( 4 * 1024 * 1024 * 1024 ))
diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_dwpd_ratelimit_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_dwpd_ratelimit_pos.ksh
new file mode 100755
index 000000000000..ab2eca25a77a
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_dwpd_ratelimit_pos.ksh
@@ -0,0 +1,124 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/l2arc/l2arc.cfg
+
+#
+# DESCRIPTION:
+#	L2ARC DWPD rate limiting correctly limits write rate.
+#
+# STRATEGY:
+#	1. Set DWPD limit before creating pool.
+#	2. Create pool with cache device (arc_max = 1.8 * cache_size).
+#	3. Fill L2ARC to complete first pass.
+#	4. Measure writes over test period.
+#	5. Repeat 1-4 for DWPD values 0, 100, 1000, 10000.
+#	6. Verify DWPD=0 > DWPD=10000 > DWPD=1000 > DWPD=100.
+#
+
+verify_runnable "global"
+
+log_assert "L2ARC DWPD rate limiting correctly limits write rate."
+
+function cleanup
+{
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	log_must set_tunable32 L2ARC_WRITE_MAX $write_max
+	log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $dwpd_limit
+	log_must set_tunable64 ARC_MIN $arc_min
+	log_must set_tunable64 ARC_MAX $arc_max
+}
+log_onexit cleanup
+
+# Save original tunables
+typeset write_max=$(get_tunable L2ARC_WRITE_MAX)
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset dwpd_limit=$(get_tunable L2ARC_DWPD_LIMIT)
+typeset arc_min=$(get_tunable ARC_MIN)
+typeset arc_max=$(get_tunable ARC_MAX)
+
+# Test parameters
+typeset cache_sz=900
+typeset fill_mb=1200
+typeset test_time=15
+
+# Configure arc_max = 1.8 * cache_size for continuous L2ARC feed
+log_must set_tunable64 ARC_MIN $((cache_sz * 8 / 10 * 1024 * 1024))
+log_must set_tunable64 ARC_MAX $((cache_sz * 18 / 10 * 1024 * 1024))
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_WRITE_MAX $((200 * 1024 * 1024))
+
+# Create larger main vdev to accommodate fill data
+log_must truncate -s 5G $VDEV
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+typeset -A results
+
+# Test each DWPD value with a fresh pool; measure only after its first-pass fill
+for dwpd in 0 10000 1000 100; do
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $dwpd
+
+	if poolexists $TESTPOOL; then
+		destroy_pool $TESTPOOL
+	fi
+	log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+	# Fill first pass and wait for L2ARC writes to stabilize
+	log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=$fill_mb
+	log_must sleep 10
+
+	# Take baseline after first pass completes
+	baseline=$(kstat arcstats.l2_write_bytes)
+	log_note "Baseline for DWPD=$dwpd: ${baseline}"
+
+	# Generate continuous workload to measure DWPD-limited L2ARC writes
+	# Write 2GB to ensure continuous L2ARC feed pressure throughout measurement
+	dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=2000 >/dev/null 2>&1 &
+	dd_pid=$!
+	log_must sleep $test_time
+	kill $dd_pid 2>/dev/null
+	wait $dd_pid 2>/dev/null
+	log_must sleep 2
+	end=$(kstat arcstats.l2_write_bytes)
+
+	results[$dwpd]=$((end - baseline))
+	log_note "DWPD=$dwpd: delta=$((results[$dwpd] / 1024))KB"
+done
+
+# Verify ordering: higher DWPD = more writes, 0 = unlimited
+if [[ ${results[0]} -le ${results[10000]} ]]; then
+	log_fail "DWPD=0 (unlimited) should write more than DWPD=10000"
+fi
+if [[ ${results[10000]} -le ${results[1000]} ]]; then
+	log_fail "DWPD=10000 should write more than DWPD=1000"
+fi
+if [[ ${results[1000]} -le ${results[100]} ]]; then
+	log_fail "DWPD=1000 should write more than DWPD=100"
+fi
+
+log_must zpool destroy $TESTPOOL
+
+log_pass "L2ARC DWPD rate limiting correctly limits write rate."
diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_dwpd_reimport_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_dwpd_reimport_pos.ksh
new file mode 100755
index 000000000000..4075168f92d4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_dwpd_reimport_pos.ksh
@@ -0,0 +1,161 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/l2arc/l2arc.cfg
+
+#
+# DESCRIPTION:
+#	L2ARC DWPD rate limiting works after pool export/import.
+#
+# STRATEGY:
+#	1. Set DWPD limit before creating pool.
+#	2. Create pool with 900MB cache device.
+#	3. Fill L2ARC and wait for first pass to complete.
+#	4. Measure DWPD-limited writes with continuous workload.
+#	5. Export and import pool.
+#	6. Wait for rebuild, then fill L2ARC and wait for first pass to complete.
+#	7. Measure DWPD-limited writes again.
+#	8. Verify rate limiting still works after import (non-zero writes).
+#
+
+verify_runnable "global"
+
+log_assert "L2ARC DWPD rate limiting works after pool export/import."
+
+function cleanup
+{
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	log_must set_tunable32 L2ARC_WRITE_MAX $write_max
+	log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $dwpd_limit
+	log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE $rebuild_min
+	log_must set_tunable64 ARC_MIN $arc_min
+	log_must set_tunable64 ARC_MAX $arc_max
+}
+log_onexit cleanup
+
+# Save original tunables
+typeset write_max=$(get_tunable L2ARC_WRITE_MAX)
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset dwpd_limit=$(get_tunable L2ARC_DWPD_LIMIT)
+typeset rebuild_min=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE)
+typeset arc_min=$(get_tunable ARC_MIN)
+typeset arc_max=$(get_tunable ARC_MAX)
+
+# Test parameters
+typeset cache_sz=900
+typeset fill_mb=900
+typeset test_time=15
+
+# Set DWPD before pool creation (10000 = 100 DWPD)
+log_must set_tunable32 L2ARC_DWPD_LIMIT 10000
+log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0
+
+# Configure arc_max = 1.8 * cache_size for continuous L2ARC feed
+log_must set_tunable64 ARC_MIN $((cache_sz * 8 / 10 * 1024 * 1024))
+log_must set_tunable64 ARC_MAX $((cache_sz * 18 / 10 * 1024 * 1024))
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+log_must set_tunable32 L2ARC_WRITE_MAX $((200 * 1024 * 1024))
+
+# Create larger main vdev to accommodate fill data
+log_must truncate -s 5G $VDEV
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+# Fill first pass and wait for L2ARC writes to complete
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=$fill_mb
+arcstat_quiescence_noecho l2_size
+
+# Verify L2ARC is populated before export
+typeset l2_size_before=$(kstat arcstats.l2_size)
+log_note "L2ARC size before export: $((l2_size_before / 1024 / 1024))MB"
+if [[ $l2_size_before -eq 0 ]]; then
+	log_fail "L2ARC not populated before export"
+fi
+
+# Measure DWPD-limited writes before export
+baseline1=$(kstat arcstats.l2_write_bytes)
+log_note "Baseline before export: ${baseline1}"
+dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=2000 >/dev/null 2>&1 &
+dd_pid=$!
+log_must sleep $test_time
+kill $dd_pid 2>/dev/null
+wait $dd_pid 2>/dev/null
+log_must sleep 2
+end1=$(kstat arcstats.l2_write_bytes)
+typeset writes_before=$((end1 - baseline1))
+
+log_note "Writes before export: $((writes_before / 1024))KB"
+
+# Verify L2ARC actually wrote data
+if [[ $writes_before -eq 0 ]]; then
+	log_fail "No L2ARC writes before export - DWPD may be too restrictive"
+fi
+
+# Export and import pool
+log_must zpool export $TESTPOOL
+log_must zpool import -d $VDIR $TESTPOOL
+
+# Wait for rebuild to complete
+log_must sleep 5
+
+# Verify L2ARC is populated after import
+typeset l2_size_after=$(kstat arcstats.l2_size)
+log_note "L2ARC size after import: $((l2_size_after / 1024 / 1024))MB"
+if [[ $l2_size_after -eq 0 ]]; then
+	log_fail "L2ARC not populated after import"
+fi
+
+# Fill first pass again after import and wait for L2ARC writes to complete
+log_must dd if=/dev/urandom of=/$TESTPOOL/file3 bs=1M count=$fill_mb
+log_must sleep 10
+
+# Verify L2ARC is still populated after refill
+l2_size=$(kstat arcstats.l2_size)
+log_note "L2ARC size after refill: $((l2_size / 1024 / 1024))MB"
+
+# Measure DWPD-limited writes after import
+baseline2=$(kstat arcstats.l2_write_bytes)
+log_note "Baseline after import: ${baseline2}"
+dd if=/dev/urandom of=/$TESTPOOL/file4 bs=1M count=2000 >/dev/null 2>&1 &
+dd_pid=$!
+log_must sleep $test_time
+kill $dd_pid 2>/dev/null
+wait $dd_pid 2>/dev/null
+log_must sleep 2
+end2=$(kstat arcstats.l2_write_bytes)
+typeset writes_after=$((end2 - baseline2))
+
+log_note "Writes after import: $((writes_after / 1024))KB"
+
+# Verify rate limiting persists after import
+if [[ $writes_after -eq 0 ]]; then
+	log_fail "No writes after import - rate limiting may be broken"
+fi
+
+log_must zpool destroy $TESTPOOL
+
+log_pass "L2ARC DWPD rate limiting works after pool export/import."
diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_multidev_scaling_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_multidev_scaling_pos.ksh
new file mode 100755
index 000000000000..c6bb33ede0b3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_multidev_scaling_pos.ksh
@@ -0,0 +1,131 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/l2arc/l2arc.cfg
+
+#
+# DESCRIPTION:
+#	L2ARC parallel writes scale with number of cache devices.
+#
+# STRATEGY:
+#	1. Configure L2ARC write rate to 16MB/s per device.
+#	2. Disable DWPD rate limiting to test pure parallel throughput.
+#	3. Create pool with single 900MB cache device.
+#	4. Generate continuous writes and measure L2ARC throughput over 25s.
+#	5. Recreate pool with dual 900MB cache devices (1800MB total).
+#	6. Generate continuous writes and measure L2ARC throughput over 25s.
+#	7. Verify dual-device throughput is ~2x single-device throughput,
+#	   demonstrating that per-device feed threads enable parallel writes.
+#	   Expected: single ~400MB (16MB/s), dual ~800MB (2×16MB/s).
+#
+
+verify_runnable "global"
+
+log_assert "L2ARC parallel writes scale with number of cache devices."
+
+function cleanup
+{
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	log_must set_tunable32 L2ARC_WRITE_MAX $write_max
+	log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $dwpd_limit
+	log_must set_tunable64 ARC_MIN $arc_min
+	log_must set_tunable64 ARC_MAX $arc_max
+}
+log_onexit cleanup
+
+# Save original tunables
+typeset write_max=$(get_tunable L2ARC_WRITE_MAX)
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset dwpd_limit=$(get_tunable L2ARC_DWPD_LIMIT)
+typeset arc_min=$(get_tunable ARC_MIN)
+typeset arc_max=$(get_tunable ARC_MAX)
+
+# Test parameters
+typeset cache_sz=900		# 900MB per device
+typeset fill_mb=2500		# 2.5GB initial data
+typeset test_time=25		# Measurement window: 16MB/s × 25s = 400MB per device
+typeset VDEV_CACHE2="$VDIR/cache2"
+
+# Disable DWPD to test pure parallel throughput
+log_must set_tunable32 L2ARC_DWPD_LIMIT 0
+
+# Set L2ARC_WRITE_MAX to 16MB/s to test parallel scaling
+log_must set_tunable32 L2ARC_WRITE_MAX $((16 * 1024 * 1024))
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+# Configure arc_max so persist threshold (arc_max/2) is below device size
+# persist_threshold = 1024MB/2 = 512MB, device usable ~896MB > 512MB
+log_must set_tunable64 ARC_MIN $((512 * 1024 * 1024))
+log_must set_tunable64 ARC_MAX $((1024 * 1024 * 1024))
+
+# Single device test: 1 × 900MB
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE
+
+# Measure single-device write throughput
+typeset start=$(kstat arcstats.l2_write_bytes)
+dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=$fill_mb &
+typeset dd_pid=$!
+log_must sleep $test_time
+typeset end=$(kstat arcstats.l2_write_bytes)
+kill $dd_pid 2>/dev/null
+wait $dd_pid 2>/dev/null
+typeset single_writes=$((end - start))
+
+log_note "Single-device writes: $((single_writes / 1024 / 1024))MB (target ~400MB at 16MB/s)"
+
+# Dual device test: 2 × 900MB = 1800MB total capacity
+log_must zpool destroy $TESTPOOL
+log_must truncate -s ${cache_sz}M $VDEV_CACHE
+log_must truncate -s ${cache_sz}M $VDEV_CACHE2
+
+log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE $VDEV_CACHE2
+
+# Measure parallel write throughput (2 feed threads active)
+start=$(kstat arcstats.l2_write_bytes)
+dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=$fill_mb &
+dd_pid=$!
+log_must sleep $test_time
+end=$(kstat arcstats.l2_write_bytes)
+kill $dd_pid 2>/dev/null
+wait $dd_pid 2>/dev/null
+typeset dual_writes=$((end - start))
+
+log_note "Dual-device writes: $((dual_writes / 1024 / 1024))MB (target ~800MB at 2×16MB/s)"
+
+# Verify parallel write scaling (dual should be ~2x single)
+# Actual values may be lower than target due to dd overhead, ARC pressure,
+# and feed thread scheduling, but ratio should show clear parallel benefit.
+# Require 1.5x minimum to pass.
+typeset min_ratio=$((single_writes * 3 / 2))
+if [[ $dual_writes -lt $min_ratio ]]; then
+	log_fail "Dual-device writes ($((dual_writes / 1024 / 1024))MB)" \
+	    "should be at least 1.5x single ($((single_writes / 1024 / 1024))MB)"
+fi
+
+log_must zpool destroy $TESTPOOL
+
+log_pass "L2ARC parallel writes scale with number of cache devices."
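The two multidev tests share one piece of arithmetic: with a feed thread per cache device, the expected aggregate L2ARC write volume over a measurement window is roughly the per-device rate (l2arc_write_max) multiplied by the device count and the window length, and each test then applies a tolerance (at least 1.5x the single-device figure above, at least 80% of the projection in the throughput test that follows). The ksh sketch below simply restates that calculation outside the test suite; the function name and the sample numbers are illustrative and not part of the patch.

#!/bin/ksh
# Illustrative sketch (not part of the patch): project the expected aggregate
# L2ARC write volume for a per-device rate, a device count, and a measurement
# window, then apply a tolerance the way the multidev tests do.

function expected_l2arc_bytes
{
	typeset rate_bps=$1	# per-device rate, e.g. $((16 * 1024 * 1024))
	typeset ndevs=$2	# cache devices in the pool
	typeset seconds=$3	# measurement window
	echo $((rate_bps * ndevs * seconds))
}

typeset rate=$((16 * 1024 * 1024))			# as in the scaling test above
typeset expected=$(expected_l2arc_bytes $rate 2 25)	# dual device, 25 s window
typeset measured=$((700 * 1024 * 1024))			# stand-in for a kstat delta

# The throughput test below accepts >= 80% of the projection.
typeset min_bytes=$((expected * 80 / 100))
if [[ $measured -ge $min_bytes ]]; then
	echo "ok: $((measured / 1024 / 1024))MB >= $((min_bytes / 1024 / 1024))MB"
else
	echo "below tolerance: $((measured / 1024 / 1024))MB < $((min_bytes / 1024 / 1024))MB"
fi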
diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc_multidev_throughput_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/l2arc_multidev_throughput_pos.ksh
new file mode 100755
index 000000000000..661a631105a1
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/l2arc/l2arc_multidev_throughput_pos.ksh
@@ -0,0 +1,110 @@
+#!/bin/ksh -p
+# SPDX-License-Identifier: CDDL-1.0
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/l2arc/l2arc.cfg
+
+#
+# DESCRIPTION:
+#	L2ARC parallel writes scale with number of cache devices.
+#
+# STRATEGY:
+#	1. Disable DWPD rate limiting.
+#	2. Create pool with 5 cache devices.
+#	3. Write data and measure L2ARC throughput.
+#	4. Verify throughput scales with device count (~32MB/s per device).
+#
+
+verify_runnable "global"
+
+log_assert "L2ARC parallel writes scale with number of cache devices."
+
+function cleanup
+{
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	log_must set_tunable32 L2ARC_WRITE_MAX $write_max
+	log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $dwpd_limit
+	log_must set_tunable64 ARC_MIN $arc_min
+	log_must set_tunable64 ARC_MAX $arc_max
+}
+log_onexit cleanup
+
+# Save original tunables
+typeset write_max=$(get_tunable L2ARC_WRITE_MAX)
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
+typeset dwpd_limit=$(get_tunable L2ARC_DWPD_LIMIT)
+typeset arc_min=$(get_tunable ARC_MIN)
+typeset arc_max=$(get_tunable ARC_MAX)
+
+# Test parameters
+typeset num_devs=5
+typeset cache_sz=200
+typeset test_time=5
+typeset expected_rate=$((32 * 1024 * 1024))	# 32 MB/s per device
+
+# Disable DWPD rate limiting
+log_must set_tunable32 L2ARC_DWPD_LIMIT 0
+
+# Use default L2ARC_WRITE_MAX (32MB/s per device)
+log_must set_tunable32 L2ARC_WRITE_MAX $expected_rate
+log_must set_tunable32 L2ARC_NOPREFETCH 0
+
+# Configure ARC large enough to keep L2ARC-eligible data flowing during measurement
+log_must set_tunable64 ARC_MIN $((512 * 1024 * 1024))
+log_must set_tunable64 ARC_MAX $((1024 * 1024 * 1024))
+
+# Create cache devices
+typeset cache_devs=""
+for i in $(seq 1 $num_devs); do
+	typeset dev="$VDIR/cache$i"
+	log_must truncate -s ${cache_sz}M $dev
+	cache_devs="$cache_devs $dev"
+done
+
+log_must zpool create -f $TESTPOOL $VDEV cache $cache_devs
+
+# Generate data and measure L2ARC writes
+typeset start=$(kstat arcstats.l2_write_bytes)
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=800
+log_must sleep $test_time
+typeset end=$(kstat arcstats.l2_write_bytes)
+
+typeset bytes=$((end - start))
+typeset bytes_mb=$((bytes / 1024 / 1024))
+# expected = 32MB/s * 5 devices * 5 seconds = 800MB
+typeset expected=$((expected_rate * num_devs * test_time))
+typeset expected_mb=$((expected / 1024 / 1024))
+
+log_note "L2ARC writes: ${bytes_mb}MB (expected ~${expected_mb}MB)"
+
+# Verify writes are at least 80% of expected
+typeset min_bytes=$((expected * 80 / 100))
+if [[ $bytes -lt $min_bytes ]]; then
+	log_fail "Writes ${bytes_mb}MB below minimum $((min_bytes/1024/1024))MB"
+fi
+
+log_must zpool destroy $TESTPOOL
+
+log_pass "L2ARC parallel writes scale with number of cache devices."
diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh
index 9b0a4865591c..52c0fa7600cc 100755
--- a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh
+++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh
@@ -50,16 +50,19 @@ function cleanup
 	log_must rm -f $VDEVS
 	log_must set_tunable32 L2ARC_TRIM_AHEAD $l2arc_trimahead
 	log_must set_tunable32 L2ARC_WRITE_MAX $l2arc_writemax
+	log_must set_tunable32 L2ARC_DWPD_LIMIT $l2arc_dwpdlimit
 }
 log_onexit cleanup
 
 # The cache device $TRIM_VDEV2 has to be small enough, so that
-# dev->l2ad_hand loops around and dev->l2ad_first=0.  Otherwise
+# dev->l2ad_hand loops around and dev->l2ad_first=0. Otherwise
 # l2arc_evict() exits before evicting/trimming.
 typeset l2arc_trimahead=$(get_tunable L2ARC_TRIM_AHEAD)
 typeset l2arc_writemax=$(get_tunable L2ARC_WRITE_MAX)
+typeset l2arc_dwpdlimit=$(get_tunable L2ARC_DWPD_LIMIT)
 log_must set_tunable32 L2ARC_TRIM_AHEAD 1
 log_must set_tunable32 L2ARC_WRITE_MAX $((64 * 1024 * 1024))
+log_must set_tunable32 L2ARC_DWPD_LIMIT 0
 VDEVS="$TRIM_VDEV1 $TRIM_VDEV2"
 log_must truncate -s $((MINVDEVSIZE)) $TRIM_VDEV2
 log_must truncate -s $((4 * MINVDEVSIZE)) $TRIM_VDEV1
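For reference, the DWPD semantics these tests exercise (l2arc_dwpd_limit is a percentage, 100 = 1.0 DWPD, and the resulting rate is always bounded by l2arc_write_max) can also be sanity-checked by hand. The ksh sketch below is illustrative only: it mirrors the documented semantics rather than the kernel's internal accounting, and it assumes a Linux host where the ARC statistics are exposed under /proc/spl/kstat/zfs/arcstats.

#!/bin/ksh
# Illustrative sketch (not part of the patch): derive the per-second write
# budget implied by l2arc_dwpd_limit for one cache device, then compare it
# with the write rate actually observed in arcstats over one minute.

typeset capacity=$((900 * 1024 * 1024))		# cache device size, e.g. 900 MiB
typeset dwpd_limit=100				# l2arc_dwpd_limit, 100 = 1.0 DWPD
typeset write_max=$((32 * 1024 * 1024))		# l2arc_write_max, bytes per second

typeset daily_budget=$((capacity * dwpd_limit / 100))
typeset dwpd_rate=$((daily_budget / 86400))
typeset effective=$((dwpd_rate < write_max ? dwpd_rate : write_max))
echo "daily budget: $((daily_budget / 1024 / 1024)) MiB," \
    "effective rate: ~$((effective / 1024)) KiB/s"

# Observed rate from the l2_write_bytes arcstat over a 60 second window.
typeset before=$(awk '$1 == "l2_write_bytes" {print $3}' /proc/spl/kstat/zfs/arcstats)
sleep 60
typeset after=$(awk '$1 == "l2_write_bytes" {print $3}' /proc/spl/kstat/zfs/arcstats)
echo "observed rate: ~$(((after - before) / 60 / 1024)) KiB/s"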