From 42d4276725567d4a7ff60131d5d68ef13422e2b4 Mon Sep 17 00:00:00 2001
From: Alden Keefe Sampson
Date: Wed, 30 Apr 2025 21:45:37 -0400
Subject: [PATCH 01/18] Add performance test of partial shard reads

---
 tests/test_codecs/test_sharding.py | 65 ++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py
index 403fd80e81..6efc3a251c 100644
--- a/tests/test_codecs/test_sharding.py
+++ b/tests/test_codecs/test_sharding.py
@@ -197,6 +197,71 @@ def test_sharding_partial_read(
     assert np.all(read_data == 1)
 
 
+@pytest.mark.slow_hypothesis
+@pytest.mark.parametrize("store", ["local"], indirect=["store"])
+def test_partial_shard_read_performance(store: Store) -> None:
+    import asyncio
+    import json
+    from functools import partial
+    from itertools import product
+    from timeit import timeit
+    from unittest.mock import AsyncMock
+
+    # The whole test array is a single shard to keep runtime manageable while
+    # using a realistic shard size (256 MiB uncompressed, ~115 MiB compressed).
+    # In practice, the array is likely to be much larger with many shards of this
+    # rough order of magnitude. There are 512 chunks per shard in this example.
+    array_shape = (512, 512, 512)
+    shard_shape = (512, 512, 512)  # 256 MiB uncompressed uint16s
+    chunk_shape = (64, 64, 64)  # 512 KiB uncompressed uint16s
+    dtype = np.uint16
+
+    a = zarr.create_array(
+        StorePath(store),
+        shape=array_shape,
+        chunks=chunk_shape,
+        shards=shard_shape,
+        compressors=BloscCodec(cname="zstd"),
+        dtype=dtype,
+        fill_value=np.iinfo(dtype).max,
+    )
+    # Narrow range of values lets zstd compress to about 1/2 of uncompressed size
+    a[:] = np.random.default_rng(123).integers(low=0, high=50, size=array_shape, dtype=dtype)
+
+    num_calls = 20
+    experiments = []
+    for concurrency, get_latency, statement in product(
+        [1, 10, 100], [0.0, 0.01], ["a[0, :, :]", "a[:, 0, :]", "a[:, :, 0]"]
+    ):
+        zarr.config.set({"async.concurrency": concurrency})
+
+        async def get_with_latency(*args: Any, get_latency: float, **kwargs: Any) -> Any:
+            await asyncio.sleep(get_latency)
+            return await store.get(*args, **kwargs)
+
+        store_mock = AsyncMock(wraps=store, spec=store.__class__)
+        store_mock.get.side_effect = partial(get_with_latency, get_latency=get_latency)
+
+        a = zarr.open_array(StorePath(store_mock))
+
+        store_mock.reset_mock()
+
+        # Each timeit call accesses a 512x512 slice covering 64 chunks
+        time = timeit(statement, number=num_calls, globals={"a": a}) / num_calls
+        experiments.append(
+            {
+                "concurrency": concurrency,
+                "statement": statement,
+                "get_latency": get_latency,
+                "time": time,
+                "store_get_calls": store_mock.get.call_count,
+            }
+        )
+
+    with open("zarr-python-partial-shard-read-performance-no-coalesce.json", "w") as f:
+        json.dump(experiments, f)
+
+
 @pytest.mark.parametrize(
     "array_fixture",
     [

From 2f06574b0f0626d182b5c82979be88cc9caa8d5a Mon Sep 17 00:00:00 2001
From: Alden Keefe Sampson
Date: Sat, 19 Apr 2025 00:57:46 -0400
Subject: [PATCH 02/18] WIP Consolidate reads of multiple chunks in the same shard

---
 src/zarr/codecs/sharding.py | 126 ++++++++++++++++++++++++++++++------
 1 file changed, 106 insertions(+), 20 deletions(-)

diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index 4638d973cb..05cddd77c1 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -38,11 +38,13 @@ from zarr.core.common import (
     ChunkCoords,
     ChunkCoordsLike,
+    concurrent_map,
     parse_enum,
     parse_named_configuration,
     parse_shapelike,
product, ) +from zarr.core.config import config from zarr.core.indexing import ( BasicIndexer, SelectorTuple, @@ -327,6 +329,11 @@ async def finalize( return await shard_builder.finalize(index_location, index_encoder) +class _ChunkCoordsByteSlice(NamedTuple): + coords: ChunkCoords + byte_slice: slice + + @dataclass(frozen=True) class ShardingCodec( ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin @@ -490,32 +497,21 @@ async def _decode_partial_single( all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks} # reading bytes of all requested chunks - shard_dict: ShardMapping = {} + shard_dict_maybe: ShardMapping | None = {} if self._is_total_shard(all_chunk_coords, chunks_per_shard): # read entire shard shard_dict_maybe = await self._load_full_shard_maybe( - byte_getter=byte_getter, - prototype=chunk_spec.prototype, - chunks_per_shard=chunks_per_shard, + byte_getter, chunk_spec.prototype, chunks_per_shard ) - if shard_dict_maybe is None: - return None - shard_dict = shard_dict_maybe else: # read some chunks within the shard - shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) - if shard_index is None: - return None - shard_dict = {} - for chunk_coords in all_chunk_coords: - chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords) - if chunk_byte_slice: - chunk_bytes = await byte_getter.get( - prototype=chunk_spec.prototype, - byte_range=RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]), - ) - if chunk_bytes: - shard_dict[chunk_coords] = chunk_bytes + shard_dict_maybe = await self._load_partial_shard_maybe( + byte_getter, chunk_spec.prototype, chunks_per_shard, all_chunk_coords + ) + + if shard_dict_maybe is None: + return None + shard_dict = shard_dict_maybe # decoding chunks and writing them into the output buffer await self.codec_pipeline.read( @@ -537,6 +533,96 @@ async def _decode_partial_single( else: return out + async def _load_partial_shard_maybe( + self, + byte_getter: ByteGetter, + prototype: BufferPrototype, + chunks_per_shard: ChunkCoords, + all_chunk_coords: set[ChunkCoords], + ) -> ShardMapping | None: + shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) + if shard_index is None: + return None + + chunks = [ + _ChunkCoordsByteSlice(chunk_coords, slice(*chunk_byte_slice)) + for chunk_coords in all_chunk_coords + if (chunk_byte_slice := shard_index.get_chunk_slice(chunk_coords)) + ] + if len(chunks) == 0: + return {} + + groups = self._coalesce_chunks(chunks) + + shard_dicts = await concurrent_map( + [(group, byte_getter, prototype) for group in groups], + self._get_group_bytes, + config.get("async.concurrency"), + ) + + shard_dict: ShardMutableMapping = {} + for d in shard_dicts: + shard_dict.update(d) + + return shard_dict + + def _coalesce_chunks( + self, + chunks: list[_ChunkCoordsByteSlice], + max_gap_bytes: int = 2**20, # 1MiB + coalesce_max_bytes: int = 100 * 2**20, # 100MiB + ) -> list[list[_ChunkCoordsByteSlice]]: + sorted_chunks = sorted(chunks, key=lambda c: c.byte_slice.start) + + groups = [] + current_group = [sorted_chunks[0]] + + for chunk in sorted_chunks[1:]: + gap_to_chunk = chunk.byte_slice.start - current_group[-1].byte_slice.stop + current_group_size = ( + current_group[-1].byte_slice.stop - current_group[0].byte_slice.start + ) + if gap_to_chunk < max_gap_bytes and current_group_size < coalesce_max_bytes: + current_group.append(chunk) + else: + groups.append(current_group) + current_group = [chunk] + + groups.append(current_group) + + 
from pprint import pprint + + pprint( + [ + f"{len(g)} chunks, {(g[-1].byte_slice.stop - g[0].byte_slice.start) / 1e6:.1f}MB" + for g in groups + ] + ) + + return groups + + async def _get_group_bytes( + self, + group: list[_ChunkCoordsByteSlice], + byte_getter: ByteGetter, + prototype: BufferPrototype, + ) -> ShardMapping: + group_start = group[0].byte_slice.start + group_end = group[-1].byte_slice.stop + + group_bytes = await byte_getter.get( + prototype=prototype, + byte_range=RangeByteRequest(group_start, group_end), + ) + if group_bytes is None: + return {} + + shard_dict = {} + for chunk in group: + s = slice(chunk.byte_slice.start - group_start, chunk.byte_slice.stop - group_start) + shard_dict[chunk.coords] = group_bytes[s] + return shard_dict + async def _encode_single( self, shard_array: NDBuffer, From b56759984ee627076bb7b876f444d4ed2fc9907b Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Mon, 21 Apr 2025 23:54:11 -0400 Subject: [PATCH 03/18] Add test and make max gap and max coalesce size config options --- src/zarr/codecs/sharding.py | 14 +++----------- src/zarr/core/config.py | 6 ++++++ tests/test_codecs/test_sharding.py | 31 +++++++++++++++++++++++++++++- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 05cddd77c1..94cdda132f 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -569,9 +569,10 @@ async def _load_partial_shard_maybe( def _coalesce_chunks( self, chunks: list[_ChunkCoordsByteSlice], - max_gap_bytes: int = 2**20, # 1MiB - coalesce_max_bytes: int = 100 * 2**20, # 100MiB ) -> list[list[_ChunkCoordsByteSlice]]: + max_gap_bytes = config.get("sharding.read.coalesce_max_gap_bytes") + coalesce_max_bytes = config.get("sharding.read.coalesce_max_bytes") + sorted_chunks = sorted(chunks, key=lambda c: c.byte_slice.start) groups = [] @@ -590,15 +591,6 @@ def _coalesce_chunks( groups.append(current_group) - from pprint import pprint - - pprint( - [ - f"{len(g)} chunks, {(g[-1].byte_slice.stop - g[0].byte_slice.start) / 1e6:.1f}MB" - for g in groups - ] - ) - return groups async def _get_group_bytes( diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 2a10943d80..4622a939d8 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -108,6 +108,12 @@ def enable_gpu(self) -> ConfigSet: }, "async": {"concurrency": 10, "timeout": None}, "threading": {"max_workers": None}, + "sharding": { + "read": { + "coalesce_max_bytes": 100 * 2**20, # 100MiB + "coalesce_max_gap_bytes": 2**20, # 1MiB + } + }, "json_indent": 2, "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 6efc3a251c..2feb7bbb2f 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -258,10 +258,39 @@ async def get_with_latency(*args: Any, get_latency: float, **kwargs: Any) -> Any } ) - with open("zarr-python-partial-shard-read-performance-no-coalesce.json", "w") as f: + with open("zarr-python-partial-shard-read-performance.json", "w") as f: json.dump(experiments, f) +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_sharding_multiple_chunks_partial_shard_read( + store: Store, index_location: ShardingCodecIndexLocation +) -> None: + array_shape = (8, 64) + shard_shape = (4, 32) + chunk_shape = (2, 4) + + data = 
np.arange(np.prod(array_shape), dtype="float32").reshape(array_shape) + + a = zarr.create_array( + StorePath(store), + shape=data.shape, + chunks=chunk_shape, + shards={"shape": shard_shape, "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + dtype=data.dtype, + fill_value=1, + ) + a[:] = data + + # Reads 2.5 (3 full, one partial) chunks each from 2 shards (a subset of both shards) + assert np.allclose(a[0, 22:42], np.arange(22, 42, dtype="float32")) + + # Reads 2 chunks from both shards along dimension 0 + assert np.allclose(a[:, 0], np.arange(0, data.size, array_shape[1], dtype="float32")) + + @pytest.mark.parametrize( "array_fixture", [ From c47b87c50190bc5f159ac895d8cf1bc00af9d9ad Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Tue, 22 Apr 2025 00:14:25 -0400 Subject: [PATCH 04/18] Code clarity and comments --- src/zarr/codecs/sharding.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 94cdda132f..3107f63bbf 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -570,6 +570,15 @@ def _coalesce_chunks( self, chunks: list[_ChunkCoordsByteSlice], ) -> list[list[_ChunkCoordsByteSlice]]: + """ + Combine chunks from a single shard into groups that should be read together + in a single request. + + Respects the following configuration options: + - `sharding.read.coalesce_max_gap_bytes`: The maximum gap between + chunks to coalesce into a single group. + - `sharding.read.coalesce_max_bytes`: The maximum number of bytes in a group. + """ max_gap_bytes = config.get("sharding.read.coalesce_max_gap_bytes") coalesce_max_bytes = config.get("sharding.read.coalesce_max_bytes") @@ -602,6 +611,7 @@ async def _get_group_bytes( group_start = group[0].byte_slice.start group_end = group[-1].byte_slice.stop + # A single call to retrieve the bytes for the entire group. group_bytes = await byte_getter.get( prototype=prototype, byte_range=RangeByteRequest(group_start, group_end), @@ -609,10 +619,15 @@ async def _get_group_bytes( if group_bytes is None: return {} + # Extract the bytes corresponding to each chunk in group from group_bytes. 
shard_dict = {} for chunk in group: - s = slice(chunk.byte_slice.start - group_start, chunk.byte_slice.stop - group_start) - shard_dict[chunk.coords] = group_bytes[s] + chunk_slice = slice( + chunk.byte_slice.start - group_start, + chunk.byte_slice.stop - group_start, + ) + shard_dict[chunk.coords] = group_bytes[chunk_slice] + return shard_dict async def _encode_single( From 0f5aa0071a94cc83ec3fae41e371dfea06864815 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Wed, 30 Apr 2025 21:39:41 -0400 Subject: [PATCH 05/18] Test that chunk request coalescing reduces calls to store --- tests/test_codecs/test_sharding.py | 53 +++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 2feb7bbb2f..82e5fe612d 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,5 +1,6 @@ import pickle from typing import Any +from unittest.mock import AsyncMock import numpy as np import numpy.typing as npt @@ -9,7 +10,7 @@ import zarr.api import zarr.api.asynchronous from zarr import Array -from zarr.abc.store import Store +from zarr.abc.store import RangeByteRequest, Store, SuffixByteRequest from zarr.codecs import ( BloscCodec, ShardingCodec, @@ -264,17 +265,24 @@ async def get_with_latency(*args: Any, get_latency: float, **kwargs: Any) -> Any @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +@pytest.mark.parametrize("coalesce_reads", [True, False]) def test_sharding_multiple_chunks_partial_shard_read( - store: Store, index_location: ShardingCodecIndexLocation + store: Store, index_location: ShardingCodecIndexLocation, coalesce_reads: bool ) -> None: - array_shape = (8, 64) - shard_shape = (4, 32) + array_shape = (16, 64) + shard_shape = (8, 32) chunk_shape = (2, 4) - data = np.arange(np.prod(array_shape), dtype="float32").reshape(array_shape) + if coalesce_reads: + # 1MiB, enough to coalesce all chunks within a shard in this example + zarr.config.set({"sharding.read.coalesce_max_gap_bytes": 2**20}) + else: + zarr.config.set({"sharding.read.coalesce_max_gap_bytes": -1}) # disable coalescing + + store_mock = AsyncMock(wraps=store, spec=store.__class__) a = zarr.create_array( - StorePath(store), + StorePath(store_mock), shape=data.shape, chunks=chunk_shape, shards={"shape": shard_shape, "index_location": index_location}, @@ -284,12 +292,41 @@ def test_sharding_multiple_chunks_partial_shard_read( ) a[:] = data - # Reads 2.5 (3 full, one partial) chunks each from 2 shards (a subset of both shards) + store_mock.reset_mock() # ignore store calls during array creation + + # Reads 3 (2 full, 1 partial) chunks each from 2 shards (a subset of both shards) + # for a total of 6 chunks accessed assert np.allclose(a[0, 22:42], np.arange(22, 42, dtype="float32")) - # Reads 2 chunks from both shards along dimension 0 + if coalesce_reads: + # 2 shard index requests + 2 coalesced chunk data byte ranges (one for each shard) + assert store_mock.get.call_count == 4 + else: + # 2 shard index requests + 6 chunks + assert store_mock.get.call_count == 8 + + for method, args, kwargs in store_mock.method_calls: + assert method == "get" + assert args[0].startswith("c/") # get from a chunk + assert isinstance(kwargs["byte_range"], (SuffixByteRequest, RangeByteRequest)) + + store_mock.reset_mock() + + # Reads 4 chunks from both shards along dimension 0 for a total of 8 chunks accessed assert 
np.allclose(a[:, 0], np.arange(0, data.size, array_shape[1], dtype="float32")) + if coalesce_reads: + # 2 shard index requests + 2 coalesced chunk data byte ranges (one for each shard) + assert store_mock.get.call_count == 4 + else: + # 2 shard index requests + 8 chunks + assert store_mock.get.call_count == 10 + + for method, args, kwargs in store_mock.method_calls: + assert method == "get" + assert args[0].startswith("c/") # get from a chunk + assert isinstance(kwargs["byte_range"], (SuffixByteRequest, RangeByteRequest)) + @pytest.mark.parametrize( "array_fixture", From 53de4a1ca11a7b686a4ec924d571b4dac22dd8d1 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Wed, 30 Apr 2025 21:53:35 -0400 Subject: [PATCH 06/18] Profile a few values for coalesce_max_gap --- tests/test_codecs/test_sharding.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 82e5fe612d..cb14ee97dc 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -198,6 +198,7 @@ def test_sharding_partial_read( assert np.all(read_data == 1) +@pytest.mark.skip("This is profiling rather than a test") @pytest.mark.slow_hypothesis @pytest.mark.parametrize("store", ["local"], indirect=["store"]) def test_partial_shard_read_performance(store: Store) -> None: @@ -231,10 +232,18 @@ def test_partial_shard_read_performance(store: Store) -> None: num_calls = 20 experiments = [] - for concurrency, get_latency, statement in product( - [1, 10, 100], [0.0, 0.01], ["a[0, :, :]", "a[:, 0, :]", "a[:, :, 0]"] + for concurrency, get_latency, coalesce_max_gap, statement in product( + [1, 10, 100], + [0.0, 0.01], + [-1, 2**20, 10 * 2**20], + ["a[0, :, :]", "a[:, 0, :]", "a[:, :, 0]"], ): - zarr.config.set({"async.concurrency": concurrency}) + zarr.config.set( + { + "async.concurrency": concurrency, + "sharding.read.coalesce_max_gap_bytes": coalesce_max_gap, + } + ) async def get_with_latency(*args: Any, get_latency: float, **kwargs: Any) -> Any: await asyncio.sleep(get_latency) @@ -252,14 +261,15 @@ async def get_with_latency(*args: Any, get_latency: float, **kwargs: Any) -> Any experiments.append( { "concurrency": concurrency, - "statement": statement, + "coalesce_max_gap": coalesce_max_gap, "get_latency": get_latency, + "statement": statement, "time": time, "store_get_calls": store_mock.get.call_count, } ) - with open("zarr-python-partial-shard-read-performance.json", "w") as f: + with open("zarr-python-partial-shard-read-performance-with-coalesce.json", "w") as f: json.dump(experiments, f) From 6cda80a339171c49ea574b8f8d162ff8e4115fbd Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Sun, 1 Jun 2025 23:03:19 -0400 Subject: [PATCH 07/18] Update [doc]tests to include new sharding.read.* values --- docs/user-guide/config.rst | 2 ++ tests/test_config.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 91ffe50b91..1822011331 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -88,4 +88,6 @@ This is the current default configuration:: 'default_zarr_format': 3, 'json_indent': 2, 'ndbuffer': 'zarr.core.buffer.cpu.NDBuffer', + 'sharding': {'read': {'coalesce_max_bytes': 104857600, + 'coalesce_max_gap_bytes': 1048576}}, 'threading': {'max_workers': None}} diff --git a/tests/test_config.py b/tests/test_config.py index 2cbf172752..81d85d9746 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ 
-103,6 +103,12 @@ def test_config_defaults_set() -> None:
                 "vlen-utf8": "zarr.codecs.vlen_utf8.VLenUTF8Codec",
                 "vlen-bytes": "zarr.codecs.vlen_utf8.VLenBytesCodec",
             },
+            "sharding": {
+                "read": {
+                    "coalesce_max_bytes": 104857600,  # 100 MiB
+                    "coalesce_max_gap_bytes": 1048576,  # 1 MiB
+                },
+            },
         }
     ]
     assert config.get("array.order") == "C"
@@ -110,6 +116,8 @@
     assert config.get("async.timeout") is None
     assert config.get("codec_pipeline.batch_size") == 1
     assert config.get("json_indent") == 2
+    assert config.get("sharding.read.coalesce_max_bytes") == 100 * 2**20  # 100 MiB
+    assert config.get("sharding.read.coalesce_max_gap_bytes") == 2**20  # 1 MiB
 
 
 @pytest.mark.parametrize(

From af0d1446c0fb9eac40e20ed978c3ad9007667137 Mon Sep 17 00:00:00 2001
From: Alden Keefe Sampson
Date: Sun, 1 Jun 2025 23:05:30 -0400
Subject: [PATCH 08/18] document sharded read config options in user-guide/config.rst

---
 docs/user-guide/config.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst
index 1822011331..99a3826df1 100644
--- a/docs/user-guide/config.rst
+++ b/docs/user-guide/config.rst
@@ -33,6 +33,10 @@ Configuration options include the following:
 - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers``
 - Selections of implementations of codecs, codec pipelines and buffers
 - Enabling GPU support with ``zarr.config.enable_gpu()``. See :ref:`user-guide-gpu` for more.
+- Tuning reads from sharded zarrs. When reading less than a complete shard, reads to nearby chunks
+  within the same shard will be combined into a single request if they are less than
+  ``sharding.read.coalesce_max_gap_bytes`` apart and the combined request size is less than
+  ``sharding.read.coalesce_max_bytes``.
 
 For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations
 in the registry and then select them in the config.

From 094ab3872c09a92562755dfd32230c37eb277c4b Mon Sep 17 00:00:00 2001
From: Alden Keefe Sampson
Date: Sun, 1 Jun 2025 23:09:45 -0400
Subject: [PATCH 09/18] tweak logic: start new coalesced group if coalescing would exceed `coalesce_max_bytes`

The previous logic only started a new group if the existing group's size already exceeded coalesce_max_bytes.
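
For example (illustrative byte sizes, not taken from this diff): with coalesce_max_bytes = 100
and two adjacent 60-byte chunks, the old check saw a 60-byte group and merged, issuing a
120-byte read; the new check compares the prospective 120-byte size against the cap and closes
the group first. A standalone sketch of that boundary case:

    max_gap_bytes, coalesce_max_bytes = 10, 100
    group = [(0, 60)]  # (start, stop) byte range of the chunk already in the group
    nxt = (60, 120)    # adjacent 60-byte chunk being considered
    gap_to_chunk = nxt[0] - group[-1][1]      # 0 -> close enough to merge
    size_if_coalesced = nxt[1] - group[0][0]  # 120 -> would exceed the cap
    # old rule: current group size (60) < 100 -> merge, producing a 120-byte read
    # new rule: prospective size (120) >= 100 -> close the group and start a new one
    assert not (gap_to_chunk < max_gap_bytes and size_if_coalesced < coalesce_max_bytes)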
--- src/zarr/codecs/sharding.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 3107f63bbf..75cc748e96 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -589,10 +589,8 @@ def _coalesce_chunks( for chunk in sorted_chunks[1:]: gap_to_chunk = chunk.byte_slice.start - current_group[-1].byte_slice.stop - current_group_size = ( - current_group[-1].byte_slice.stop - current_group[0].byte_slice.start - ) - if gap_to_chunk < max_gap_bytes and current_group_size < coalesce_max_bytes: + size_if_coalesced = chunk.byte_slice.stop - current_group[0].byte_slice.start + if gap_to_chunk < max_gap_bytes and size_if_coalesced < coalesce_max_bytes: current_group.append(chunk) else: groups.append(current_group) From 50dea49a3dd485e0ad8da42bbf36f1df1d0a2acf Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Sun, 1 Jun 2025 23:21:28 -0400 Subject: [PATCH 10/18] set `mypy_path = "src"` to help pre-commit mypy find imported classes --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index d25af7c5fc..b78c628b86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -347,6 +347,7 @@ ignore = [ [tool.mypy] python_version = "3.11" ignore_missing_imports = true +mypy_path = "src" namespace_packages = false strict = true From fe077c40d7d6e0cded0d368cb86c3fd013aa4bf2 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Sun, 1 Jun 2025 23:54:13 -0400 Subject: [PATCH 11/18] Reorder methods in sharding.py, add docstring + commenting --- src/zarr/codecs/sharding.py | 238 ++++++++++++++++++++---------------- 1 file changed, 134 insertions(+), 104 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 75cc748e96..d5cff1a2cc 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -198,7 +198,9 @@ async def from_bytes( @classmethod def create_empty( - cls, chunks_per_shard: ChunkCoords, buffer_prototype: BufferPrototype | None = None + cls, + chunks_per_shard: ChunkCoords, + buffer_prototype: BufferPrototype | None = None, ) -> _ShardReader: if buffer_prototype is None: buffer_prototype = default_buffer_prototype() @@ -248,7 +250,9 @@ def merge_with_morton_order( @classmethod def create_empty( - cls, chunks_per_shard: ChunkCoords, buffer_prototype: BufferPrototype | None = None + cls, + chunks_per_shard: ChunkCoords, + buffer_prototype: BufferPrototype | None = None, ) -> _ShardBuilder: if buffer_prototype is None: buffer_prototype = default_buffer_prototype() @@ -330,13 +334,17 @@ async def finalize( class _ChunkCoordsByteSlice(NamedTuple): + """Holds a chunk's coordinates and it's byte range in a serialized shard.""" + coords: ChunkCoords byte_slice: slice @dataclass(frozen=True) class ShardingCodec( - ArrayBytesCodec, ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin + ArrayBytesCodec, + ArrayBytesCodecPartialDecodeMixin, + ArrayBytesCodecPartialEncodeMixin, ): chunk_shape: ChunkCoords codecs: tuple[Codec, ...] 
@@ -446,7 +454,10 @@ async def _decode_single( # setup output array out = chunk_spec.prototype.nd_buffer.create( - shape=shard_shape, dtype=shard_spec.dtype, order=shard_spec.order, fill_value=0 + shape=shard_shape, + dtype=shard_spec.dtype, + order=shard_spec.order, + fill_value=0, ) shard_dict = await _ShardReader.from_bytes(shard_bytes, self, chunks_per_shard) @@ -490,7 +501,10 @@ async def _decode_partial_single( # setup output array out = shard_spec.prototype.nd_buffer.create( - shape=indexer.shape, dtype=shard_spec.dtype, order=shard_spec.order, fill_value=0 + shape=indexer.shape, + dtype=shard_spec.dtype, + order=shard_spec.order, + fill_value=0, ) indexed_chunks = list(indexer) @@ -533,101 +547,6 @@ async def _decode_partial_single( else: return out - async def _load_partial_shard_maybe( - self, - byte_getter: ByteGetter, - prototype: BufferPrototype, - chunks_per_shard: ChunkCoords, - all_chunk_coords: set[ChunkCoords], - ) -> ShardMapping | None: - shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) - if shard_index is None: - return None - - chunks = [ - _ChunkCoordsByteSlice(chunk_coords, slice(*chunk_byte_slice)) - for chunk_coords in all_chunk_coords - if (chunk_byte_slice := shard_index.get_chunk_slice(chunk_coords)) - ] - if len(chunks) == 0: - return {} - - groups = self._coalesce_chunks(chunks) - - shard_dicts = await concurrent_map( - [(group, byte_getter, prototype) for group in groups], - self._get_group_bytes, - config.get("async.concurrency"), - ) - - shard_dict: ShardMutableMapping = {} - for d in shard_dicts: - shard_dict.update(d) - - return shard_dict - - def _coalesce_chunks( - self, - chunks: list[_ChunkCoordsByteSlice], - ) -> list[list[_ChunkCoordsByteSlice]]: - """ - Combine chunks from a single shard into groups that should be read together - in a single request. - - Respects the following configuration options: - - `sharding.read.coalesce_max_gap_bytes`: The maximum gap between - chunks to coalesce into a single group. - - `sharding.read.coalesce_max_bytes`: The maximum number of bytes in a group. - """ - max_gap_bytes = config.get("sharding.read.coalesce_max_gap_bytes") - coalesce_max_bytes = config.get("sharding.read.coalesce_max_bytes") - - sorted_chunks = sorted(chunks, key=lambda c: c.byte_slice.start) - - groups = [] - current_group = [sorted_chunks[0]] - - for chunk in sorted_chunks[1:]: - gap_to_chunk = chunk.byte_slice.start - current_group[-1].byte_slice.stop - size_if_coalesced = chunk.byte_slice.stop - current_group[0].byte_slice.start - if gap_to_chunk < max_gap_bytes and size_if_coalesced < coalesce_max_bytes: - current_group.append(chunk) - else: - groups.append(current_group) - current_group = [chunk] - - groups.append(current_group) - - return groups - - async def _get_group_bytes( - self, - group: list[_ChunkCoordsByteSlice], - byte_getter: ByteGetter, - prototype: BufferPrototype, - ) -> ShardMapping: - group_start = group[0].byte_slice.start - group_end = group[-1].byte_slice.stop - - # A single call to retrieve the bytes for the entire group. - group_bytes = await byte_getter.get( - prototype=prototype, - byte_range=RangeByteRequest(group_start, group_end), - ) - if group_bytes is None: - return {} - - # Extract the bytes corresponding to each chunk in group from group_bytes. 
- shard_dict = {} - for chunk in group: - chunk_slice = slice( - chunk.byte_slice.start - group_start, - chunk.byte_slice.stop - group_start, - ) - shard_dict[chunk.coords] = group_bytes[chunk_slice] - - return shard_dict - async def _encode_single( self, shard_array: NDBuffer, @@ -688,7 +607,9 @@ async def _encode_partial_single( indexer = list( get_indexer( - selection, shape=shard_shape, chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape) + selection, + shape=shard_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), ) ) @@ -762,7 +683,8 @@ def _shard_index_size(self, chunks_per_shard: ChunkCoords) -> int: get_pipeline_class() .from_codecs(self.index_codecs) .compute_encoded_size( - 16 * product(chunks_per_shard), self._get_index_chunk_spec(chunks_per_shard) + 16 * product(chunks_per_shard), + self._get_index_chunk_spec(chunks_per_shard), ) ) @@ -807,7 +729,8 @@ async def _load_shard_index_maybe( ) else: index_bytes = await byte_getter.get( - prototype=numpy_buffer_prototype(), byte_range=SuffixByteRequest(shard_index_size) + prototype=numpy_buffer_prototype(), + byte_range=SuffixByteRequest(shard_index_size), ) if index_bytes is not None: return await self._decode_shard_index(index_bytes, chunks_per_shard) @@ -821,7 +744,10 @@ async def _load_shard_index( ) or _ShardIndex.create_empty(chunks_per_shard) async def _load_full_shard_maybe( - self, byte_getter: ByteGetter, prototype: BufferPrototype, chunks_per_shard: ChunkCoords + self, + byte_getter: ByteGetter, + prototype: BufferPrototype, + chunks_per_shard: ChunkCoords, ) -> _ShardReader | None: shard_bytes = await byte_getter.get(prototype=prototype) @@ -831,6 +757,110 @@ async def _load_full_shard_maybe( else None ) + async def _load_partial_shard_maybe( + self, + byte_getter: ByteGetter, + prototype: BufferPrototype, + chunks_per_shard: ChunkCoords, + all_chunk_coords: set[ChunkCoords], + ) -> ShardMapping | None: + """ + Read bytes from `byte_getter` for the case where the read is less than a full shard. + Returns a mapping of chunk coordinates to bytes. + """ + shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) + if shard_index is None: + return None + + chunks = [ + _ChunkCoordsByteSlice(chunk_coords, slice(*chunk_byte_slice)) + for chunk_coords in all_chunk_coords + # Drop chunks where index lookup fails + if (chunk_byte_slice := shard_index.get_chunk_slice(chunk_coords)) + ] + if len(chunks) == 0: + return {} + + groups = self._coalesce_chunks(chunks) + + shard_dicts = await concurrent_map( + [(group, byte_getter, prototype) for group in groups], + self._get_group_bytes, + config.get("async.concurrency"), + ) + + shard_dict: ShardMutableMapping = {} + for d in shard_dicts: + shard_dict.update(d) + + return shard_dict + + def _coalesce_chunks( + self, + chunks: list[_ChunkCoordsByteSlice], + ) -> list[list[_ChunkCoordsByteSlice]]: + """ + Combine chunks from a single shard into groups that should be read together + in a single request. + + Respects the following configuration options: + - `sharding.read.coalesce_max_gap_bytes`: The maximum gap between + chunks to coalesce into a single group. + - `sharding.read.coalesce_max_bytes`: The maximum number of bytes in a group. 
+ """ + max_gap_bytes = config.get("sharding.read.coalesce_max_gap_bytes") + coalesce_max_bytes = config.get("sharding.read.coalesce_max_bytes") + + sorted_chunks = sorted(chunks, key=lambda c: c.byte_slice.start) + + groups = [] + current_group = [sorted_chunks[0]] + + for chunk in sorted_chunks[1:]: + gap_to_chunk = chunk.byte_slice.start - current_group[-1].byte_slice.stop + size_if_coalesced = chunk.byte_slice.stop - current_group[0].byte_slice.start + if gap_to_chunk < max_gap_bytes and size_if_coalesced < coalesce_max_bytes: + current_group.append(chunk) + else: + groups.append(current_group) + current_group = [chunk] + + groups.append(current_group) + + return groups + + async def _get_group_bytes( + self, + group: list[_ChunkCoordsByteSlice], + byte_getter: ByteGetter, + prototype: BufferPrototype, + ) -> ShardMapping: + """ + Reads a possibly coalesced group of one or more chunks from a shard. + Returns a mapping of chunk coordinates to bytes. + """ + group_start = group[0].byte_slice.start + group_end = group[-1].byte_slice.stop + + # A single call to retrieve the bytes for the entire group. + group_bytes = await byte_getter.get( + prototype=prototype, + byte_range=RangeByteRequest(group_start, group_end), + ) + if group_bytes is None: + return {} + + # Extract the bytes corresponding to each chunk in group from group_bytes. + shard_dict = {} + for chunk in group: + chunk_slice = slice( + chunk.byte_slice.start - group_start, + chunk.byte_slice.stop - group_start, + ) + shard_dict[chunk.coords] = group_bytes[chunk_slice] + + return shard_dict + def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int: chunks_per_shard = self._get_chunks_per_shard(shard_spec) return input_byte_length + self._shard_index_size(chunks_per_shard) From 85fe05207529206e8ed48a5ec749f4b54ee4ab6f Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Sun, 1 Jun 2025 23:56:27 -0400 Subject: [PATCH 12/18] wording docs fix --- docs/user-guide/config.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/config.rst b/docs/user-guide/config.rst index 99a3826df1..27a4b2c60a 100644 --- a/docs/user-guide/config.rst +++ b/docs/user-guide/config.rst @@ -33,7 +33,7 @@ Configuration options include the following: - Async and threading options, e.g. ``async.concurrency`` and ``threading.max_workers`` - Selections of implementations of codecs, codec pipelines and buffers - Enabling GPU support with ``zarr.config.enable_gpu()``. See :ref:`user-guide-gpu` for more. -- Tuning reads from sharded zarrs. When reading less than a complete shard, reads to nearby chunks +- Tuning reads from sharded zarrs. When reading less than a complete shard, reads of nearby chunks within the same shard will be combined into a single request if they are less than ``sharding.read.coalesce_max_gap_bytes`` apart and the combined request size is less than ``sharding.read.coalesce_max_bytes``. 
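
For reference, a minimal usage sketch of the options documented above; the array path and the
selection are placeholders, not part of any patch in this series:

    import zarr

    zarr.config.set(
        {
            # merge chunk reads separated by less than 1 MiB within a shard...
            "sharding.read.coalesce_max_gap_bytes": 2**20,
            # ...as long as each combined request stays under 100 MiB
            "sharding.read.coalesce_max_bytes": 100 * 2**20,
        }
    )
    a = zarr.open_array("data/sharded.zarr")  # placeholder path to an existing sharded array
    block = a[0, :, :]  # partial-shard read; nearby chunks are fetched in coalesced requests
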
From f039e0aa89c0b6a5e0e7154624751ca602fc285b Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Mon, 2 Jun 2025 00:04:27 -0400 Subject: [PATCH 13/18] docstring clarification --- src/zarr/codecs/sharding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index d5cff1a2cc..6c84f09f0d 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -334,7 +334,7 @@ async def finalize( class _ChunkCoordsByteSlice(NamedTuple): - """Holds a chunk's coordinates and it's byte range in a serialized shard.""" + """Holds a chunk's coordinates and its byte range in a serialized shard.""" coords: ChunkCoords byte_slice: slice @@ -765,7 +765,7 @@ async def _load_partial_shard_maybe( all_chunk_coords: set[ChunkCoords], ) -> ShardMapping | None: """ - Read bytes from `byte_getter` for the case where the read is less than a full shard. + Read chunks from `byte_getter` for the case where the read is less than a full shard. Returns a mapping of chunk coordinates to bytes. """ shard_index = await self._load_shard_index_maybe(byte_getter, chunks_per_shard) From af9107a714fbea8b618f7ff7dd5529f3d9bf76b8 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Mon, 2 Jun 2025 12:39:03 -0400 Subject: [PATCH 14/18] trigger precommit on all python files changed in this pull request trying to get the ruff format that's happening locally during pre-commit to match the pre-commit run that is failing on CI. --- src/zarr/codecs/sharding.py | 1 + src/zarr/core/config.py | 1 + tests/test_codecs/test_sharding.py | 1 + tests/test_config.py | 1 + 4 files changed, 4 insertions(+) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6c84f09f0d..725ae16ffb 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,3 +1,4 @@ +# NOOP change from __future__ import annotations from collections.abc import Iterable, Mapping, MutableMapping diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 4622a939d8..73727eea1a 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,3 +1,4 @@ +# NOOP change """ The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. 
For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index cb14ee97dc..67b5e7aa9e 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,3 +1,4 @@ +# NOOP change import pickle from typing import Any from unittest.mock import AsyncMock diff --git a/tests/test_config.py b/tests/test_config.py index 81d85d9746..9e174e7932 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,3 +1,4 @@ +# NOOP change import os from collections.abc import Iterable from typing import Any From 35238bdd125b2ec04f7736f9faf790f84377a9e8 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Mon, 2 Jun 2025 12:40:22 -0400 Subject: [PATCH 15/18] revert trigger for pre-commit ruff format --- src/zarr/codecs/sharding.py | 1 - src/zarr/core/config.py | 1 - tests/test_codecs/test_sharding.py | 1 - tests/test_config.py | 1 - 4 files changed, 4 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 725ae16ffb..6c84f09f0d 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,4 +1,3 @@ -# NOOP change from __future__ import annotations from collections.abc import Iterable, Mapping, MutableMapping diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 73727eea1a..4622a939d8 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -1,4 +1,3 @@ -# NOOP change """ The config module is responsible for managing the configuration of zarr and is based on the Donfig python library. For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index 67b5e7aa9e..cb14ee97dc 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -1,4 +1,3 @@ -# NOOP change import pickle from typing import Any from unittest.mock import AsyncMock diff --git a/tests/test_config.py b/tests/test_config.py index 9e174e7932..81d85d9746 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,3 @@ -# NOOP change import os from collections.abc import Iterable from typing import Any From 7d55768161d455a5db10c0ae03500fbada62af15 Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Mon, 2 Jun 2025 14:16:05 -0400 Subject: [PATCH 16/18] Add changes/3004.feature.rst --- changes/3004.feature.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changes/3004.feature.rst diff --git a/changes/3004.feature.rst b/changes/3004.feature.rst new file mode 100644 index 0000000000..b15a5ec943 --- /dev/null +++ b/changes/3004.feature.rst @@ -0,0 +1,3 @@ +Optimizes reading more than one, but not all, chunks from a shard. Chunks are now read in parallel +and reads of nearby chunks within the same shard are combined to reduce the number of calls to the store. +See :ref:`user-guide-config` for more details. 
\ No newline at end of file From 4c6956088e4010e6403e31705cc3d3fa1b39120e Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Mon, 2 Jun 2025 22:10:56 -0400 Subject: [PATCH 17/18] Consistently return None on failure and test partial shard read failure modes --- src/zarr/codecs/sharding.py | 10 ++- tests/test_codecs/test_sharding.py | 128 +++++++++++++++++++++++++++-- 2 files changed, 129 insertions(+), 9 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 6c84f09f0d..8001b24d62 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -778,8 +778,8 @@ async def _load_partial_shard_maybe( # Drop chunks where index lookup fails if (chunk_byte_slice := shard_index.get_chunk_slice(chunk_coords)) ] - if len(chunks) == 0: - return {} + if len(chunks) < len(all_chunk_coords): + return None groups = self._coalesce_chunks(chunks) @@ -791,6 +791,8 @@ async def _load_partial_shard_maybe( shard_dict: ShardMutableMapping = {} for d in shard_dicts: + if d is None: + return None shard_dict.update(d) return shard_dict @@ -834,7 +836,7 @@ async def _get_group_bytes( group: list[_ChunkCoordsByteSlice], byte_getter: ByteGetter, prototype: BufferPrototype, - ) -> ShardMapping: + ) -> ShardMapping | None: """ Reads a possibly coalesced group of one or more chunks from a shard. Returns a mapping of chunk coordinates to bytes. @@ -848,7 +850,7 @@ async def _get_group_bytes( byte_range=RangeByteRequest(group_start, group_end), ) if group_bytes is None: - return {} + return None # Extract the bytes corresponding to each chunk in group from group_bytes. shard_dict = {} diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index cb14ee97dc..dbe64a32d5 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -111,7 +111,9 @@ def test_sharding_scalar( indirect=["array_fixture"], ) def test_sharding_partial( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, + array_fixture: npt.NDArray[Any], + index_location: ShardingCodecIndexLocation, ) -> None: data = array_fixture spath = StorePath(store) @@ -147,7 +149,9 @@ def test_sharding_partial( indirect=["array_fixture"], ) def test_sharding_partial_readwrite( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, + array_fixture: npt.NDArray[Any], + index_location: ShardingCodecIndexLocation, ) -> None: data = array_fixture spath = StorePath(store) @@ -179,7 +183,9 @@ def test_sharding_partial_readwrite( @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) def test_sharding_partial_read( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, + array_fixture: npt.NDArray[Any], + index_location: ShardingCodecIndexLocation, ) -> None: data = array_fixture spath = StorePath(store) @@ -338,6 +344,114 @@ def test_sharding_multiple_chunks_partial_shard_read( assert isinstance(kwargs["byte_range"], (SuffixByteRequest, RangeByteRequest)) +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_sharding_partial_shard_read__index_load_fails( + store: Store, index_location: ShardingCodecIndexLocation +) -> None: + """Test fill value is returned when the call to the store to load the bytes of the shard's chunk 
index fails.""" + array_shape = (16,) + shard_shape = (16,) + chunk_shape = (8,) + data = np.arange(np.prod(array_shape), dtype="float32").reshape(array_shape) + fill_value = -999 + + store_mock = AsyncMock(wraps=store, spec=store.__class__) + # loading the index is the first call to .get() so returning None will simulate an index load failure + store_mock.get.return_value = None + + a = zarr.create_array( + StorePath(store_mock), + shape=data.shape, + chunks=chunk_shape, + shards={"shape": shard_shape, "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + dtype=data.dtype, + fill_value=fill_value, + ) + a[:] = data + + # Read from one of two chunks in a shard to test the partial shard read path + assert a[0] == fill_value + assert a[0] != data[0] + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_sharding_partial_shard_read__index_chunk_slice_fails( + store: Store, + index_location: ShardingCodecIndexLocation, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Test fill value is returned when looking up a chunk's byte slice within a shard fails.""" + array_shape = (16,) + shard_shape = (16,) + chunk_shape = (8,) + data = np.arange(np.prod(array_shape), dtype="float32").reshape(array_shape) + fill_value = -999 + + monkeypatch.setattr( + "zarr.codecs.sharding._ShardIndex.get_chunk_slice", + lambda self, chunk_coords: None, + ) + + a = zarr.create_array( + StorePath(store), + shape=data.shape, + chunks=chunk_shape, + shards={"shape": shard_shape, "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + dtype=data.dtype, + fill_value=fill_value, + ) + a[:] = data + + # Read from one of two chunks in a shard to test the partial shard read path + assert a[0] == fill_value + assert a[0] != data[0] + + +@pytest.mark.parametrize("index_location", ["start", "end"]) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +def test_sharding_partial_shard_read__chunk_load_fails( + store: Store, index_location: ShardingCodecIndexLocation +) -> None: + """Test fill value is returned when the call to the store to load a chunk's bytes fails.""" + array_shape = (16,) + shard_shape = (16,) + chunk_shape = (8,) + data = np.arange(np.prod(array_shape), dtype="float32").reshape(array_shape) + fill_value = -999 + + store_mock = AsyncMock(wraps=store, spec=store.__class__) + + a = zarr.create_array( + StorePath(store_mock), + shape=data.shape, + chunks=chunk_shape, + shards={"shape": shard_shape, "index_location": index_location}, + compressors=BloscCodec(cname="lz4"), + dtype=data.dtype, + fill_value=fill_value, + ) + a[:] = data + + # Set up store mock after array creation to only modify calls during array indexing + # Succeed on first call (index load), fail on subsequent calls (chunk loads) + async def first_success_then_fail(*args: Any, **kwargs: Any) -> Any: + if store_mock.get.call_count == 1: + return await store.get(*args, **kwargs) + else: + return None + + store_mock.get.reset_mock() + store_mock.get.side_effect = first_success_then_fail + + # Read from one of two chunks in a shard to test the partial shard read path + assert a[0] == fill_value + assert a[0] != data[0] + + @pytest.mark.parametrize( "array_fixture", [ @@ -348,7 +462,9 @@ def test_sharding_multiple_chunks_partial_shard_read( @pytest.mark.parametrize("index_location", ["start", "end"]) @pytest.mark.parametrize("store", ["local", "memory", "zip"], 
indirect=["store"]) def test_sharding_partial_overwrite( - store: Store, array_fixture: npt.NDArray[Any], index_location: ShardingCodecIndexLocation + store: Store, + array_fixture: npt.NDArray[Any], + index_location: ShardingCodecIndexLocation, ) -> None: data = array_fixture[:10, :10, :10] spath = StorePath(store) @@ -578,7 +694,9 @@ async def test_sharding_with_empty_inner_chunk( ) @pytest.mark.parametrize("chunks_per_shard", [(5, 2), (2, 5), (5, 5)]) async def test_sharding_with_chunks_per_shard( - store: Store, index_location: ShardingCodecIndexLocation, chunks_per_shard: tuple[int] + store: Store, + index_location: ShardingCodecIndexLocation, + chunks_per_shard: tuple[int], ) -> None: chunk_shape = (2, 1) shape = tuple(x * y for x, y in zip(chunks_per_shard, chunk_shape, strict=False)) From 0dd4f9ad70ed2aac42467ce16cca2e0aa47a645a Mon Sep 17 00:00:00 2001 From: Alden Keefe Sampson Date: Fri, 6 Jun 2025 16:34:21 -0400 Subject: [PATCH 18/18] Use range of integers as out_selection not slice in CoordinateIndexer To fix issue when using vindex with repeated indexes in indexer --- src/zarr/core/indexing.py | 2 +- tests/test_properties.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index c11889f7f4..0e0bb664d8 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -1193,7 +1193,7 @@ def __iter__(self) -> Iterator[ChunkProjection]: stop = self.chunk_nitems_cumsum[chunk_rix] out_selection: slice | npt.NDArray[np.intp] if self.sel_sort is None: - out_selection = slice(start, stop) + out_selection = np.arange(start, stop) else: out_selection = self.sel_sort[start:stop] diff --git a/tests/test_properties.py b/tests/test_properties.py index d48dfe2fef..0af632e026 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -152,14 +152,6 @@ def test_vindex(data: st.DataObject) -> None: actual = zarray.vindex[indexer] assert_array_equal(nparray[indexer], actual) - # FIXME! - # when the indexer is such that a value gets overwritten multiple times, - # I think the output depends on chunking. - # new_data = data.draw(npst.arrays(shape=st.just(actual.shape), dtype=nparray.dtype)) - # nparray[indexer] = new_data - # zarray.vindex[indexer] = new_data - # assert_array_equal(nparray, zarray[:]) - @given(store=stores, meta=array_metadata()) # type: ignore[misc] async def test_roundtrip_array_metadata_from_store(