Skip to content

Commit 259a8e2

Browse files
Backport PR #3484: Further .mtx reading improvements (#3540)
Co-authored-by: Philipp A <[email protected]>
1 parent 3b4978d commit 259a8e2

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

src/scanpy/datasets/_ebi_expression_atlas.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,17 @@
1111
from scipy import sparse
1212

1313
from .. import logging as logg
14+
from .._compat import add_note
1415
from .._settings import settings
1516
from .._utils._doctests import doctest_internet
1617
from ..readwrite import _download
1718
from ._utils import check_datasetdir_exists
1819

1920
if TYPE_CHECKING:
20-
from typing import BinaryIO
21+
from pandas._typing import ReadCsvBuffer
22+
23+
24+
CHUNK_SIZE = int(1e7)
2125

2226

2327
def _filter_boring(dataframe: pd.DataFrame) -> pd.DataFrame:
@@ -33,7 +37,7 @@ def sniff_url(accession: str):
3337
with urlopen(base_url): # Check if server up/ dataset exists
3438
pass
3539
except HTTPError as e:
36-
e.msg = f"{e.msg} ({base_url})" # Report failed url
40+
add_note(e, base_url)
3741
raise
3842

3943

@@ -58,31 +62,35 @@ def download_experiment(accession: str):
5862
)
5963

6064

def read_mtx_from_stream(
    stream: "ReadCsvBuffer[bytes]", *, chunk_size: "int | None" = None
) -> sparse.csr_matrix:
    """Parse a MatrixMarket coordinate matrix from a binary stream.

    The result is transposed relative to the file: an entry written as
    ``row col value`` lands at ``(col - 1, row - 1)``, so a size line of
    ``n m e`` yields an ``(m, n)`` CSR matrix with ``e`` stored entries.

    Parameters
    ----------
    stream
        Binary buffer positioned at the start of the ``.mtx`` content
        (``%``-prefixed comment lines, then the size line, then entries).
    chunk_size
        Number of entry lines read per :func:`pandas.read_csv` chunk.
        Defaults to the module-level ``CHUNK_SIZE``; keyword-only, so
        existing callers are unaffected.

    Returns
    -------
    The matrix as a :class:`scipy.sparse.csr_matrix` of float32 data.
    """
    # Skip the "%%MatrixMarket ..." banner and any further "%" comments.
    curline = stream.readline()
    while curline.startswith(b"%"):
        curline = stream.readline()
    # Size line "n m e". split() (rather than split(b" ") on a [:-1]
    # slice) tolerates repeated whitespace and CRLF line endings.
    n, m, e = map(int, curline.split())

    # Zero stored entries: pd.read_csv would raise EmptyDataError on the
    # exhausted stream, so return the empty matrix directly.
    if e == 0:
        return sparse.csr_matrix((m, n), dtype=np.float32)

    dtype_data = np.float32
    max_int32 = np.iinfo(np.int32).max
    # Use 64-bit coordinates only when a dimension could overflow int32.
    dtype_coord = np.int64 if n > max_int32 or m > max_int32 else np.int32

    # Preallocate the full buffers once; each chunk is copied into its
    # slice, avoiding the quadratic cost of repeated np.append.
    data = np.empty(e, dtype=dtype_data)
    i = np.empty(e, dtype=dtype_coord)
    j = np.empty(e, dtype=dtype_coord)

    if chunk_size is None:
        chunk_size = CHUNK_SIZE  # module constant; monkeypatchable in tests
    start = 0
    with pd.read_csv(
        stream,
        sep=r"\s+",
        header=None,
        dtype={0: dtype_coord, 1: dtype_coord, 2: dtype_data},
        chunksize=chunk_size,
    ) as reader:
        chunk: pd.DataFrame
        for chunk in reader:
            n_rows = chunk.shape[0]  # named to avoid ambiguous `l` (E741)
            data[start : start + n_rows] = chunk[2]
            # MatrixMarket indices are 1-based; columns are swapped to
            # produce the transposed (m, n) orientation.
            i[start : start + n_rows] = chunk[1] - 1
            j[start : start + n_rows] = chunk[0] - 1
            start += n_rows
    return sparse.csr_matrix((data, (i, j)), shape=(m, n))

8795

8896
def read_expression_from_archive(archive: ZipFile) -> anndata.AnnData:

tests/test_datasets.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,14 @@ def test_pbmc3k_processed():
7878

7979

8080
@pytest.mark.internet
81-
def test_ebi_expression_atlas():
81+
def test_ebi_expression_atlas(monkeypatch: pytest.MonkeyPatch):
82+
from scanpy.datasets import _ebi_expression_atlas as ea_mod
83+
84+
# make sure we use chunks when testing.
85+
# This dataset has <8M entries, so 4M entries/chunk = 2 chunks
86+
assert hasattr(ea_mod, "CHUNK_SIZE")
87+
monkeypatch.setattr(ea_mod, "CHUNK_SIZE", int(4e6))
88+
8289
adata = sc.datasets.ebi_expression_atlas("E-MTAB-4888")
8390
# The shape changes sometimes
8491
assert 2261 <= adata.shape[0] <= 2315

0 commit comments

Comments
 (0)