Skip to content

Commit 259a8e2

Browse files
Backport PR #3484: Further .mtx reading improvements (#3540)
Co-authored-by: Philipp A <[email protected]>
1 parent 3b4978d commit 259a8e2

File tree

2 files changed

+34
-19
lines changed

2 files changed

+34
-19
lines changed

src/scanpy/datasets/_ebi_expression_atlas.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,17 @@
1111
from scipy import sparse
1212

1313
from .. import logging as logg
14+
from .._compat import add_note
1415
from .._settings import settings
1516
from .._utils._doctests import doctest_internet
1617
from ..readwrite import _download
1718
from ._utils import check_datasetdir_exists
1819

1920
if TYPE_CHECKING:
20-
from typing import BinaryIO
21+
from pandas._typing import ReadCsvBuffer
22+
23+
24+
CHUNK_SIZE = int(1e7)
2125

2226

2327
def _filter_boring(dataframe: pd.DataFrame) -> pd.DataFrame:
@@ -33,7 +37,7 @@ def sniff_url(accession: str):
3337
with urlopen(base_url): # Check if server up/ dataset exists
3438
pass
3539
except HTTPError as e:
36-
e.msg = f"{e.msg} ({base_url})" # Report failed url
40+
add_note(e, base_url)
3741
raise
3842

3943

@@ -58,31 +62,35 @@ def download_experiment(accession: str):
5862
)
5963

6064

def read_mtx_from_stream(
    stream: "ReadCsvBuffer[bytes]", *, chunk_size: "int | None" = None
) -> sparse.csr_matrix:
    """Parse a MatrixMarket coordinate matrix from a binary stream.

    The result is transposed relative to the file: an entry written as
    ``row col value`` lands at ``(col - 1, row - 1)``, so a size line of
    ``n m e`` yields an ``(m, n)`` CSR matrix with ``e`` stored entries.

    Parameters
    ----------
    stream
        Binary buffer positioned at the start of the ``.mtx`` content
        (``%``-prefixed comment lines, then the size line, then entries).
    chunk_size
        Number of entry lines read per :func:`pandas.read_csv` chunk.
        Defaults to the module-level ``CHUNK_SIZE``; keyword-only, so
        existing callers are unaffected.

    Returns
    -------
    The matrix as a :class:`scipy.sparse.csr_matrix` of float32 data.
    """
    # Skip the "%%MatrixMarket ..." banner and any further "%" comments.
    curline = stream.readline()
    while curline.startswith(b"%"):
        curline = stream.readline()
    # Size line "n m e". split() (rather than split(b" ") on a [:-1]
    # slice) tolerates repeated whitespace and CRLF line endings.
    n, m, e = map(int, curline.split())

    # Zero stored entries: pd.read_csv would raise EmptyDataError on the
    # exhausted stream, so return the empty matrix directly.
    if e == 0:
        return sparse.csr_matrix((m, n), dtype=np.float32)

    dtype_data = np.float32
    max_int32 = np.iinfo(np.int32).max
    # Use 64-bit coordinates only when a dimension could overflow int32.
    dtype_coord = np.int64 if n > max_int32 or m > max_int32 else np.int32

    # Preallocate the full buffers once; each chunk is copied into its
    # slice, avoiding the quadratic cost of repeated np.append.
    data = np.empty(e, dtype=dtype_data)
    i = np.empty(e, dtype=dtype_coord)
    j = np.empty(e, dtype=dtype_coord)

    if chunk_size is None:
        chunk_size = CHUNK_SIZE  # module constant; monkeypatchable in tests
    start = 0
    with pd.read_csv(
        stream,
        sep=r"\s+",
        header=None,
        dtype={0: dtype_coord, 1: dtype_coord, 2: dtype_data},
        chunksize=chunk_size,
    ) as reader:
        chunk: pd.DataFrame
        for chunk in reader:
            n_rows = chunk.shape[0]  # named to avoid ambiguous `l` (E741)
            data[start : start + n_rows] = chunk[2]
            # MatrixMarket indices are 1-based; columns are swapped to
            # produce the transposed (m, n) orientation.
            i[start : start + n_rows] = chunk[1] - 1
            j[start : start + n_rows] = chunk[0] - 1
            start += n_rows
    return sparse.csr_matrix((data, (i, j)), shape=(m, n))

8795

8896
def read_expression_from_archive(archive: ZipFile) -> anndata.AnnData:

tests/test_datasets.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,14 @@ def test_pbmc3k_processed():
7878

7979

8080
@pytest.mark.internet
81-
def test_ebi_expression_atlas():
81+
def test_ebi_expression_atlas(monkeypatch: pytest.MonkeyPatch):
82+
from scanpy.datasets import _ebi_expression_atlas as ea_mod
83+
84+
# make sure we use chunks when testing.
85+
# This dataset has <8M entries, so 4M entries/chunk = 2 chunks
86+
assert hasattr(ea_mod, "CHUNK_SIZE")
87+
monkeypatch.setattr(ea_mod, "CHUNK_SIZE", int(4e6))
88+
8289
adata = sc.datasets.ebi_expression_atlas("E-MTAB-4888")
8390
# The shape changes sometimes
8491
assert 2261 <= adata.shape[0] <= 2315

0 commit comments

Comments
 (0)