 from scipy import sparse

 from .. import logging as logg
+from .._compat import add_note
 from .._settings import settings
 from .._utils._doctests import doctest_internet
 from ..readwrite import _download
 from ._utils import check_datasetdir_exists

 if TYPE_CHECKING:
-    from typing import BinaryIO
+    from pandas._typing import ReadCsvBuffer
+
+
+CHUNK_SIZE = int(1e7)


 def _filter_boring(dataframe: pd.DataFrame) -> pd.DataFrame:
@@ -33,7 +37,7 @@ def sniff_url(accession: str):
         with urlopen(base_url):  # Check if server up/ dataset exists
             pass
     except HTTPError as e:
-        e.msg = f"{e.msg} ({base_url})"  # Report failed url
+        add_note(e, base_url)
         raise

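The hunk above swaps the in-place mutation of e.msg for the add_note helper imported from .._compat at the top of the file. A minimal sketch of what such a compat helper could look like, assuming it wraps PEP 678 exception notes (BaseException.add_note, Python 3.11+) with a fallback for older interpreters; the actual helper in .._compat may differ:

import sys


def add_note(err: BaseException, msg: str) -> None:
    """Attach a note to an exception so it is shown alongside the traceback."""
    if sys.version_info >= (3, 11):
        err.add_note(msg)  # PEP 678 API, available since Python 3.11
    else:
        # Emulate the 3.11 behaviour by appending to __notes__ directly.
        if not hasattr(err, "__notes__"):
            err.__notes__ = []
        err.__notes__.append(msg)

On 3.11+ the failing URL then appears as a note beneath the HTTPError traceback instead of being spliced into e.msg.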
@@ -58,31 +62,35 @@ def download_experiment(accession: str):
     )


-def read_mtx_from_stream(stream: BinaryIO) -> sparse.csr_matrix:
+def read_mtx_from_stream(stream: ReadCsvBuffer[bytes]) -> sparse.csr_matrix:
     curline = stream.readline()
     while curline.startswith(b"%"):
         curline = stream.readline()
-    n, m, _ = (int(x) for x in curline[:-1].split(b" "))
+    n, m, e = map(int, curline[:-1].split(b" "))

+    dtype_data = np.float32
     max_int32 = np.iinfo(np.int32).max
-    coord_dtype = np.int64 if n > max_int32 or m > max_int32 else np.int32
+    dtype_coord = np.int64 if n > max_int32 or m > max_int32 else np.int32

-    chunks = pd.read_csv(
+    data = np.ndarray((e,), dtype=dtype_data)
+    i = np.ndarray((e,), dtype=dtype_coord)
+    j = np.ndarray((e,), dtype=dtype_coord)
+    start = 0
+    with pd.read_csv(
         stream,
         sep=r"\s+",
         header=None,
-        dtype={0: coord_dtype, 1: coord_dtype, 2: np.float32},
-        chunksize=1e7,
-    )
-    data = np.array([], dtype=np.float64)
-    i = np.array([], dtype=int)
-    j = np.array([], dtype=int)
-    for chunk in chunks:
-        data = np.append(data, chunk[2])
-        i = np.append(i, chunk[1] - 1)
-        j = np.append(j, chunk[0] - 1)
-    mtx = sparse.csr_matrix((data, (i, j)), shape=(m, n))
-    return mtx
+        dtype={0: dtype_coord, 1: dtype_coord, 2: dtype_data},
+        chunksize=CHUNK_SIZE,
+    ) as reader:
+        chunk: pd.DataFrame
+        for chunk in reader:
+            l = chunk.shape[0]
+            data[start : start + l] = chunk[2]
+            i[start : start + l] = chunk[1] - 1
+            j[start : start + l] = chunk[0] - 1
+            start += l
+    return sparse.csr_matrix((data, (i, j)), shape=(m, n))


 def read_expression_from_archive(archive: ZipFile) -> anndata.AnnData:
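The rewritten read_mtx_from_stream preallocates the value and coordinate buffers from the entry count e in the MatrixMarket size line and writes each read_csv chunk into its slice, instead of growing three arrays with np.append per chunk, which reallocates and copies everything already read on every iteration. A self-contained sketch of the same pattern on a toy MatrixMarket stream; the data and sizes here are made up for illustration and are not part of the PR:

import io

import numpy as np
import pandas as pd
from scipy import sparse

# Toy MatrixMarket stream: a 3 x 2 matrix with e = 2 stored entries.
mtx = io.BytesIO(
    b"%%MatrixMarket matrix coordinate real general\n"
    b"3 2 2\n"
    b"1 1 5.0\n"
    b"3 2 1.5\n"
)

# Skip comment lines, then parse the size line: n rows, m columns, e entries.
line = mtx.readline()
while line.startswith(b"%"):
    line = mtx.readline()
n, m, e = map(int, line.split())

# Preallocate once; each chunk is written into its slice of these buffers.
data = np.empty(e, dtype=np.float32)
i = np.empty(e, dtype=np.int32)
j = np.empty(e, dtype=np.int32)
start = 0
with pd.read_csv(mtx, sep=r"\s+", header=None, chunksize=1) as reader:
    for chunk in reader:
        stop = start + len(chunk)
        data[start:stop] = chunk[2]
        i[start:stop] = chunk[1] - 1  # MatrixMarket coordinates are 1-based
        j[start:stop] = chunk[0] - 1
        start = stop

# The row/column coordinates are swapped, so the result is the transpose
# of the stored matrix, matching shape=(m, n) in the patched function.
mat = sparse.csr_matrix((data, (i, j)), shape=(m, n))
assert mat.shape == (2, 3) and mat.nnz == 2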