|
1 | 1 | # typing.Self and "|" union syntax don't exist in Python 3.9 |
2 | 2 | from __future__ import annotations |
3 | 3 |
|
4 | | -from collections.abc import Sequence |
| 4 | +from pathlib import Path |
5 | 5 |
|
6 | 6 | import pandas as pd |
7 | 7 | import pyarrow as pa |
@@ -33,10 +33,11 @@ def read_parquet( |
33 | 33 |
|
34 | 34 | Parameters |
35 | 35 | ---------- |
36 | | - data: str, Upath, or file-like object |
| 36 | + data: str, list of str, Path, UPath, or file-like object |
37 | 37 | Path to the data or a file-like object. If a string is passed, it can be a single file name, |
38 | 38 | directory name, or a remote path (e.g., HTTP/HTTPS or S3). If a file-like object is passed, |
39 | | - it must support the `read` method. |
| 39 | + it must support the `read` method. You can also pass the `filesystem` argument with |
| 40 | + a `pyarrow.fs` object, which will be passed to `pyarrow.parquet.read_table()`. |
40 | 41 | columns : list, default=None |
41 | 42 | If not None, only these columns will be read from the file. |
42 | 43 | reject_nesting: list or str, default=None |
@@ -93,26 +94,13 @@ def read_parquet( |
93 | 94 | reject_nesting = [reject_nesting] |
94 | 95 |
|
95 | 96 | # First load through pyarrow |
96 | | - # Check if `data` is a file-like object or a sequence |
97 | | - if hasattr(data, "read") or ( |
98 | | - isinstance(data, Sequence) and not isinstance(data, str | bytes | bytearray) |
99 | | - ): |
100 | | - # If `data` is a file-like object or a sequence, pass it directly to pyarrow |
| 97 | + # If `filesystem` is specified - use it |
| 98 | + if kwargs.get("filesystem") is not None: |
101 | 99 | table = pq.read_table(data, columns=columns, **kwargs) |
| 100 | + # Otherwise derive the source path and filesystem from `data` |
102 | 101 | else: |
103 | | - # Try creating pyarrow-native filesystem |
104 | | - try: |
105 | | - fs, path = pa.fs.FileSystem.from_uri(data) |
106 | | - except (TypeError, pa.ArrowInvalid): |
107 | | - # Otherwise, treat `data` as an URI for fsspec-supported silesystem and use UPath |
108 | | - upath = UPath(data) |
109 | | - # Use smaller block size for better performance |
110 | | - if upath.protocol in ("http", "https"): |
111 | | - upath = UPath(upath, block_size=FSSPEC_BLOCK_SIZE) |
112 | | - path = upath.path |
113 | | - fs = upath.fs |
114 | | - filesystem = kwargs.pop("filesystem", fs) |
115 | | - table = pq.read_table(path, columns=columns, filesystem=filesystem, **kwargs) |
| 102 | + data, filesystem = _transform_read_parquet_data_arg(data) |
| 103 | + table = pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs) |
116 | 104 |
|
117 | 105 | # Resolve partial loading of nested structures |
118 | 106 | # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux") |
@@ -172,6 +160,56 @@ def read_parquet( |
172 | 160 | return from_pyarrow(table, reject_nesting=reject_nesting, autocast_list=autocast_list) |
173 | 161 |
|
174 | 162 |
|
| 163 | +def _transform_read_parquet_data_arg(data): |
| 164 | + """Transform `data` argument of read_parquet to pq.read_table's `source` and `filesystem`""" |
| 165 | + # Check if a list, run the function recursively and check that filesystems are all the same |
| 166 | + if isinstance(data, list): |
| 167 | + paths = [] |
| 168 | + first_fs = None |
| 169 | + for i, d in enumerate(data): |
| 170 | + path, fs = _transform_read_parquet_data_arg(d) |
| 171 | + paths.append(path) |
| 172 | + if i == 0: |
| 173 | + first_fs = fs |
| 174 | + elif fs != first_fs: |
| 175 | + raise ValueError( |
| 176 | + f"All filesystems in the list should be the same, first fs: {first_fs}, {i + 1} fs: {fs}" |
| 177 | + ) |
| 178 | + return paths, first_fs |
| 179 | + # Check if a file-like object |
| 180 | + if hasattr(data, "read"): |
| 181 | + return data, None |
| 182 | + # Check if `data` is a UPath first and use its path and filesystem; |
| 183 | + # a plain Path is handled right after and is returned as is, with no filesystem |
| 184 | + if isinstance(data, UPath): |
| 185 | + return data.path, data.fs |
| 186 | + if isinstance(data, Path): |
| 187 | + return data, None |
| 188 | + # It should be a string now |
| 189 | + if not isinstance(data, str): |
| 190 | + raise TypeError("data must be a file-like object, Path, UPath, list, or str") |
| 191 | + |
| 192 | + # Try creating pyarrow-native filesystem assuming that `data` is a URI |
| 193 | + try: |
| 194 | + fs, path = pa.fs.FileSystem.from_uri(data) |
| 195 | + # If the conversion failed, continue |
| 196 | + except (TypeError, pa.ArrowInvalid): |
| 197 | + pass |
| 198 | + # If the conversion succeeded, use the pyarrow filesystem |
| 199 | + else: |
| 200 | + return path, fs |
| 201 | + |
| 202 | + # Otherwise, treat `data` as a URI or a local path |
| 203 | + upath = UPath(data) |
| 204 | + # If it is a local path, use pyarrow's filesystem |
| 205 | + if upath.protocol == "": |
| 206 | + return upath.path, None |
| 207 | + # If HTTP, change the default UPath object to use a smaller block size |
| 208 | + if upath.protocol in ("http", "https"): |
| 209 | + upath = UPath(upath, block_size=FSSPEC_BLOCK_SIZE) |
| 210 | + return upath.path, upath.fs |
| 211 | + |
| 212 | + |
175 | 213 | def from_pyarrow( |
176 | 214 | table: pa.Table, |
177 | 215 | reject_nesting: list[str] | str | None = None, |
|
0 commit comments