@@ -89,7 +89,7 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None]
                pbar.update(len(value))
            pbar.close()
        except JsException:
-            raise Exception(f"Failed to read dataset at {url}") from None
+            raise Exception(f"Failed to read dataset at '{url}'.") from None
    else:
        import requests  # pyright: ignore
        from requests.exceptions import ConnectionError  # pyright: ignore
@@ -99,7 +99,7 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None]
            # If requests.get fails, it will return readable error
            if response.status_code >= 400:
                raise Exception(
-                    f"received status code {response.status_code} from {url}"
+                    f"received status code {response.status_code} from '{url}'."
                )
            pbar = tqdm(
                miniters=1,
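
For context, the non-Pyodide branch of _get_chunks shown above streams the response with requests and drives a tqdm progress bar. A minimal standalone sketch of that pattern, assuming a placeholder URL, output filename, and chunk size (none of these values come from the package):

import requests
from tqdm import tqdm

url = "https://example.com/data.bin"   # placeholder
chunk_size = 1 << 17                   # placeholder

with requests.get(url, stream=True) as response:
    if response.status_code >= 400:
        raise Exception(f"received status code {response.status_code} from '{url}'.")
    pbar = tqdm(
        miniters=1,
        total=int(response.headers.get("content-length", 0)),
        desc="Downloading",
    )
    with open("data.bin", "wb") as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
            f.write(chunk)            # write each streamed chunk to disk
            pbar.update(len(chunk))   # advance the progress bar by bytes received
    pbar.close()
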
@@ -111,28 +111,36 @@ async def _get_chunks(url: str, chunk_size: int) -> Generator[bytes, None, None]
                pbar.update(len(chunk))
            pbar.close()
        except ConnectionError:
-            raise Exception(f"Failed to read dataset at {url}") from None
+            raise Exception(f"Failed to read dataset at '{url}'.") from None
+
+
+def _rmrf(path: Path) -> None:
+    if path.is_dir():
+        shutil.rmtree(path)
+    else:
+        path.unlink()


def _verify_files_dont_exist(
-    paths: Iterable[Union[str, Path]], remove_if_exist: bool = False
+    paths: Iterable[Path], remove_if_exist: bool = False
) -> None:
    """
    Verifies all paths in 'paths' don't exist.
-    :param paths: A iterable of strs or pathlib.Paths.
-    :param remove_if_exist=False: Removes file at path if they already exist.
+    :param paths: An iterable of pathlib.Paths.
+    :param remove_if_exist=False: Remove the file at each path if it already exists.
    :returns: None
-    :raises FileExistsError: On the first path found that already exists.
+    :raises FileExistsError: On the first path found that already exists, if remove_if_exist is False.
    """
    for path in paths:
-        path = Path(path)
-        if path.exists():
+        # Could be a broken symlink => path.exists() is False
+        if path.exists() or path.is_symlink():
            if remove_if_exist:
-                if path.is_symlink():
-                    realpath = path.resolve()
-                    path.unlink(realpath)
-                else:
-                    shutil.rmtree(path)
+                while path.is_symlink():
+                    temp = path.readlink()
+                    path.unlink(missing_ok=True)
+                    path = temp
+                if path.exists():
+                    _rmrf(path)
            else:
                raise FileExistsError(f"Error: File '{path}' already exists.")

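
The overwrite branch now walks a chain of symlinks with readlink()/unlink() before removing whatever the chain ends at. A minimal standalone sketch of that behaviour on a throwaway directory; the file and link names are illustrative only, and the final unlink() stands in for _rmrf, which also handles directories:

import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
target = tmp / "data.csv"
target.write_text("a,b\n1,2\n")
(tmp / "link2").symlink_to(target)         # link2 -> data.csv (absolute target)
(tmp / "link1").symlink_to(tmp / "link2")  # link1 -> link2 -> data.csv

path = tmp / "link1"
while path.is_symlink():           # follow the chain one hop at a time
    temp = path.readlink()         # Path.readlink() requires Python 3.9+
    path.unlink(missing_ok=True)   # remove the link itself
    path = temp
if path.exists():
    path.unlink()                  # the real file; _rmrf would also cover directories

print(sorted(p.name for p in tmp.iterdir()))  # [] -- links and target are gone
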
@@ -224,14 +232,13 @@ async def prepare(
    path = Path.cwd() if path is None else Path(path)
    # Check if path contains /tmp
    if Path("/tmp") in path.parents:
-        raise ValueError("path must not be in /tmp")
+        raise ValueError("path must not be in /tmp.")
    elif path.is_file():
-        raise ValueError("Datasets must be prepared to directories, not files")
+        raise ValueError("Datasets must be prepared to directories, not files.")
    # Create the target path if it doesn't exist yet
    path.mkdir(exist_ok=True)

    # For avoiding collisions with any other files the user may have downloaded to /tmp/
-
    dname = f"skills-network-{hash(url)}"
    # The file to extract data to. If not jupyterlite, to be symlinked to as well
    extract_dir = path if _is_jupyterlite() else Path(f"/tmp/{dname}")
@@ -247,44 +254,52 @@ async def prepare(
        shutil.rmtree(extract_dir)
    extract_dir.mkdir()

-    if tarfile.is_tarfile(tmp_download_file):
-        with tarfile.open(tmp_download_file) as tf:
-            _verify_files_dont_exist(
-                [
-                    path / child.name
-                    for child in map(Path, tf.getnames())
-                    if len(child.parents) == 1 and _is_file_to_symlink(child)
-                ],
-                overwrite,
-            )  # Only check if top-level fileobject
-            pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers()))
-            pbar.set_description(f"Extracting {filename}")
-            for member in pbar:
-                tf.extract(member=member, path=extract_dir)
-        tmp_download_file.unlink()
-    elif zipfile.is_zipfile(tmp_download_file):
-        with zipfile.ZipFile(tmp_download_file) as zf:
-            _verify_files_dont_exist(
-                [
-                    path / child.name
-                    for child in map(Path, zf.namelist())
-                    if len(child.parents) == 1 and _is_file_to_symlink(child)
-                ],
-                overwrite,
-            )
-            pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist()))
-            pbar.set_description(f"Extracting {filename}")
-            for member in pbar:
-                zf.extract(member=member, path=extract_dir)
-        tmp_download_file.unlink()
-    else:
-        _verify_files_dont_exist([path / filename], overwrite)
-        shutil.move(tmp_download_file, extract_dir / filename)
+    try:
+        if tarfile.is_tarfile(tmp_download_file):
+            with tarfile.open(tmp_download_file) as tf:
+                _verify_files_dont_exist(
+                    [
+                        path / child.name
+                        for child in map(Path, tf.getnames())
+                        if len(child.parents) == 1 and _is_file_to_symlink(child)
+                    ],  # Only check if top-level fileobject
+                    remove_if_exist=overwrite,
+                )
+                pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers()))
+                pbar.set_description(f"Extracting {filename}")
+                for member in pbar:
+                    tf.extract(member=member, path=extract_dir)
+            tmp_download_file.unlink()
+        elif zipfile.is_zipfile(tmp_download_file):
+            with zipfile.ZipFile(tmp_download_file) as zf:
+                _verify_files_dont_exist(
+                    [
+                        path / child.name
+                        for child in map(Path, zf.namelist())
+                        if len(child.parents) == 1 and _is_file_to_symlink(child)
+                    ],  # Only check if top-level fileobject
+                    remove_if_exist=overwrite,
+                )
+                pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist()))
+                pbar.set_description(f"Extracting {filename}")
+                for member in pbar:
+                    zf.extract(member=member, path=extract_dir)
+            tmp_download_file.unlink()
+        else:
+            _verify_files_dont_exist([path / filename], remove_if_exist=overwrite)
+            shutil.move(tmp_download_file, extract_dir / filename)
+    except FileExistsError as e:
+        raise FileExistsError(
+            str(e)
+            + "\nIf you want to overwrite any existing files, use prepare(..., overwrite=True)."
+        ) from None

    # If in jupyterlite environment, the extract_dir = path, so the files are already there.
    if not _is_jupyterlite():
        # If not in jupyterlite environment, symlink top-level file objects in extract_dir
        for child in filter(_is_file_to_symlink, extract_dir.iterdir()):
+            if (path / child.name).is_symlink() and overwrite:
+                (path / child.name).unlink()
            (path / child.name).symlink_to(child, target_is_directory=child.is_dir())

    if verbose:
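
With the wrapped FileExistsError now pointing at the overwrite flag, a re-run in a notebook would look roughly like this. This is a hedged sketch: the module path, URL, and target directory are placeholders, and prepare is assumed to be exposed as the package's public coroutine.

import skillsnetwork  # assumed public entry point for prepare()

# Placeholder URL and directory; top-level await works in IPython/Jupyter cells.
await skillsnetwork.prepare(
    "https://example.com/datasets/sample.tar.gz",
    path="./my_data",
    overwrite=True,  # replace files/symlinks left behind by a previous run
)
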
@@ -295,29 +310,6 @@ def setup() -> None:
    if _is_jupyterlite():
        tqdm.monitor_interval = 0

-    try:
-        import sys  # pyright: ignore
-
-        ipython = get_ipython()
-
-        def hide_traceback(
-            exc_tuple=None,
-            filename=None,
-            tb_offset=None,
-            exception_only=False,
-            running_compiled_code=False,
-        ):
-            etype, value, tb = sys.exc_info()
-            value.__cause__ = None  # suppress chained exceptions
-            return ipython._showtraceback(
-                etype, value, ipython.InteractiveTB.get_exception_only(etype, value)
-            )
-
-        ipython.showtraceback = hide_traceback
-
-    except NameError:
-        pass
-

setup()