-
Couldn't load subscription status.
- Fork 3k
Description
Describe the bug
I'm using Huggingface Datasets library to load the dataset in google colab
When I do,
data = train_dataset.select(range(10))
or
train_datasets = train_dataset.map(
process_data_to_model_inputs,
batched=True,
batch_size=batch_size,
remove_columns=["article", "abstract"],
)
I get the following error: module 'dill._dill' has no attribute 'log'
I've tried downgrading the dill version from latest to 0.2.8, but no luck.
Stack trace:
ModuleNotFoundError Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in _no_cache_fields(obj)
367 try:
--> 368 import transformers as tr
369ModuleNotFoundError: No module named 'transformers'
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
17 frames
in
----> 1 test = train_dataset.select(range(10))/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
155 }
156 # apply actual function
--> 157 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
158 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
159 # re-apply format to the output/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
155 if kwargs.get(fingerprint_name) is None:
156 kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
--> 157 kwargs[fingerprint_name] = update_fingerprint(
158 self._fingerprint, transform, kwargs_for_fingerprint
159 )/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in update_fingerprint(fingerprint, transform, transform_args)
103 for key in sorted(transform_args):
104 hasher.update(key)
--> 105 hasher.update(transform_args[key])
106 return hasher.hexdigest()
107/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in update(self, value)
55 def update(self, value):
56 self.m.update(f"=={type(value)}==".encode("utf8"))
---> 57 self.m.update(self.hash(value).encode("utf-8"))
58
59 def hexdigest(self):/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in hash(cls, value)
51 return cls.dispatch[type(value)](cls, value)
52 else:
---> 53 return cls.hash_default(value)
54
55 def update(self, value):/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in hash_default(cls, value)
44 @classmethod
45 def hash_default(cls, value):
---> 46 return cls.hash_bytes(dumps(value))
47
48 @classmethod/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in dumps(obj)
387 file = StringIO()
388 with _no_cache_fields(obj):
--> 389 dump(obj, file)
390 return file.getvalue()
391/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in dump(obj, file)
359 def dump(obj, file):
360 """pickle an object to a file"""
--> 361 Pickler(file, recurse=True).dump(obj)
362 return
363/usr/local/lib/python3.9/dist-packages/dill/_dill.py in dump(self, obj)
392 return
393
--> 394 def load_session(filename='/tmp/session.pkl', main=None):
395 """update the main module with the state from the session file"""
396 if main is None: main = _main_module/usr/lib/python3.9/pickle.py in dump(self, obj)
485 if self.proto >= 4:
486 self.framer.start_framing()
--> 487 self.save(obj)
488 self.write(STOP)
489 self.framer.end_framing()/usr/local/lib/python3.9/dist-packages/dill/_dill.py in save(self, obj, save_persistent_id)
386 pickler._byref = False # disable pickling by name reference
387 pickler._recurse = False # disable pickling recursion for globals
--> 388 pickler._session = True # is best indicator of when pickling a session
389 pickler.dump(main)
390 finally:/usr/lib/python3.9/pickle.py in save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
562/usr/local/lib/python3.9/dist-packages/dill/_dill.py in save_singleton(pickler, obj)
/usr/lib/python3.9/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
689 write(NEWOBJ)
690 else:
--> 691 save(func)
692 save(args)
693 write(REDUCE)/usr/local/lib/python3.9/dist-packages/dill/_dill.py in save(self, obj, save_persistent_id)
386 pickler._byref = False # disable pickling by name reference
387 pickler._recurse = False # disable pickling recursion for globals
--> 388 pickler._session = True # is best indicator of when pickling a session
389 pickler.dump(main)
390 finally:/usr/lib/python3.9/pickle.py in save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
562/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in save_function(pickler, obj)
583 dill._dill.log.info("# F1")
584 else:
--> 585 dill._dill.log.info("F2: %s" % obj)
586 name = getattr(obj, "qualname", getattr(obj, "name", None))
587 dill._dill.StockPickler.save_global(pickler, obj, name=name)AttributeError: module 'dill._dill' has no attribute 'log'
Steps to reproduce the bug
After loading the dataset (e.g. https://huggingface.co/datasets/scientific_papers) in Google Colab,
do either
data = train_dataset.select(range(10))
or
train_datasets = train_dataset.map(
process_data_to_model_inputs,
batched=True,
batch_size=batch_size,
remove_columns=["article", "abstract"],
)
Expected behavior
The map and select functions should work
Environment info
dataset: https://huggingface.co/datasets/scientific_papers
dill = 0.3.6
python= 3.9.16
transformers = 4.2.0