Skip to content

Datasets map and select(range()) is giving dill error #5645

@Tanya-11

Description

@Tanya-11

Describe the bug

I'm using the Hugging Face Datasets library to load a dataset in Google Colab.

When I do,

data = train_dataset.select(range(10))

or

train_datasets = train_dataset.map(
process_data_to_model_inputs,
batched=True,
batch_size=batch_size,
remove_columns=["article", "abstract"],
)

I get the following error: module 'dill._dill' has no attribute 'log'
I've tried downgrading the dill version from latest to 0.2.8, but no luck.

Stack trace:


ModuleNotFoundError Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in _no_cache_fields(obj)
367 try:
--> 368 import transformers as tr
369

ModuleNotFoundError: No module named 'transformers'

During handling of the above exception, another exception occurred:

AttributeError Traceback (most recent call last)
17 frames
in
----> 1 test = train_dataset.select(range(10))

/usr/local/lib/python3.9/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
155 }
156 # apply actual function
--> 157 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
158 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
159 # re-apply format to the output

/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
155 if kwargs.get(fingerprint_name) is None:
156 kwargs_for_fingerprint["fingerprint_name"] = fingerprint_name
--> 157 kwargs[fingerprint_name] = update_fingerprint(
158 self._fingerprint, transform, kwargs_for_fingerprint
159 )

/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in update_fingerprint(fingerprint, transform, transform_args)
103 for key in sorted(transform_args):
104 hasher.update(key)
--> 105 hasher.update(transform_args[key])
106 return hasher.hexdigest()
107

/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in update(self, value)
55 def update(self, value):
56 self.m.update(f"=={type(value)}==".encode("utf8"))
---> 57 self.m.update(self.hash(value).encode("utf-8"))
58
59 def hexdigest(self):

/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in hash(cls, value)
51 return cls.dispatch[type(value)](cls, value)
52 else:
---> 53 return cls.hash_default(value)
54
55 def update(self, value):

/usr/local/lib/python3.9/dist-packages/datasets/fingerprint.py in hash_default(cls, value)
44 @classmethod
45 def hash_default(cls, value):
---> 46 return cls.hash_bytes(dumps(value))
47
48 @classmethod

/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in dumps(obj)
387 file = StringIO()
388 with _no_cache_fields(obj):
--> 389 dump(obj, file)
390 return file.getvalue()
391

/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in dump(obj, file)
359 def dump(obj, file):
360 """pickle an object to a file"""
--> 361 Pickler(file, recurse=True).dump(obj)
362 return
363

/usr/local/lib/python3.9/dist-packages/dill/_dill.py in dump(self, obj)
392 return
393
--> 394 def load_session(filename='/tmp/session.pkl', main=None):
395 """update the main module with the state from the session file"""
396 if main is None: main = _main_module

/usr/lib/python3.9/pickle.py in dump(self, obj)
485 if self.proto >= 4:
486 self.framer.start_framing()
--> 487 self.save(obj)
488 self.write(STOP)
489 self.framer.end_framing()

/usr/local/lib/python3.9/dist-packages/dill/_dill.py in save(self, obj, save_persistent_id)
386 pickler._byref = False # disable pickling by name reference
387 pickler._recurse = False # disable pickling recursion for globals
--> 388 pickler._session = True # is best indicator of when pickling a session
389 pickler.dump(main)
390 finally:

/usr/lib/python3.9/pickle.py in save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
562

/usr/local/lib/python3.9/dist-packages/dill/_dill.py in save_singleton(pickler, obj)

/usr/lib/python3.9/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, state_setter, obj)
689 write(NEWOBJ)
690 else:
--> 691 save(func)
692 save(args)
693 write(REDUCE)

/usr/local/lib/python3.9/dist-packages/dill/_dill.py in save(self, obj, save_persistent_id)
386 pickler._byref = False # disable pickling by name reference
387 pickler._recurse = False # disable pickling recursion for globals
--> 388 pickler._session = True # is best indicator of when pickling a session
389 pickler.dump(main)
390 finally:

/usr/lib/python3.9/pickle.py in save(self, obj, save_persistent_id)
558 f = self.dispatch.get(t)
559 if f is not None:
--> 560 f(self, obj) # Call unbound method with explicit self
561 return
562

/usr/local/lib/python3.9/dist-packages/datasets/utils/py_utils.py in save_function(pickler, obj)
583 dill._dill.log.info("# F1")
584 else:
--> 585 dill._dill.log.info("F2: %s" % obj)
586 name = getattr(obj, "__qualname__", getattr(obj, "__name__", None))
587 dill._dill.StockPickler.save_global(pickler, obj, name=name)

AttributeError: module 'dill._dill' has no attribute 'log'

Steps to reproduce the bug

After loading the dataset (e.g. https://huggingface.co/datasets/scientific_papers) in Google Colab,

do either

data = train_dataset.select(range(10))

or

train_datasets = train_dataset.map(
process_data_to_model_inputs,
batched=True,
batch_size=batch_size,
remove_columns=["article", "abstract"],
)

Expected behavior

The map and select functions should work.

Environment info

dataset: https://huggingface.co/datasets/scientific_papers
dill = 0.3.6
python= 3.9.16
transformer = 4.2.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions