From ab13325d1c42a186d987c2d424a2412fbd2b151c Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 11:43:23 -0300 Subject: [PATCH 01/30] code formatting changes --- dill/_dill.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 1c8af8dc..12daabe7 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -346,9 +346,11 @@ def _module_map(): return modmap SESSION_IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) -SESSION_IMPORTED_AS_MODULES = ('ctypes', 'typing', 'subprocess', 'threading', - r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?') -SESSION_IMPORTED_AS_MODULES = tuple(re.compile(x) for x in SESSION_IMPORTED_AS_MODULES) + +SESSION_IMPORTED_AS_MODULES = [re.compile(x) for x in ( + 'ctypes', 'typing', 'subprocess', 'threading', + r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' +)] def _lookup_module(modmap, name, obj, main_module): """lookup name or id of obj if module is imported""" @@ -740,18 +742,18 @@ def load_module( if is_runtime_mod: pickle_main = pickle_main.partition('.')[-1] error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" + if main.__name__ != pickle_main: + raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) if is_runtime_mod and is_main_imported: raise ValueError( error_msg.format(" imported", "", "", "-type object") - % (main.__name__, pickle_main) + % (main.__name__, main.__name__) ) if not is_runtime_mod and not is_main_imported: raise ValueError( error_msg.format("", "-type object", " imported", "") - % (pickle_main, main.__name__) + % (main.__name__, main.__name__) ) - if main.__name__ != pickle_main: - raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) # This is for find_class() to be able to locate it. 
if not is_main_imported: From 33ca2ed80907a0fe6ead78855a12e36263ef8362 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 12:31:29 -0300 Subject: [PATCH 02/30] remove unnecessary '_main_modified' attribute from pickler --- dill/_dill.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 12daabe7..da64ac25 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -512,21 +512,22 @@ def dump_module( main = _import_module(main) if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) + original_main = main + if refimported: + main = _stash_modules(main) if hasattr(filename, 'write'): file = filename else: file = open(filename, 'wb') try: pickler = Pickler(file, protocol, **kwds) - pickler._original_main = main - if refimported: - main = _stash_modules(main) + if main is not original_main: + pickler._original_main = original_main pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True - pickler._main_modified = main is not pickler._original_main pickler.dump(main) finally: if file is not filename: # if newly opened file @@ -2317,8 +2318,7 @@ def save_function(pickler, obj): logger.trace(pickler, "F1: %s", obj) _recurse = getattr(pickler, '_recurse', None) _postproc = getattr(pickler, '_postproc', None) - _main_modified = getattr(pickler, '_main_modified', None) - _original_main = getattr(pickler, '_original_main', __builtin__)#'None' + _original_main = getattr(pickler, '_original_main', None) postproc_list = [] if _recurse: # recurse to get all globals referred to by obj @@ -2335,7 +2335,7 @@ def save_function(pickler, obj): # If the globals is the __dict__ from the module being saved as a # session, substitute it by the dictionary being actually 
saved. - if _main_modified and globs_copy is _original_main.__dict__: + if _original_main is not None and globs_copy is _original_main.__dict__: globs_copy = getattr(pickler, '_main', _original_main).__dict__ globs = globs_copy # If the globals is a module __dict__, do not save it in the pickle. From ad8db21fd674d06815237559e37bae3b91505694 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 13:03:32 -0300 Subject: [PATCH 03/30] new _open() function to handle file names and file-like objects --- dill/_dill.py | 64 +++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index da64ac25..6799125d 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -320,13 +320,23 @@ def loads(str, ignore=None, **kwds): ### End: Shorthands ### ### Pickle the Interpreter Session +import contextlib import pathlib import re import tempfile +from contextlib import suppress from types import SimpleNamespace TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) +def _open(file, mode): + """return a context manager with an opened file-like object""" + attr = 'write' if 'w' in mode else 'read' + if not hasattr(file, attr): + return open(file, mode) + else: + return contextlib.nullcontext(file) + def _module_map(): """get map of imported modules""" from collections import defaultdict @@ -515,11 +525,7 @@ def dump_module( original_main = main if refimported: main = _stash_modules(main) - if hasattr(filename, 'write'): - file = filename - else: - file = open(filename, 'wb') - try: + with _open(filename, 'wb') as file: pickler = Pickler(file, protocol, **kwds) if main is not original_main: pickler._original_main = original_main @@ -529,9 +535,6 @@ def dump_module( pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True pickler.dump(main) - finally: - if file is not filename: # if newly opened file - file.close() return # Backward compatibility. 
@@ -709,11 +712,7 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') main = module - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: + with _open(filename, 'rb') as file: file = _make_peekable(file) #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) @@ -756,19 +755,16 @@ def load_module( % (main.__name__, main.__name__) ) - # This is for find_class() to be able to locate it. - if not is_main_imported: - runtime_main = '__runtime__.%s' % main.__name__ - sys.modules[runtime_main] = main - - loaded = unpickler.load() - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() try: - del sys.modules[runtime_main] - except (KeyError, NameError): - pass + if not is_main_imported: + # This is for find_class() to be able to locate it. + runtime_main = '__runtime__.%s' % main.__name__ + sys.modules[runtime_main] = main + loaded = unpickler.load() + finally: + if not is_main_imported: + del sys.modules[runtime_main] + assert loaded is main _restore_modules(unpickler, main) if main is _main_module or main is module: @@ -839,11 +835,7 @@ def load_module_asdict( """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - if hasattr(filename, 'read'): - file = filename - else: - file = open(filename, 'rb') - try: + with _open(filename, 'rb') as file: file = _make_peekable(file) main_name = _identify_module(file) old_main = sys.modules.get(main_name) @@ -854,18 +846,14 @@ def load_module_asdict( main.__dict__.update(old_main.__dict__) else: main.__builtins__ = __builtin__ - sys.modules[main_name] = main - load_module(file, **kwds) - finally: - if not hasattr(filename, 'read'): # if newly opened file - file.close() try: + sys.modules[main_name] = main + load_module(file, **kwds) + finally: if old_main is None: del sys.modules[main_name] else: sys.modules[main_name] = old_main - except 
NameError: # failed before setting old_main - pass main.__session__ = str(filename) return main.__dict__ @@ -914,7 +902,7 @@ def __init__(self, file, *args, **kwds): self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse self._postproc = OrderedDict() - self._file = file + self._file = file # for the logger def dump(self, obj): #NOTE: if settings change, need to update attributes # register if the object is a numpy ufunc From da4cc072c0ef5d644acee636f8de4f7eb7ebf482 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 13:42:01 -0300 Subject: [PATCH 04/30] merge function _make_peekable() with _open() --- dill/_dill.py | 90 ++++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 6799125d..dd59e6f9 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -329,13 +329,53 @@ def loads(str, ignore=None, **kwds): TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) -def _open(file, mode): +class _PeekableReader: + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream): + self.stream = stream + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +def _open(file, mode, *, peekable=False): """return a context manager with an opened file-like object""" + import io attr = 'write' if 'w' in mode else 'read' - if not hasattr(file, attr): - return open(file, mode) - 
else: + was_open = hasattr(file, attr) + if not was_open: + file = open(file, mode) + if attr == 'read' and peekable and not hasattr(file, 'peek'): + # Try our best to return the stream as an object with a peek() method. + if hasattr(file, 'tell') and hasattr(file, 'seek'): + file = _PeekableReader(file) + else: + try: + file = io.BufferedReader(file) + except Exception: + # Stream won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file) + if was_open: # should not close at exit return contextlib.nullcontext(file) + elif type(file) == _PeekableReader: + return contextlib.closing(file) + else: + return file def _module_map(): """get map of imported modules""" @@ -543,42 +583,6 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** dump_module(filename, module=main, refimported=byref, **kwds) dump_session.__doc__ = dump_module.__doc__ -class _PeekableReader: - """lightweight stream wrapper that implements peek()""" - def __init__(self, stream): - self.stream = stream - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -def _make_peekable(stream): - """return stream as an object with a peek() method""" - import io - if hasattr(stream, 'peek'): - return stream - if not (hasattr(stream, 'tell') and hasattr(stream, 'seek')): - try: - return io.BufferedReader(stream) - except Exception: - pass - return _PeekableReader(stream) - def _identify_module(file, main=None): """identify the name of the module 
stored in the given file-type object""" from pickletools import genops @@ -712,8 +716,7 @@ def load_module( raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') main = module - with _open(filename, 'rb') as file: - file = _make_peekable(file) + with _open(filename, 'rb', peekable=True) as file: #FIXME: dill.settings are disabled unpickler = Unpickler(file, **kwds) unpickler._session = True @@ -835,8 +838,7 @@ def load_module_asdict( """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - with _open(filename, 'rb') as file: - file = _make_peekable(file) + with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) old_main = sys.modules.get(main_name) main = ModuleType(main_name) From 1732e3d217ebd40f4b7dd9a0577e34a24e38f7af Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 14:09:37 -0300 Subject: [PATCH 05/30] new function is_module_pickle() --- dill/__init__.py | 8 ++++---- dill/_dill.py | 32 +++++++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 028112dc..e45d5146 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -25,10 +25,10 @@ from ._dill import ( dump, dumps, load, loads, dump_module, load_module, load_module_asdict, - dump_session, load_session, Pickler, Unpickler, register, copy, pickle, - pickles, check, HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, PicklingError, - UnpicklingError, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, - PickleWarning, PicklingWarning, UnpicklingWarning, + dump_session, load_session, is_module_pickle, Pickler, Unpickler, register, + copy, pickle, pickles, check, HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, + PicklingError, UnpicklingError, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, + PickleError, PickleWarning, PicklingWarning, UnpicklingWarning, ) from . 
import source, temp, detect diff --git a/dill/_dill.py b/dill/_dill.py index dd59e6f9..017ca347 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -17,11 +17,11 @@ """ __all__ = [ 'dump', 'dumps', 'load', 'loads', 'dump_module', 'load_module', - 'load_module_asdict', 'dump_session', 'load_session', 'Pickler', 'Unpickler', - 'register', 'copy', 'pickle', 'pickles', 'check', 'HIGHEST_PROTOCOL', - 'DEFAULT_PROTOCOL', 'PicklingError', 'UnpicklingError', 'HANDLE_FMODE', - 'CONTENTS_FMODE', 'FILE_FMODE', 'PickleError', 'PickleWarning', - 'PicklingWarning', 'UnpicklingWarning' + 'load_module_asdict', 'dump_session', 'load_session', 'is_module_pickle', + 'Pickler', 'Unpickler', 'register', 'copy', 'pickle', 'pickles', 'check', + 'HIGHEST_PROTOCOL', 'DEFAULT_PROTOCOL', 'PicklingError', 'UnpicklingError', + 'HANDLE_FMODE', 'CONTENTS_FMODE', 'FILE_FMODE', 'PickleError', + 'PickleWarning', 'PicklingWarning', 'UnpicklingWarning', ] __module__ = 'dill' @@ -606,6 +606,28 @@ def _identify_module(file, main=None): return None raise UnpicklingError("unable to identify main module") from error +def is_module_pickle(filename, importable: bool = True) -> bool: + """Check if a file is a module state pickle file. + + Parameters: + filename: a path-like object or a readable stream. + importable: expected kind of the file's saved module. Use `True` for + importable modules (the default) or `False` for module-type objects. + + Returns: + `True` if the pickle file at ``filename`` was generated with + :py:func:`dump_module` **AND** the module whose state is saved in it is + of the kind specified by the ``importable`` argument. `False` otherwise. 
+ """ + with _open(filename, 'rb', peekable=True) as file: + try: + pickle_main = _identify_module(file) + except UnpicklingError: + return False + else: + is_runtime_mod = pickle_main.startswith('__runtime__.') + return importable ^ is_runtime_mod + def load_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, From 2fdd31d6d1bd855f54877cd5721a8659561ea8cb Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 16 Jul 2022 18:51:23 -0300 Subject: [PATCH 06/30] move session-related code to session.py submodule --- dill/__init__.py | 14 +- dill/_dill.py | 567 -------------------------------------------- dill/session.py | 593 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 601 insertions(+), 573 deletions(-) create mode 100644 dill/session.py diff --git a/dill/__init__.py b/dill/__init__.py index e45d5146..8f8429bd 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -24,13 +24,15 @@ from ._dill import ( - dump, dumps, load, loads, dump_module, load_module, load_module_asdict, - dump_session, load_session, is_module_pickle, Pickler, Unpickler, register, - copy, pickle, pickles, check, HIGHEST_PROTOCOL, DEFAULT_PROTOCOL, - PicklingError, UnpicklingError, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, - PickleError, PickleWarning, PicklingWarning, UnpicklingWarning, + Pickler, Unpickler, + check, copy, dump, dumps, load, loads, pickle, pickles, register, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, + PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, + UnpicklingWarning, ) -from . import source, temp, detect +from .session import dump_module, is_module_pickle, load_module, load_module_asdict +from .session import dump_session, load_session # backward compatibility +from . 
import detect, session, source, temp # get global settings from .settings import settings diff --git a/dill/_dill.py b/dill/_dill.py index 017ca347..aaa14101 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -30,8 +30,6 @@ from .logger import adapter as logger from .logger import trace as _trace -from typing import Optional, Union - import os import sys diff = None @@ -319,570 +317,6 @@ def loads(str, ignore=None, **kwds): ### End: Shorthands ### -### Pickle the Interpreter Session -import contextlib -import pathlib -import re -import tempfile -from contextlib import suppress -from types import SimpleNamespace - -TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) - -class _PeekableReader: - """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream): - self.stream = stream - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -def _open(file, mode, *, peekable=False): - """return a context manager with an opened file-like object""" - import io - attr = 'write' if 'w' in mode else 'read' - was_open = hasattr(file, attr) - if not was_open: - file = open(file, mode) - if attr == 'read' and peekable and not hasattr(file, 'peek'): - # Try our best to return the stream as an object with a peek() method. - if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file) - else: - try: - file = io.BufferedReader(file) - except Exception: - # Stream won't be peekable, but will fail gracefully in _identify_module(). 
- file = _PeekableReader(file) - if was_open: # should not close at exit - return contextlib.nullcontext(file) - elif type(file) == _PeekableReader: - return contextlib.closing(file) - else: - return file - -def _module_map(): - """get map of imported modules""" - from collections import defaultdict - modmap = SimpleNamespace( - by_name=defaultdict(list), - by_id=defaultdict(list), - top_level={}, - ) - for modname, module in sys.modules.items(): - if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): - continue - if '.' not in modname: - modmap.top_level[id(module)] = modname - for objname, modobj in module.__dict__.items(): - modmap.by_name[objname].append((modobj, modname)) - modmap.by_id[id(modobj)].append((modobj, objname, modname)) - return modmap - -SESSION_IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) - -SESSION_IMPORTED_AS_MODULES = [re.compile(x) for x in ( - 'ctypes', 'typing', 'subprocess', 'threading', - r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' 
-)] - -def _lookup_module(modmap, name, obj, main_module): - """lookup name or id of obj if module is imported""" - for modobj, modname in modmap.by_name[name]: - if modobj is obj and sys.modules[modname] is not main_module: - return modname, name - __module__ = getattr(obj, '__module__', None) - if isinstance(obj, SESSION_IMPORTED_AS_TYPES) or (__module__ is not None - and any(regex.fullmatch(__module__) for regex in SESSION_IMPORTED_AS_MODULES)): - for modobj, objname, modname in modmap.by_id[id(obj)]: - if sys.modules[modname] is not main_module: - return modname, objname - return None, None - -def _stash_modules(main_module): - modmap = _module_map() - newmod = ModuleType(main_module.__name__) - - imported = [] - imported_as = [] - imported_top_level = [] # keep separated for backward compatibility - original = {} - for name, obj in main_module.__dict__.items(): - if obj is main_module: - original[name] = newmod # self-reference - elif obj is main_module.__dict__: - original[name] = newmod.__dict__ - # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). 
- elif any(obj is singleton for singleton in (None, False, True)) \ - or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref - original[name] = obj - else: - source_module, objname = _lookup_module(modmap, name, obj, main_module) - if source_module is not None: - if objname == name: - imported.append((source_module, name)) - else: - imported_as.append((source_module, objname, name)) - else: - try: - imported_top_level.append((modmap.top_level[id(obj)], name)) - except KeyError: - original[name] = obj - - if len(original) < len(main_module.__dict__): - newmod.__dict__.update(original) - newmod.__dill_imported = imported - newmod.__dill_imported_as = imported_as - newmod.__dill_imported_top_level = imported_top_level - if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): - # Trick _is_imported_module() to force saving as an imported module. - newmod.__loader__ = True # will be discarded by save_module() - return newmod - else: - return main_module - -def _restore_modules(unpickler, main_module): - try: - for modname, name in main_module.__dict__.pop('__dill_imported'): - main_module.__dict__[name] = unpickler.find_class(modname, name) - for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): - main_module.__dict__[name] = unpickler.find_class(modname, objname) - for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): - main_module.__dict__[name] = __import__(modname) - except KeyError: - pass - -#NOTE: 06/03/15 renamed main_module to main -def dump_module( - filename = str(TEMPDIR/'session.pkl'), - module: Union[ModuleType, str] = None, - refimported: bool = False, - **kwds -) -> None: - """Pickle the current state of :py:mod:`__main__` or another module to a file. - - Save the contents of :py:mod:`__main__` (e.g. from an interactive - interpreter session), an imported module, or a module-type object (e.g. - built with :py:class:`~types.ModuleType`), to a file. 
The pickled - module can then be restored with the function :py:func:`load_module`. - - Parameters: - filename: a path-like object or a writable stream. - module: a module object or the name of an importable module. If `None` - (the default), :py:mod:`__main__` is saved. - refimported: if `True`, all objects identified as having been imported - into the module's namespace are saved by reference. *Note:* this is - similar but independent from ``dill.settings[`byref`]``, as - ``refimported`` refers to virtually all imported objects, while - ``byref`` only affects select objects. - **kwds: extra keyword arguments passed to :py:class:`Pickler()`. - - Raises: - :py:exc:`PicklingError`: if pickling fails. - - Examples: - - - Save current interpreter session state: - - >>> import dill - >>> squared = lambda x: x*x - >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl - - - Save the state of an imported/importable module: - - >>> import dill - >>> import pox - >>> pox.plus_one = lambda x: x+1 - >>> dill.dump_module('pox_session.pkl', module=pox) - - - Save the state of a non-importable, module-type object: - - >>> import dill - >>> from types import ModuleType - >>> foo = ModuleType('foo') - >>> foo.values = [1,2,3] - >>> import math - >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - - - Restore the state of the saved modules: - - >>> import dill - >>> dill.load_module() - >>> squared(2) - 4 - >>> pox = dill.load_module('pox_session.pkl') - >>> pox.plus_one(1) - 2 - >>> foo = dill.load_module('foo_session.pkl') - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - - *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to - ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to - ``module`` and ``refimported``, respectively. 
- - Note: - Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` - don't apply to this function.` - """ - for old_par, par in [('main', 'module'), ('byref', 'refimported')]: - if old_par in kwds: - message = "The argument %r has been renamed %r" % (old_par, par) - if old_par == 'byref': - message += " to distinguish it from dill.settings['byref']" - warnings.warn(message + ".", PendingDeprecationWarning) - if locals()[par]: # the defaults are None and False - raise TypeError("both %r and %r arguments were used" % (par, old_par)) - refimported = kwds.pop('byref', refimported) - module = kwds.pop('main', module) - - from .settings import settings - protocol = settings['protocol'] - main = module - if main is None: - main = _main_module - elif isinstance(main, str): - main = _import_module(main) - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" % main) - original_main = main - if refimported: - main = _stash_modules(main) - with _open(filename, 'wb') as file: - pickler = Pickler(file, protocol, **kwds) - if main is not original_main: - pickler._original_main = original_main - pickler._main = main #FIXME: dill.settings are disabled - pickler._byref = False # disable pickling by name reference - pickler._recurse = False # disable pickling recursion for globals - pickler._session = True # is best indicator of when pickling a session - pickler._first_pass = True - pickler.dump(main) - return - -# Backward compatibility. 
-def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): - warnings.warn("dump_session() has been renamed dump_module()", PendingDeprecationWarning) - dump_module(filename, module=main, refimported=byref, **kwds) -dump_session.__doc__ = dump_module.__doc__ - -def _identify_module(file, main=None): - """identify the name of the module stored in the given file-type object""" - from pickletools import genops - UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} - found_import = False - try: - for opcode, arg, pos in genops(file.peek(256)): - if not found_import: - if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ - arg.endswith('_import_module'): - found_import = True - else: - if opcode.name in UNICODE: - return arg - else: - raise UnpicklingError("reached STOP without finding main module") - except (NotImplementedError, ValueError) as error: - # ValueError occours when the end of the chunk is reached (without a STOP). - if isinstance(error, NotImplementedError) and main is not None: - # file is not peekable, but we have main. - return None - raise UnpicklingError("unable to identify main module") from error - -def is_module_pickle(filename, importable: bool = True) -> bool: - """Check if a file is a module state pickle file. - - Parameters: - filename: a path-like object or a readable stream. - importable: expected kind of the file's saved module. Use `True` for - importable modules (the default) or `False` for module-type objects. - - Returns: - `True` if the pickle file at ``filename`` was generated with - :py:func:`dump_module` **AND** the module whose state is saved in it is - of the kind specified by the ``importable`` argument. `False` otherwise. 
- """ - with _open(filename, 'rb', peekable=True) as file: - try: - pickle_main = _identify_module(file) - except UnpicklingError: - return False - else: - is_runtime_mod = pickle_main.startswith('__runtime__.') - return importable ^ is_runtime_mod - -def load_module( - filename = str(TEMPDIR/'session.pkl'), - module: Union[ModuleType, str] = None, - **kwds -) -> Optional[ModuleType]: - """Update the selected module (default is :py:mod:`__main__`) with - the state saved at ``filename``. - - Restore a module to the state saved with :py:func:`dump_module`. The - saved module can be :py:mod:`__main__` (e.g. an interpreter session), - an imported module, or a module-type object (e.g. created with - :py:class:`~types.ModuleType`). - - When restoring the state of a non-importable module-type object, the - current instance of this module may be passed as the argument ``main``. - Otherwise, a new instance is created with :py:class:`~types.ModuleType` - and returned. - - Parameters: - filename: a path-like object or a readable stream. - module: a module object or the name of an importable module; - the module name and kind (i.e. imported or non-imported) must - match the name and kind of the module stored at ``filename``. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. - - Raises: - :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``main`` and module saved - at ``filename`` are incompatible. - - Returns: - A module object, if the saved module is not :py:mod:`__main__` or - a module instance wasn't provided with the argument ``main``. 
- - Examples: - - - Save the state of some modules: - - >>> import dill - >>> squared = lambda x: x*x - >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl - >>> - >>> import pox # an imported module - >>> pox.plus_one = lambda x: x+1 - >>> dill.dump_module('pox_session.pkl', module=pox) - >>> - >>> from types import ModuleType - >>> foo = ModuleType('foo') # a module-type object - >>> foo.values = [1,2,3] - >>> import math - >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) - - - Restore the state of the interpreter: - - >>> import dill - >>> dill.load_module() # updates __main__ from /tmp/session.pkl - >>> squared(2) - 4 - - - Load the saved state of an importable module: - - >>> import dill - >>> pox = dill.load_module('pox_session.pkl') - >>> pox.plus_one(1) - 2 - >>> import sys - >>> pox in sys.modules.values() - True - - - Load the saved state of a non-importable module-type object: - - >>> import dill - >>> foo = dill.load_module('foo_session.pkl') - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - >>> import math - >>> foo.sin is math.sin # foo.sin was saved by reference - True - >>> import sys - >>> foo in sys.modules.values() - False - - - Update the state of a non-importable module-type object: - - >>> import dill - >>> from types import ModuleType - >>> foo = ModuleType('foo') - >>> foo.values = ['a','b'] - >>> foo.sin = lambda x: x*x - >>> dill.load_module('foo_session.pkl', module=foo) - >>> [foo.sin(x) for x in foo.values] - [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] - - *Changed in version 0.3.6:* Function ``load_session()`` was renamed to - ``load_module()``. Parameter ``main`` was renamed to ``module``. - - See also: - :py:func:`load_module_asdict` to load the contents of module saved - with :py:func:`dump_module` into a dictionary. 
- """ - if 'main' in kwds: - warnings.warn( - "The argument 'main' has been renamed 'module'.", - PendingDeprecationWarning - ) - if module is not None: - raise TypeError("both 'module' and 'main' arguments were used") - module = kwds.pop('main') - main = module - with _open(filename, 'rb', peekable=True) as file: - #FIXME: dill.settings are disabled - unpickler = Unpickler(file, **kwds) - unpickler._session = True - - # Resolve unpickler._main - pickle_main = _identify_module(file, main) - if main is None and pickle_main is not None: - main = pickle_main - if isinstance(main, str): - if main.startswith('__runtime__.'): - # Create runtime module to load the session into. - main = ModuleType(main.partition('.')[-1]) - else: - main = _import_module(main) - if main is not None: - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" % main) - unpickler._main = main - else: - main = unpickler._main - - # Check against the pickle's main. - is_main_imported = _is_imported_module(main) - if pickle_main is not None: - is_runtime_mod = pickle_main.startswith('__runtime__.') - if is_runtime_mod: - pickle_main = pickle_main.partition('.')[-1] - error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" - if main.__name__ != pickle_main: - raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) - if is_runtime_mod and is_main_imported: - raise ValueError( - error_msg.format(" imported", "", "", "-type object") - % (main.__name__, main.__name__) - ) - if not is_runtime_mod and not is_main_imported: - raise ValueError( - error_msg.format("", "-type object", " imported", "") - % (main.__name__, main.__name__) - ) - - try: - if not is_main_imported: - # This is for find_class() to be able to locate it. 
- runtime_main = '__runtime__.%s' % main.__name__ - sys.modules[runtime_main] = main - loaded = unpickler.load() - finally: - if not is_main_imported: - del sys.modules[runtime_main] - - assert loaded is main - _restore_modules(unpickler, main) - if main is _main_module or main is module: - return None - else: - return main - -# Backward compatibility. -def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): - warnings.warn("load_session() has been renamed load_module().", PendingDeprecationWarning) - load_module(filename, module=main, **kwds) -load_session.__doc__ = load_module.__doc__ - -def load_module_asdict( - filename = str(TEMPDIR/'session.pkl'), - update: bool = False, - **kwds -) -> dict: - """ - Load the contents of a saved module into a dictionary. - - ``load_module_asdict()`` is the near-equivalent of:: - - lambda filename: vars(dill.load_module(filename)).copy() - - however, does not alter the original module. Also, the path of - the loaded module is stored in the ``__session__`` attribute. - - Parameters: - filename: a path-like object or a readable stream - update: if `True`, initialize the dictionary with the current state - of the module prior to loading the state stored at filename. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()` - - Raises: - :py:exc:`UnpicklingError`: if unpickling fails - - Returns: - A copy of the restored module's dictionary. - - Note: - If ``update`` is True, the corresponding module may first be imported - into the current namespace before the saved state is loaded from - filename to the dictionary. Note that any module that is imported into - the current namespace as a side-effect of using ``update`` will not be - modified by loading the saved module in filename to a dictionary. 
- - Example: - >>> import dill - >>> alist = [1, 2, 3] - >>> anum = 42 - >>> dill.dump_module() - >>> anum = 0 - >>> new_var = 'spam' - >>> main = dill.load_module_asdict() - >>> main['__name__'], main['__session__'] - ('__main__', '/tmp/session.pkl') - >>> main is globals() # loaded objects don't reference globals - False - >>> main['alist'] == alist - True - >>> main['alist'] is alist # was saved by value - False - >>> main['anum'] == anum # changed after the session was saved - False - >>> new_var in main # would be True if the option 'update' was set - False - """ - if 'module' in kwds: - raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") - with _open(filename, 'rb', peekable=True) as file: - main_name = _identify_module(file) - old_main = sys.modules.get(main_name) - main = ModuleType(main_name) - if update: - if old_main is None: - old_main = _import_module(main_name) - main.__dict__.update(old_main.__dict__) - else: - main.__builtins__ = __builtin__ - try: - sys.modules[main_name] = main - load_module(file, **kwds) - finally: - if old_main is None: - del sys.modules[main_name] - else: - sys.modules[main_name] = old_main - main.__session__ = str(filename) - return main.__dict__ - -### End: Pickle the Interpreter - class MetaCatchingDict(dict): def get(self, key, default=None): try: @@ -2462,7 +1896,6 @@ def save_capsule(pickler, obj): _incedental_reverse_typemap['PyCapsuleType'] = PyCapsuleType _reverse_typemap['PyCapsuleType'] = PyCapsuleType _incedental_types.add(PyCapsuleType) - SESSION_IMPORTED_AS_TYPES += (PyCapsuleType,) else: _testcapsule = None diff --git a/dill/session.py b/dill/session.py new file mode 100644 index 00000000..dc26ae99 --- /dev/null +++ b/dill/session.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python +# +# Author: Mike McKerns (mmckerns @caltech and @uqfoundation) +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2008-2015 California Institute of Technology. 
+# Copyright (c) 2016-2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE +""" +Pickle and restore the intepreter session. +""" + +__all__ = [ + 'dump_module', 'is_module_pickle', 'load_module', 'load_module_asdict', + 'dump_session', 'load_session' # backward compatibility +] + +import logging +logger = logging.getLogger('dill.session') + +import builtins +import contextlib +import pathlib +import re +import sys +import tempfile +import warnings +from types import SimpleNamespace + +from dill import _dill, Pickler, Unpickler +from ._dill import ( + BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, + _import_module, _is_builtin_module, _is_imported_module, _main_module, + _reverse_typemap, +) + +# Type hints. +from typing import Optional, Union + +TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) + +class _PeekableReader: + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream): + self.stream = stream + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +def _open(file, mode, *, peekable=False): + """return a context manager with an opened file-like object""" + import io + attr = 'write' if 'w' in mode else 'read' + was_open = hasattr(file, attr) + if not was_open: + file = open(file, mode) + if attr == 'read' and peekable and not hasattr(file, 'peek'): + # Try our best to return the stream 
as an object with a peek() method. + if hasattr(file, 'tell') and hasattr(file, 'seek'): + file = _PeekableReader(file) + else: + try: + file = io.BufferedReader(file) + except Exception: + # Stream won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file) + if was_open: # should not close at exit + return contextlib.nullcontext(file) + elif type(file) == _PeekableReader: + return contextlib.closing(file) + else: + return file + +def _module_map(): + """get map of imported modules""" + from collections import defaultdict + modmap = SimpleNamespace( + by_name=defaultdict(list), + by_id=defaultdict(list), + top_level={}, + ) + for modname, module in sys.modules.items(): + if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): + continue + if '.' not in modname: + modmap.top_level[id(module)] = modname + for objname, modobj in module.__dict__.items(): + modmap.by_name[objname].append((modobj, modname)) + modmap.by_id[id(modobj)].append((modobj, objname, modname)) + return modmap + +IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) +PyCapsuleType = _reverse_typemap.get('PyCapsuleType') +if PyCapsuleType is not None: IMPORTED_AS_TYPES += (PyCapsuleType,) + +IMPORTED_AS_MODULES = [re.compile(x) for x in ( + 'ctypes', 'typing', 'subprocess', 'threading', + r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' 
+)] + +def _lookup_module(modmap, name, obj, main_module): + """lookup name or id of obj if module is imported""" + for modobj, modname in modmap.by_name[name]: + if modobj is obj and sys.modules[modname] is not main_module: + return modname, name + __module__ = getattr(obj, '__module__', None) + if isinstance(obj, IMPORTED_AS_TYPES) or (__module__ is not None + and any(regex.fullmatch(__module__) for regex in IMPORTED_AS_MODULES)): + for modobj, objname, modname in modmap.by_id[id(obj)]: + if sys.modules[modname] is not main_module: + return modname, objname + return None, None + +def _stash_modules(main_module): + modmap = _module_map() + newmod = ModuleType(main_module.__name__) + + imported = [] + imported_as = [] + imported_top_level = [] # keep separated for backward compatibility + original = {} + for name, obj in main_module.__dict__.items(): + if obj is main_module: + original[name] = newmod # self-reference + elif obj is main_module.__dict__: + original[name] = newmod.__dict__ + # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). + elif any(obj is singleton for singleton in (None, False, True)) \ + or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + original[name] = obj + else: + source_module, objname = _lookup_module(modmap, name, obj, main_module) + if source_module is not None: + if objname == name: + imported.append((source_module, name)) + else: + imported_as.append((source_module, objname, name)) + else: + try: + imported_top_level.append((modmap.top_level[id(obj)], name)) + except KeyError: + original[name] = obj + + if len(original) < len(main_module.__dict__): + newmod.__dict__.update(original) + newmod.__dill_imported = imported + newmod.__dill_imported_as = imported_as + newmod.__dill_imported_top_level = imported_top_level + if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): + # Trick _is_imported_module() to force saving as an imported module. 
+ newmod.__loader__ = True # will be discarded by save_module() + return newmod + else: + return main_module + +def _restore_modules(unpickler, main_module): + try: + for modname, name in main_module.__dict__.pop('__dill_imported'): + main_module.__dict__[name] = unpickler.find_class(modname, name) + for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): + main_module.__dict__[name] = unpickler.find_class(modname, objname) + for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): + main_module.__dict__[name] = __import__(modname) + except KeyError: + pass + +def dump_module( + filename = str(TEMPDIR/'session.pkl'), + module: Union[ModuleType, str] = None, + refimported: bool = False, + **kwds +) -> None: + """Pickle the current state of :py:mod:`__main__` or another module to a file. + + Save the contents of :py:mod:`__main__` (e.g. from an interactive + interpreter session), an imported module, or a module-type object (e.g. + built with :py:class:`~types.ModuleType`), to a file. The pickled + module can then be restored with the function :py:func:`load_module`. + + Parameters: + filename: a path-like object or a writable stream. + module: a module object or the name of an importable module. If `None` + (the default), :py:mod:`__main__` is saved. + refimported: if `True`, all objects identified as having been imported + into the module's namespace are saved by reference. *Note:* this is + similar but independent from ``dill.settings[`byref`]``, as + ``refimported`` refers to virtually all imported objects, while + ``byref`` only affects select objects. + **kwds: extra keyword arguments passed to :py:class:`Pickler()`. + + Raises: + :py:exc:`PicklingError`: if pickling fails. 
+ + Examples: + + - Save current interpreter session state: + + >>> import dill + >>> squared = lambda x: x*x + >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl + + - Save the state of an imported/importable module: + + >>> import dill + >>> import pox + >>> pox.plus_one = lambda x: x+1 + >>> dill.dump_module('pox_session.pkl', module=pox) + + - Save the state of a non-importable, module-type object: + + >>> import dill + >>> from types import ModuleType + >>> foo = ModuleType('foo') + >>> foo.values = [1,2,3] + >>> import math + >>> foo.sin = math.sin + >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + + - Restore the state of the saved modules: + + >>> import dill + >>> dill.load_module() + >>> squared(2) + 4 + >>> pox = dill.load_module('pox_session.pkl') + >>> pox.plus_one(1) + 2 + >>> foo = dill.load_module('foo_session.pkl') + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + + *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to + ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to + ``module`` and ``refimported``, respectively. 
+ + Note: + Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` + don't apply to this function.` + """ + for old_par, par in [('main', 'module'), ('byref', 'refimported')]: + if old_par in kwds: + message = "The argument %r has been renamed %r" % (old_par, par) + if old_par == 'byref': + message += " to distinguish it from dill.settings['byref']" + warnings.warn(message + ".", PendingDeprecationWarning) + if locals()[par]: # the defaults are None and False + raise TypeError("both %r and %r arguments were used" % (par, old_par)) + refimported = kwds.pop('byref', refimported) + module = kwds.pop('main', module) + + from .settings import settings + protocol = settings['protocol'] + main = module + if main is None: + main = _main_module + elif isinstance(main, str): + main = _import_module(main) + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + original_main = main + if refimported: + main = _stash_modules(main) + with _open(filename, 'wb') as file: + pickler = Pickler(file, protocol, **kwds) + if main is not original_main: + pickler._original_main = original_main + pickler._main = main #FIXME: dill.settings are disabled + pickler._byref = False # disable pickling by name reference + pickler._recurse = False # disable pickling recursion for globals + pickler._session = True # is best indicator of when pickling a session + pickler._first_pass = True + pickler.dump(main) + return + +# Backward compatibility. 
+def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, **kwds): + warnings.warn("dump_session() has been renamed dump_module()", PendingDeprecationWarning) + dump_module(filename, module=main, refimported=byref, **kwds) +dump_session.__doc__ = dump_module.__doc__ + +def _identify_module(file, main=None): + """identify the name of the module stored in the given file-type object""" + from pickletools import genops + UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} + found_import = False + try: + for opcode, arg, pos in genops(file.peek(256)): + if not found_import: + if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ + arg.endswith('_import_module'): + found_import = True + else: + if opcode.name in UNICODE: + return arg + else: + raise UnpicklingError("reached STOP without finding main module") + except (NotImplementedError, ValueError) as error: + # ValueError occours when the end of the chunk is reached (without a STOP). + if isinstance(error, NotImplementedError) and main is not None: + # file is not peekable, but we have main. + return None + raise UnpicklingError("unable to identify main module") from error + +def is_module_pickle(filename, importable: bool = True) -> bool: + """Check if a file is a module state pickle file. + + Parameters: + filename: a path-like object or a readable stream. + importable: expected kind of the file's saved module. Use `True` for + importable modules (the default) or `False` for module-type objects. + + Returns: + `True` if the pickle file at ``filename`` was generated with + :py:func:`dump_module` **AND** the module whose state is saved in it is + of the kind specified by the ``importable`` argument. `False` otherwise. 
+ """ + with _open(filename, 'rb', peekable=True) as file: + try: + pickle_main = _identify_module(file) + except UnpicklingError: + return False + else: + is_runtime_mod = pickle_main.startswith('__runtime__.') + return importable ^ is_runtime_mod + +def load_module( + filename = str(TEMPDIR/'session.pkl'), + module: Union[ModuleType, str] = None, + **kwds +) -> Optional[ModuleType]: + """Update the selected module (default is :py:mod:`__main__`) with + the state saved at ``filename``. + + Restore a module to the state saved with :py:func:`dump_module`. The + saved module can be :py:mod:`__main__` (e.g. an interpreter session), + an imported module, or a module-type object (e.g. created with + :py:class:`~types.ModuleType`). + + When restoring the state of a non-importable module-type object, the + current instance of this module may be passed as the argument ``main``. + Otherwise, a new instance is created with :py:class:`~types.ModuleType` + and returned. + + Parameters: + filename: a path-like object or a readable stream. + module: a module object or the name of an importable module; + the module name and kind (i.e. imported or non-imported) must + match the name and kind of the module stored at ``filename``. + **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. + + Raises: + :py:exc:`UnpicklingError`: if unpickling fails. + :py:exc:`ValueError`: if the argument ``main`` and module saved + at ``filename`` are incompatible. + + Returns: + A module object, if the saved module is not :py:mod:`__main__` or + a module instance wasn't provided with the argument ``main``. 
+ + Examples: + + - Save the state of some modules: + + >>> import dill + >>> squared = lambda x: x*x + >>> dill.dump_module() # save state of __main__ to /tmp/session.pkl + >>> + >>> import pox # an imported module + >>> pox.plus_one = lambda x: x+1 + >>> dill.dump_module('pox_session.pkl', module=pox) + >>> + >>> from types import ModuleType + >>> foo = ModuleType('foo') # a module-type object + >>> foo.values = [1,2,3] + >>> import math + >>> foo.sin = math.sin + >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + + - Restore the state of the interpreter: + + >>> import dill + >>> dill.load_module() # updates __main__ from /tmp/session.pkl + >>> squared(2) + 4 + + - Load the saved state of an importable module: + + >>> import dill + >>> pox = dill.load_module('pox_session.pkl') + >>> pox.plus_one(1) + 2 + >>> import sys + >>> pox in sys.modules.values() + True + + - Load the saved state of a non-importable module-type object: + + >>> import dill + >>> foo = dill.load_module('foo_session.pkl') + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + >>> import math + >>> foo.sin is math.sin # foo.sin was saved by reference + True + >>> import sys + >>> foo in sys.modules.values() + False + + - Update the state of a non-importable module-type object: + + >>> import dill + >>> from types import ModuleType + >>> foo = ModuleType('foo') + >>> foo.values = ['a','b'] + >>> foo.sin = lambda x: x*x + >>> dill.load_module('foo_session.pkl', module=foo) + >>> [foo.sin(x) for x in foo.values] + [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + + *Changed in version 0.3.6:* Function ``load_session()`` was renamed to + ``load_module()``. Parameter ``main`` was renamed to ``module``. + + See also: + :py:func:`load_module_asdict` to load the contents of module saved + with :py:func:`dump_module` into a dictionary. 
+ """ + if 'main' in kwds: + warnings.warn( + "The argument 'main' has been renamed 'module'.", + PendingDeprecationWarning + ) + if module is not None: + raise TypeError("both 'module' and 'main' arguments were used") + module = kwds.pop('main') + main = module + with _open(filename, 'rb', peekable=True) as file: + #FIXME: dill.settings are disabled + unpickler = Unpickler(file, **kwds) + unpickler._session = True + + # Resolve unpickler._main + pickle_main = _identify_module(file, main) + if main is None and pickle_main is not None: + main = pickle_main + if isinstance(main, str): + if main.startswith('__runtime__.'): + # Create runtime module to load the session into. + main = ModuleType(main.partition('.')[-1]) + else: + main = _import_module(main) + if main is not None: + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + unpickler._main = main + else: + main = unpickler._main + + # Check against the pickle's main. + is_main_imported = _is_imported_module(main) + if pickle_main is not None: + is_runtime_mod = pickle_main.startswith('__runtime__.') + if is_runtime_mod: + pickle_main = pickle_main.partition('.')[-1] + error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" + if main.__name__ != pickle_main: + raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) + if is_runtime_mod and is_main_imported: + raise ValueError( + error_msg.format(" imported", "", "", "-type object") + % (main.__name__, main.__name__) + ) + if not is_runtime_mod and not is_main_imported: + raise ValueError( + error_msg.format("", "-type object", " imported", "") + % (main.__name__, main.__name__) + ) + + try: + if not is_main_imported: + # This is for find_class() to be able to locate it. 
+ runtime_main = '__runtime__.%s' % main.__name__ + sys.modules[runtime_main] = main + loaded = unpickler.load() + finally: + if not is_main_imported: + del sys.modules[runtime_main] + + assert loaded is main + _restore_modules(unpickler, main) + if main is _main_module or main is module: + return None + else: + return main + +# Backward compatibility. +def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): + warnings.warn("load_session() has been renamed load_module().", PendingDeprecationWarning) + load_module(filename, module=main, **kwds) +load_session.__doc__ = load_module.__doc__ + +def load_module_asdict( + filename = str(TEMPDIR/'session.pkl'), + update: bool = False, + **kwds +) -> dict: + """ + Load the contents of a saved module into a dictionary. + + ``load_module_asdict()`` is the near-equivalent of:: + + lambda filename: vars(dill.load_module(filename)).copy() + + however, does not alter the original module. Also, the path of + the loaded module is stored in the ``__session__`` attribute. + + Parameters: + filename: a path-like object or a readable stream + update: if `True`, initialize the dictionary with the current state + of the module prior to loading the state stored at filename. + **kwds: extra keyword arguments passed to :py:class:`Unpickler()` + + Raises: + :py:exc:`UnpicklingError`: if unpickling fails + + Returns: + A copy of the restored module's dictionary. + + Note: + If ``update`` is True, the corresponding module may first be imported + into the current namespace before the saved state is loaded from + filename to the dictionary. Note that any module that is imported into + the current namespace as a side-effect of using ``update`` will not be + modified by loading the saved module in filename to a dictionary. 
+ + Example: + >>> import dill + >>> alist = [1, 2, 3] + >>> anum = 42 + >>> dill.dump_module() + >>> anum = 0 + >>> new_var = 'spam' + >>> main = dill.load_module_asdict() + >>> main['__name__'], main['__session__'] + ('__main__', '/tmp/session.pkl') + >>> main is globals() # loaded objects don't reference globals + False + >>> main['alist'] == alist + True + >>> main['alist'] is alist # was saved by value + False + >>> main['anum'] == anum # changed after the session was saved + False + >>> new_var in main # would be True if the option 'update' was set + False + """ + if 'module' in kwds: + raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") + with _open(filename, 'rb', peekable=True) as file: + main_name = _identify_module(file) + old_main = sys.modules.get(main_name) + main = ModuleType(main_name) + if update: + if old_main is None: + old_main = _import_module(main_name) + main.__dict__.update(old_main.__dict__) + else: + main.__builtins__ = builtins + try: + sys.modules[main_name] = main + load_module(file, **kwds) + finally: + if old_main is None: + del sys.modules[main_name] + else: + sys.modules[main_name] = old_main + main.__session__ = str(filename) + return main.__dict__ From 6b557550e68d944c3318e34c309aa9d1da974cf1 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 18 Jul 2022 23:49:35 -0300 Subject: [PATCH 07/30] session: deal with modules with unpickleable objects --- dill/_dill.py | 104 +++++++++++++++++++++++++++++++------ dill/logger.py | 22 ++++---- dill/session.py | 22 +++++++- dill/settings.py | 4 ++ dill/tests/test_session.py | 2 +- 5 files changed, 125 insertions(+), 29 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index aaa14101..668534db 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -40,6 +40,8 @@ #XXX: get types from .objtypes ? 
import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler +from pickle import BINPUT, DICT, EMPTY_DICT, LONG_BINPUT, MARK, PUT, SETITEM +from struct import pack from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -234,6 +236,9 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 +# Exceptions commonly raised by unpicklable objects. +UNPICKLEABLE_ERRORS = (PicklingError, TypeError, NotImplementedError) + ### Shorthands (modified from python2.5/lib/pickle.py) def copy(obj, *args, **kwds): """ @@ -349,16 +354,18 @@ class Pickler(StockPickler): def __init__(self, file, *args, **kwds): settings = Pickler.settings _byref = kwds.pop('byref', None) - #_strictio = kwds.pop('strictio', None) _fmode = kwds.pop('fmode', None) _recurse = kwds.pop('recurse', None) + #_refonfail = kwds.pop('refonfail', None) + #_strictio = kwds.pop('strictio', None) StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module self._diff_cache = {} self._byref = settings['byref'] if _byref is None else _byref - self._strictio = False #_strictio self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse + self._refonfail = False #settings['dump_module']['refonfail'] if _refonfail is None else _refonfail + self._strictio = False #_strictio self._postproc = OrderedDict() self._file = file # for the logger @@ -395,7 +402,7 @@ def save_numpy_dtype(pickler, obj): if NumpyArrayType and ndarraysubclassinstance(obj): @register(type(obj)) def save_numpy_array(pickler, obj): - logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype) + logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) npdict = getattr(obj, '__dict__', None) f, args, state = obj.__reduce__() pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) @@ -407,9 
+414,68 @@ def save_numpy_array(pickler, obj): raise PicklingError(msg) logger.trace_setup(self) StockPickler.dump(self, obj) - dump.__doc__ = StockPickler.dump.__doc__ + def save(self, obj, save_persistent_id=True, *, name=None): + """If self._refonfail is True, try to save object by reference if pickling fails.""" + if not self._refonfail: + super().save(obj, save_persistent_id) + return + if self.framer.current_frame: + # protocol >= 4 + self.framer.commit_frame() + stream = self.framer.current_frame + else: + stream = self._file + position = stream.tell() + memo_size = len(self.memo) + try: + super().save(obj, save_persistent_id) + except UNPICKLEABLE_ERRORS + (AttributeError,) as error_stack: + # AttributeError may happen in save_global() call for child object. + if (type(error_stack) == AttributeError + and "no attribute '__name__'" not in error_stack.args[0]): + raise + # roll back the stream + stream.seek(position) + stream.truncate() + # roll back memo + for _ in range(len(self.memo) - memo_size): + self.memo.popitem() # LIFO order is guaranteed for since 3.7 + try: + self.save_global(obj, name) + except (AttributeError, PicklingError) as error: + if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: + # roll back trace state + self._trace_stack.pop() + self._size_stack.pop() + raise error from error_stack + logger.trace(self, "# X: fallback to save_global: <%s object at %#012x>", + type(obj).__name__, id(obj), obj=obj) + + def _save_module_dict(self, obj): + """ + Use object name in the module namespace as a last resource to try to + save it by reference when pickling fails. + + Modified from Pickler.save_dict() and Pickler._batch_setitems(). 
+ """ + if not self._refonfail: + super().save_dict(obj) + return + if self.bin: + self.write(EMPTY_DICT) + else: # proto 0 -- can't use EMPTY_DICT + self.write(MARK + DICT) + self.memoize(obj) + for k, v in obj.items(): + self.save(k) + if hasattr(v, '__name__') or hasattr(v, '__qualname__'): + self.save(v) + else: + self.save(v, name=k) + self.write(SETITEM) + class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" from .settings import settings @@ -1173,26 +1239,30 @@ def _repr_dict(obj): @register(dict) def save_module_dict(pickler, obj): - if is_dill(pickler, child=False) and obj == pickler._main.__dict__ and \ + pickler_is_dill = is_dill(pickler, child=False) + if pickler_is_dill and obj == pickler._main.__dict__ and \ not (pickler._session and pickler._first_pass): - logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) logger.trace(pickler, "# D1") - elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__): - logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj + elif (not pickler_is_dill) and (obj == _main_module.__dict__): + logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? 
logger.trace(pickler, "# D3") elif '__name__' in obj and obj != _main_module.__dict__ \ and type(obj['__name__']) is str \ and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): - logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") + elif pickler_is_dill and pickler._session and pickler._first_pass: + # we only care about session the first pass thru + pickler._first_pass = False + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + pickler._save_module_dict(obj) + logger.trace(pickler, "# D5") else: - logger.trace(pickler, "D2: %s", _repr_dict(obj)) # obj - if is_dill(pickler, child=False) and pickler._session: - # we only care about session the first pass thru - pickler._first_pass = False + logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") return @@ -1491,7 +1561,7 @@ def save_cell(pickler, obj): if MAPPING_PROXY_TRICK: @register(DictProxyType) def save_dictproxy(pickler, obj): - logger.trace(pickler, "Mp: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "Mp: %s", _repr_dict(obj), obj=obj) mapping = obj | _dictproxy_helper_instance pickler.save_reduce(DictProxyType, (mapping,), obj=obj) logger.trace(pickler, "# Mp") @@ -1499,7 +1569,7 @@ def save_dictproxy(pickler, obj): else: @register(DictProxyType) def save_dictproxy(pickler, obj): - logger.trace(pickler, "Mp: %s", _repr_dict(obj)) # obj + logger.trace(pickler, "Mp: %s", _repr_dict(obj), obj=obj) pickler.save_reduce(DictProxyType, (obj.copy(),), obj=obj) logger.trace(pickler, "# Mp") return @@ -1575,7 +1645,7 @@ def save_weakproxy(pickler, obj): logger.trace(pickler, "%s: %s", _t, obj) except ReferenceError: _t = "R3" - logger.trace(pickler, "%s: %s", _t, sys.exc_info()[1]) + logger.trace(pickler, "%s: %s", _t, sys.exc_info()[1], obj=obj) #callable 
= bool(getattr(refobj, '__call__', None)) if type(obj) is CallableProxyType: callable = True else: callable = False @@ -1914,7 +1984,7 @@ def pickles(obj,exact=False,safe=False,**kwds): """ if safe: exceptions = (Exception,) # RuntimeError, ValueError else: - exceptions = (TypeError, AssertionError, NotImplementedError, PicklingError, UnpicklingError) + exceptions = UNPICKLEABLE_ERRORS + (AssertionError, UnpicklingError) try: pik = copy(obj, **kwds) #FIXME: should check types match first, then check content if "exact" diff --git a/dill/logger.py b/dill/logger.py index fedff6bf..7b6afcdd 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -129,18 +129,22 @@ def trace_setup(self, pickler): if not dill._dill.is_dill(pickler, child=False): return if self.isEnabledFor(logging.INFO): - pickler._trace_depth = 1 + pickler._trace_stack = [] pickler._size_stack = [] else: - pickler._trace_depth = None - def trace(self, pickler, msg, *args, **kwargs): - if not hasattr(pickler, '_trace_depth'): + pickler._trace_stack = None + def trace(self, pickler, msg, *args, obj=None, **kwargs): + if not hasattr(pickler, '_trace_stack'): logger.info(msg, *args, **kwargs) return - if pickler._trace_depth is None: + if pickler._trace_stack is None: return extra = kwargs.get('extra', {}) pushed_obj = msg.startswith('#') + if not pushed_obj: + if obj is None: + obj = args[-1] + pickler._trace_stack.append(id(obj)) size = None try: # Streams are not required to be tellable. 
@@ -159,13 +163,11 @@ def trace(self, pickler, msg, *args, **kwargs): else: size -= pickler._size_stack.pop() extra['size'] = size - if pushed_obj: - pickler._trace_depth -= 1 - extra['depth'] = pickler._trace_depth + extra['depth'] = len(pickler._trace_stack) kwargs['extra'] = extra self.info(msg, *args, **kwargs) - if not pushed_obj: - pickler._trace_depth += 1 + if pushed_obj: + pickler._trace_stack.pop() class TraceFormatter(logging.Formatter): """ diff --git a/dill/session.py b/dill/session.py index dc26ae99..9e545f1c 100644 --- a/dill/session.py +++ b/dill/session.py @@ -184,9 +184,10 @@ def dump_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, refimported: bool = False, + refonfail: bool = False, **kwds ) -> None: - """Pickle the current state of :py:mod:`__main__` or another module to a file. + R"""Pickle the current state of :py:mod:`__main__` or another module to a file. Save the contents of :py:mod:`__main__` (e.g. from an interactive interpreter session), an imported module, or a module-type object (e.g. @@ -202,6 +203,10 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. + refonfail: if `True`, objects that fail to be saved by value will try to + be saved by reference. If it also fails, saving their parent + objects by reference will be attempted recursively. In the worst + case scenario, the module itself may be saved by reference. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. Raises: @@ -232,6 +237,15 @@ def dump_module( >>> foo.sin = math.sin >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + - Save the state of a module with unpickleable objects: + + >>> import dill + >>> import os + >>> os.altsep = '\\' + >>> dill.dump_module('os_session.pkl', module=os) + PicklingError: ... 
+ >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) + - Restore the state of the saved modules: >>> import dill @@ -244,6 +258,9 @@ def dump_module( >>> foo = dill.load_module('foo_session.pkl') >>> [foo.sin(x) for x in foo.values] [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] + >>> os = dill.load_module('os_session.pkl') + >>> print(os.altsep.join('path')) + p\a\t\h *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to @@ -266,6 +283,8 @@ def dump_module( from .settings import settings protocol = settings['protocol'] + if refimported is None: refimported = settings['dump_module']['refimported'] + if refonfail is None: refonfail = settings['dump_module']['refonfail'] main = module if main is None: main = _main_module @@ -283,6 +302,7 @@ def dump_module( pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals + pickler._refonfail = refonfail pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True pickler.dump(main) diff --git a/dill/settings.py b/dill/settings.py index b105d2e8..22c55458 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -19,6 +19,10 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, + 'dump_module' : { + 'refimported': False, + 'refonfail' : False, + }, } del DEFAULT_PROTOCOL diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 9124802c..51128916 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -197,7 +197,7 @@ def test_runtime_module(): runtime = ModuleType(modname) runtime.x = 42 - mod = dill._dill._stash_modules(runtime) + mod = dill.session._stash_modules(runtime) if mod is not runtime: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, 
mod.__dill_imported_as, mod.__dill_imported_top_level, From aac47b5fded6e69535a006dce79381b0764e9d17 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 19 Jul 2022 11:01:51 -0300 Subject: [PATCH 08/30] disable framing when using the 'refonfail' option --- dill/_dill.py | 27 ++++++++++++--------------- dill/logger.py | 2 +- dill/session.py | 18 ++++++++++++++---- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 668534db..146ab9b6 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -367,7 +367,7 @@ def __init__(self, file, *args, **kwds): self._refonfail = False #settings['dump_module']['refonfail'] if _refonfail is None else _refonfail self._strictio = False #_strictio self._postproc = OrderedDict() - self._file = file # for the logger + self._file_tell = getattr(file, 'tell', None) # for logger and refonfail def dump(self, obj): #NOTE: if settings change, need to update attributes # register if the object is a numpy ufunc @@ -421,32 +421,29 @@ def save(self, obj, save_persistent_id=True, *, name=None): if not self._refonfail: super().save(obj, save_persistent_id) return - if self.framer.current_frame: - # protocol >= 4 - self.framer.commit_frame() - stream = self.framer.current_frame - else: - stream = self._file - position = stream.tell() + # Disable framing (right after the framer.init_framing() call at dump()). + self.framer.current_frame = None + # Store initial state. + position = self._file_tell() memo_size = len(self.memo) try: super().save(obj, save_persistent_id) except UNPICKLEABLE_ERRORS + (AttributeError,) as error_stack: - # AttributeError may happen in save_global() call for child object. + # AttributeError may happen in the save_global() call from a child object. if (type(error_stack) == AttributeError and "no attribute '__name__'" not in error_stack.args[0]): raise - # roll back the stream - stream.seek(position) - stream.truncate() - # roll back memo + # Roll back the stream. 
+ self._file_seek(position) + self._file_truncate() + # Roll back memo. for _ in range(len(self.memo) - memo_size): - self.memo.popitem() # LIFO order is guaranteed for since 3.7 + self.memo.popitem() # LIFO order is guaranteed since 3.7 try: self.save_global(obj, name) except (AttributeError, PicklingError) as error: if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: - # roll back trace state + # Roll back trace state. self._trace_stack.pop() self._size_stack.pop() raise error from error_stack diff --git a/dill/logger.py b/dill/logger.py index 7b6afcdd..385c862d 100644 --- a/dill/logger.py +++ b/dill/logger.py @@ -148,7 +148,7 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): size = None try: # Streams are not required to be tellable. - size = pickler._file.tell() + size = pickler._file_tell() frame = pickler.framer.current_frame try: size += frame.tell() diff --git a/dill/session.py b/dill/session.py index 9e545f1c..3da31318 100644 --- a/dill/session.py +++ b/dill/session.py @@ -203,10 +203,12 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. - refonfail: if `True`, objects that fail to be saved by value will try to - be saved by reference. If it also fails, saving their parent + refonfail: if `True`, objects that fail to pickle by value will try to + be saved by reference. If this also fails, saving their parent objects by reference will be attempted recursively. In the worst - case scenario, the module itself may be saved by reference. + case scenario, the module itself may be saved by reference. Note: + The file-like object must be seekable and truncable with this + option set. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. 
Raises: @@ -302,9 +304,17 @@ def dump_module( pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals - pickler._refonfail = refonfail pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True + if refonfail: + pickler._refonfail = True # False by default + pickler._file_seek = getattr(file, 'seek', None) + pickler._file_truncate = getattr(file, 'truncate', None) + if hasattr(file, 'seekable') and not file.seekable(): + pickler._file_seek = None + if pickler._file_seek is None or pickler._file_truncate is None: + raise TypeError("file must have 'tell', 'seek' and 'truncate'" + " attributes if the 'refonfail' option is set.") pickler.dump(main) return From 5e4d91233afb996783a456603bff0138e5d1eca0 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 19:41:05 -0300 Subject: [PATCH 09/30] rename is_module_pickle() to is_pickled_module(); fix _dill's __all__ --- dill/__init__.py | 2 +- dill/_dill.py | 11 +++++------ dill/session.py | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 8f8429bd..b540ebd3 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -30,7 +30,7 @@ PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) -from .session import dump_module, is_module_pickle, load_module, load_module_asdict +from .session import dump_module, is_pickled_module, load_module, load_module_asdict from .session import dump_session, load_session # backward compatibility from . import detect, session, source, temp diff --git a/dill/_dill.py b/dill/_dill.py index 146ab9b6..d734075b 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -16,12 +16,11 @@ Test against CH16+ Std. Lib. ... TBD. 
""" __all__ = [ - 'dump', 'dumps', 'load', 'loads', 'dump_module', 'load_module', - 'load_module_asdict', 'dump_session', 'load_session', 'is_module_pickle', - 'Pickler', 'Unpickler', 'register', 'copy', 'pickle', 'pickles', 'check', - 'HIGHEST_PROTOCOL', 'DEFAULT_PROTOCOL', 'PicklingError', 'UnpicklingError', - 'HANDLE_FMODE', 'CONTENTS_FMODE', 'FILE_FMODE', 'PickleError', - 'PickleWarning', 'PicklingWarning', 'UnpicklingWarning', + 'Pickler','Unpickler', + 'check','copy','dump','dumps','load','loads','pickle','pickles','register', + 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','CONTENTS_FMODE','FILE_FMODE','HANDLE_FMODE', + 'PickleError','PickleWarning','PicklingError','PicklingWarning','UnpicklingError', + 'UnpicklingWarning', ] __module__ = 'dill' diff --git a/dill/session.py b/dill/session.py index 3da31318..021a0fd9 100644 --- a/dill/session.py +++ b/dill/session.py @@ -11,7 +11,7 @@ """ __all__ = [ - 'dump_module', 'is_module_pickle', 'load_module', 'load_module_asdict', + 'dump_module', 'is_pickled_module', 'load_module', 'load_module_asdict', 'dump_session', 'load_session' # backward compatibility ] @@ -347,7 +347,7 @@ def _identify_module(file, main=None): return None raise UnpicklingError("unable to identify main module") from error -def is_module_pickle(filename, importable: bool = True) -> bool: +def is_pickled_module(filename, importable: bool = True) -> bool: """Check if a file is a module state pickle file. 
Parameters: From 699f30ae69c40386df98904367ab49d6dbf58ed2 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 21 Jul 2022 21:58:29 -0300 Subject: [PATCH 10/30] sync with branch 'session-excludes' --- dill/session.py | 22 +++++++++++----------- dill/settings.py | 3 ++- dill/tests/test_session.py | 4 ++-- docs/source/dill.rst | 14 +++++++++++++- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/dill/session.py b/dill/session.py index 021a0fd9..fee7899b 100644 --- a/dill/session.py +++ b/dill/session.py @@ -183,8 +183,9 @@ def _restore_modules(unpickler, main_module): def dump_module( filename = str(TEMPDIR/'session.pkl'), module: Union[ModuleType, str] = None, - refimported: bool = False, - refonfail: bool = False, + *, + refimported: bool = None, + refonfail: bool = None, **kwds ) -> None: R"""Pickle the current state of :py:mod:`__main__` or another module to a file. @@ -285,8 +286,10 @@ def dump_module( from .settings import settings protocol = settings['protocol'] - if refimported is None: refimported = settings['dump_module']['refimported'] - if refonfail is None: refonfail = settings['dump_module']['refonfail'] + if refimported is None: + refimported = settings['dump_module']['refimported'] + if refonfail is None: + refonfail = settings['dump_module']['refonfail'] main = module if main is None: main = _main_module @@ -486,7 +489,7 @@ def load_module( # Resolve unpickler._main pickle_main = _identify_module(file, main) - if main is None and pickle_main is not None: + if main is None: main = pickle_main if isinstance(main, str): if main.startswith('__runtime__.'): @@ -494,12 +497,9 @@ def load_module( main = ModuleType(main.partition('.')[-1]) else: main = _import_module(main) - if main is not None: - if not isinstance(main, ModuleType): - raise TypeError("%r is not a module" % main) - unpickler._main = main - else: - main = unpickler._main + if not isinstance(main, ModuleType): + raise TypeError("%r is not a module" % main) + unpickler._main = 
main # Check against the pickle's main. is_main_imported = _is_imported_module(main) diff --git a/dill/settings.py b/dill/settings.py index 22c55458..df1d30a4 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,6 +9,8 @@ global settings for Pickler """ +__all__ = ['settings'] + from pickle import DEFAULT_PROTOCOL settings = { @@ -26,4 +28,3 @@ } del DEFAULT_PROTOCOL - diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 51128916..6a6ce22e 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -11,6 +11,7 @@ import __main__ from contextlib import suppress from io import BytesIO +from types import ModuleType import dill @@ -192,7 +193,6 @@ def test_session_other(): assert module.selfref is module def test_runtime_module(): - from types import ModuleType modname = '__runtime__' runtime = ModuleType(modname) runtime.x = 42 @@ -230,7 +230,7 @@ def test_refimported_imported_as(): import concurrent.futures import types import typing - mod = sys.modules['__test__'] = types.ModuleType('__test__') + mod = sys.modules['__test__'] = ModuleType('__test__') dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) mod.Dict = collections.UserDict # select by type mod.AsyncCM = typing.AsyncContextManager # select by __module__ diff --git a/docs/source/dill.rst b/docs/source/dill.rst index 31d41c91..af64599c 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -11,7 +11,7 @@ dill module :special-members: :show-inheritance: :imported-members: - :exclude-members: dump_session, load_session +.. :exclude-members: detect module ------------- @@ -49,6 +49,18 @@ pointers module :imported-members: .. :exclude-members: +session module +--------------- + +.. 
automodule:: dill.session + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: + :exclude-members: dump_session, load_session + settings module --------------- From 04968f35ec7f6b984c16f71fde2caa972570e227 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 22 Jul 2022 00:10:06 -0300 Subject: [PATCH 11/30] refonfail: save modules by reference using save_reduce() --- dill/_dill.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index d734075b..415286c9 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -347,6 +347,7 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) + _refonfail = False _session = False from .settings import settings @@ -355,7 +356,6 @@ def __init__(self, file, *args, **kwds): _byref = kwds.pop('byref', None) _fmode = kwds.pop('fmode', None) _recurse = kwds.pop('recurse', None) - #_refonfail = kwds.pop('refonfail', None) #_strictio = kwds.pop('strictio', None) StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module @@ -363,7 +363,6 @@ def __init__(self, file, *args, **kwds): self._byref = settings['byref'] if _byref is None else _byref self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse - self._refonfail = False #settings['dump_module']['refonfail'] if _refonfail is None else _refonfail self._strictio = False #_strictio self._postproc = OrderedDict() self._file_tell = getattr(file, 'tell', None) # for logger and refonfail @@ -439,7 +438,11 @@ def save(self, obj, save_persistent_id=True, *, name=None): for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 try: - self.save_global(obj, name) + if isinstance(obj, ModuleType) and \ + 
(_is_builtin_module(obj) or obj is sys.modules['dill']): + self.save_reduce(_import_module, (obj.__name__,), obj=obj) + else: + self.save_global(obj, name) except (AttributeError, PicklingError) as error: if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: # Roll back trace state. From a596126cb020ce4f5bcff8c170ff494e06cddbed Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Fri, 22 Jul 2022 00:28:08 -0300 Subject: [PATCH 12/30] unpickleable ctypes objects raise ValueError... --- dill/_dill.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 415286c9..abe0c5ed 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -235,8 +235,8 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 -# Exceptions commonly raised by unpicklable objects. -UNPICKLEABLE_ERRORS = (PicklingError, TypeError, NotImplementedError) +# Exceptions commonly raised by unpicklable objects in the Standard Library. 
+UNPICKLEABLE_ERRORS = (PicklingError, TypeError, ValueError, NotImplementedError) ### Shorthands (modified from python2.5/lib/pickle.py) def copy(obj, *args, **kwds): From f46d399b6dc65277f95d6d668f7555c628ec8c71 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 24 Jul 2022 00:50:27 -0300 Subject: [PATCH 13/30] move common autodoc options to conf.py and exclude some special members Exclude special class members that are implementation details and give practically no useful information: - __dict__ (can generate really big strings) - __module__ - __slots__ - __weakref__ --- dill/__init__.py | 15 +++++---- dill/_dill.py | 6 ++-- dill/session.py | 4 +-- docs/source/conf.py | 8 +++++ docs/source/dill.rst | 73 ++++++-------------------------------------- 5 files changed, 29 insertions(+), 77 deletions(-) diff --git a/dill/__init__.py b/dill/__init__.py index 87e4eb42..3571f54e 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -11,10 +11,10 @@ from .__info__ import __version__, __author__, __doc__, __license__ except: # pragma: no cover import os - import sys + import sys parent = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) sys.path.append(parent) - # get distribution meta info + # get distribution meta info from version import (__version__, __author__, get_license_text, get_readme_as_rst) __license__ = get_license_text(os.path.join(parent, 'LICENSE')) @@ -24,14 +24,14 @@ from ._dill import ( - Pickler, Unpickler, - check, copy, dump, dumps, load, loads, pickle, pickles, register, - DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, + dump, dumps, load, loads, copy, + Pickler, Unpickler, register, pickle, pickles, check, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) from .session import ( - dump_module, is_pickled_module, load_module, load_module_asdict, + dump_module, load_module, 
load_module_asdict, is_pickled_module, dump_session, load_session # backward compatibility ) from . import detect, logger, session, source, temp @@ -42,8 +42,6 @@ # make sure "trace" is turned off logger.trace(False) -from importlib import reload - objects = {} # local import of dill._objects #from . import _objects @@ -68,6 +66,7 @@ def load_types(pickleable=True, unpickleable=True): Returns: None """ + from importlib import reload # local import of dill.objects from . import _objects if pickleable: diff --git a/dill/_dill.py b/dill/_dill.py index ca445bed..ab560a54 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -16,9 +16,9 @@ Test against CH16+ Std. Lib. ... TBD. """ __all__ = [ - 'Pickler','Unpickler', - 'check','copy','dump','dumps','load','loads','pickle','pickles','register', - 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','CONTENTS_FMODE','FILE_FMODE','HANDLE_FMODE', + 'dump','dumps','load','loads','copy', + 'Pickler','Unpickler','register','pickle','pickles','check', + 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','HANDLE_FMODE','CONTENTS_FMODE','FILE_FMODE', 'PickleError','PickleWarning','PicklingError','PicklingWarning','UnpicklingError', 'UnpicklingWarning', ] diff --git a/dill/session.py b/dill/session.py index 798b627a..626798e4 100644 --- a/dill/session.py +++ b/dill/session.py @@ -7,11 +7,11 @@ # License: 3-clause BSD. The full license text is available at: # - https://github.com/uqfoundation/dill/blob/master/LICENSE """ -Pickle and restore the intepreter session. +Pickle and restore the intepreter session or a module's state. 
""" __all__ = [ - 'dump_module', 'is_pickled_module', 'load_module', 'load_module_asdict', + 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', 'dump_session', 'load_session' # backward compatibility ] diff --git a/docs/source/conf.py b/docs/source/conf.py index ead9ed06..ff34cd55 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,6 +66,14 @@ # extension config github_project_url = "https://github.com/uqfoundation/dill" autoclass_content = 'both' +autodoc_default_options = { + 'members': True, + 'undoc-members': True, + 'private-members': True, + 'special-members': True, + 'show-inheritance': True, + 'exclude-members': '__dict__, __module__, __slots__, __weakref__', +} autodoc_typehints = 'description' napoleon_include_init_with_doc = True napoleon_include_private_with_doc = False diff --git a/docs/source/dill.rst b/docs/source/dill.rst index 2770af2a..db81dffe 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -5,107 +5,52 @@ dill module ----------- .. automodule:: dill._dill - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + detect module ------------- .. automodule:: dill.detect - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children +.. :exclude-members: +ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children logger module ------------- .. automodule:: dill.logger - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + objtypes module --------------- .. automodule:: dill.objtypes - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. 
:exclude-members: +.. :exclude-members: + pointers module --------------- .. automodule:: dill.pointers - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + session module --------------- .. automodule:: dill.session - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: - :exclude-members: dump_session, load_session + :exclude-members: +dump_session, load_session settings module --------------- .. automodule:: dill.settings - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + source module ------------- .. automodule:: dill.source - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: +.. :exclude-members: + temp module ----------- .. automodule:: dill.temp - :members: - :undoc-members: - :private-members: - :special-members: - :show-inheritance: - :imported-members: -.. :exclude-members: - +.. :exclude-members: + From bef579561ec9aeac8788ff26b6291dae2cd20d78 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sun, 24 Jul 2022 01:01:47 -0300 Subject: [PATCH 14/30] don't document trace() twice --- docs/source/dill.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/dill.rst b/docs/source/dill.rst index db81dffe..e18607db 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -17,7 +17,7 @@ logger module ------------- .. automodule:: dill.logger -.. 
:exclude-members: + + :exclude-members: +trace objtypes module --------------- From 953b5e049af28fe55c33586decae1de4602e4323 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 26 Jul 2022 20:46:20 -0300 Subject: [PATCH 15/30] fix is_pickled_module() --- dill/_dill.py | 2 +- dill/session.py | 38 ++++++++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index ab560a54..9e2568e3 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,7 +39,7 @@ #XXX: get types from .objtypes ? import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import BINPUT, DICT, EMPTY_DICT, LONG_BINPUT, MARK, PUT, SETITEM +from pickle import DICT, EMPTY_DICT, MARK, SETITEM from struct import pack from _thread import LockType from _thread import RLock as RLockType diff --git a/dill/session.py b/dill/session.py index 626798e4..92861bbe 100644 --- a/dill/session.py +++ b/dill/session.py @@ -20,7 +20,7 @@ import sys import warnings -from dill import _dill, Pickler, Unpickler +from dill import _dill, Pickler, Unpickler, UnpicklingError from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, _import_module, _is_builtin_module, _is_imported_module, _main_module, @@ -326,20 +326,30 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** def _identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" - from pickletools import genops - UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} - found_import = False + import pickletools + NEUTRAL = {'PROTO', 'FRAME', 'PUT', 'BINPUT', 'MEMOIZE', 'MARK', 'STACK_GLOBAL'} + opcodes = ((opcode.name, arg) for opcode, arg, pos in pickletools.genops(file.peek(256)) + if opcode.name not in NEUTRAL) try: - for opcode, arg, pos in genops(file.peek(256)): - if not found_import: - if opcode.name in ('GLOBAL', 
'SHORT_BINUNICODE') and \ - arg.endswith('_import_module'): - found_import = True - else: - if opcode.name in UNICODE: - return arg - else: - raise UnpicklingError("reached STOP without finding main module") + opcode, arg = next(opcodes) + if (opcode, arg) == ('SHORT_BINUNICODE', 'dill._dill'): + # The file uses STACK_GLOBAL instead of GLOBAL. + opcode, arg = next(opcodes) + if not (opcode in ('SHORT_BINUNICODE', 'GLOBAL') and arg.split()[-1] == '_import_module'): + raise ValueError + opcode, arg = next(opcodes) + if not opcode in ('SHORT_BINUNICODE', 'BINUNICODE', 'UNICODE'): + raise ValueError + module_name = arg + if not ( + next(opcodes)[0] in ('TUPLE1', 'TUPLE') and + next(opcodes)[0] == 'REDUCE' and + next(opcodes)[0] in ('EMPTY_DICT', 'DICT') + ): + raise ValueError + return module_name + except StopIteration: + raise UnpicklingError("reached STOP without finding main module") from None except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: From e5da1c8373e083fc64bcea7aaa83c040273e4a97 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 30 Jul 2022 09:39:13 -0300 Subject: [PATCH 16/30] deteail the effects of 'module' argument in load_module() and rename 'main' to 'module' on doctstrings --- dill/session.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/dill/session.py b/dill/session.py index 92861bbe..52fd0359 100644 --- a/dill/session.py +++ b/dill/session.py @@ -349,13 +349,13 @@ def _identify_module(file, main=None): raise ValueError return module_name except StopIteration: - raise UnpicklingError("reached STOP without finding main module") from None + raise UnpicklingError("reached STOP without finding module") from None except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). 
if isinstance(error, NotImplementedError) and main is not None: # file is not peekable, but we have main. return None - raise UnpicklingError("unable to identify main module") from error + raise UnpicklingError("unable to identify module") from error def is_pickled_module(filename, importable: bool = True) -> bool: """Check if a file is a module state pickle file. @@ -393,10 +393,39 @@ def load_module( :py:class:`~types.ModuleType`). When restoring the state of a non-importable module-type object, the - current instance of this module may be passed as the argument ``main``. + current instance of this module may be passed as the argument ``module``. Otherwise, a new instance is created with :py:class:`~types.ModuleType` and returned. + Passing a `module` argument forces dill to verify that the module being + loaded is compatible with the argument value. Additionally, if the argument + is a module (instead of a module name), it supresses the return value. + + This call loads ``math`` and returns it at the end: + + >>> import dill + >>> # load module -> restore state -> return module + >>> dill.load_module('math_session.pkl') + + + Passing the module name does the same as above, but also verifies that the + module loaded, restored and returned is indeed ``math``: + + >>> import dill + >>> # load module -> check name/kind -> restore state -> return module + >>> dill.load_module('math_session.pkl', module='math') + + >>> dill.load_module('math_session.pkl', module='cmath') + ValueError: can't update module 'cmath' with the saved state of module 'math' + + Passing the module itself instead of its name have the additional effect of + supressing the return value (and the module is already loaded at this point): + + >>> import dill + >>> import math + >>> # check name/kind -> restore state -> return None + >>> dill.load_module('math_session.pkl', module=math) + Parameters: filename: a path-like object or a readable stream. 
module: a module object or the name of an importable module; @@ -406,12 +435,12 @@ def load_module( Raises: :py:exc:`UnpicklingError`: if unpickling fails. - :py:exc:`ValueError`: if the argument ``main`` and module saved + :py:exc:`ValueError`: if the argument ``module`` and module saved at ``filename`` are incompatible. Returns: A module object, if the saved module is not :py:mod:`__main__` or - a module instance wasn't provided with the argument ``main``. + a module instance wasn't provided with the argument ``module``. Examples: From 2e4887c3d6e90fcd3eb2fa48619c93580dcf3337 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 30 Jul 2022 10:12:00 -0300 Subject: [PATCH 17/30] Better describe the side effects and the usage of the returned value from load_module() --- dill/session.py | 85 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/dill/session.py b/dill/session.py index 52fd0359..7d16b0ae 100644 --- a/dill/session.py +++ b/dill/session.py @@ -268,7 +268,7 @@ def dump_module( Note: Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` - don't apply to this function.` + don't apply to this function. """ for old_par, par in [('main', 'module'), ('byref', 'refimported')]: if old_par in kwds: @@ -397,35 +397,6 @@ def load_module( Otherwise, a new instance is created with :py:class:`~types.ModuleType` and returned. - Passing a `module` argument forces dill to verify that the module being - loaded is compatible with the argument value. Additionally, if the argument - is a module (instead of a module name), it supresses the return value. 
- - This call loads ``math`` and returns it at the end: - - >>> import dill - >>> # load module -> restore state -> return module - >>> dill.load_module('math_session.pkl') - - - Passing the module name does the same as above, but also verifies that the - module loaded, restored and returned is indeed ``math``: - - >>> import dill - >>> # load module -> check name/kind -> restore state -> return module - >>> dill.load_module('math_session.pkl', module='math') - - >>> dill.load_module('math_session.pkl', module='cmath') - ValueError: can't update module 'cmath' with the saved state of module 'math' - - Passing the module itself instead of its name have the additional effect of - supressing the return value (and the module is already loaded at this point): - - >>> import dill - >>> import math - >>> # check name/kind -> restore state -> return None - >>> dill.load_module('math_session.pkl', module=math) - Parameters: filename: a path-like object or a readable stream. module: a module object or the name of an importable module; @@ -442,6 +413,60 @@ def load_module( A module object, if the saved module is not :py:mod:`__main__` or a module instance wasn't provided with the argument ``module``. + Passing an argument to ``module`` forces `dill` to verify that the module + being loaded is compatible with the argument value. Additionally, if the + argument is a module (instead of a module name), it supresses the return + value. Each case and behavior is exemplified below: + + 1. `module`: ``None`` --- This call loads a previously saved state of + the module ``math`` and returns this at the end: + + >>> import dill + >>> # load module -> restore state -> return module + >>> dill.load_module('math_session.pkl') + + + 2. 
`module`: ``str`` --- Passing the module name does the same as above, + but also verifies that the module loaded, restored and returned is + indeed ``math``: + + >>> import dill + >>> # load module -> check name/kind -> restore state -> return module + >>> dill.load_module('math_session.pkl', module='math') + + >>> dill.load_module('math_session.pkl', module='cmath') + ValueError: can't update module 'cmath' with the saved state of module 'math' + + 3. `module`: ``ModuleType`` --- Passing the module itself instead of its + name have the additional effect of supressing the return value (and the + module is already loaded at this point): + + >>> import dill + >>> import math + >>> # check name/kind -> restore state -> return None + >>> dill.load_module('math_session.pkl', module=math) + + For imported modules, the return value is meant as a convenience, so that + the function call can substitute an ``import`` statement. Therefore these + statements: + + >>> import dill + >>> math2 = dill.load_module('math_session.pkl', module='math') + + are equivalent to these: + + >>> import dill + >>> import math as math2 + >>> dill.load_module('math_session.pkl', module=math2) + + Note that, in both cases, ``math2`` is just a reference to + ``sys.modules['math']``: + + >>> import math + >>> import sys + >>> math is math2 is sys.modules['math'] + True + Examples: - Save the state of some modules: From be319c8baa3f883a0c7d290c0000f6b3b2bfe329 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 30 Jul 2022 15:07:02 -0300 Subject: [PATCH 18/30] describe session module behavior and use cases --- dill/session.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/dill/session.py b/dill/session.py index 7d16b0ae..72e8c2c2 100644 --- a/dill/session.py +++ b/dill/session.py @@ -8,8 +8,30 @@ # - https://github.com/uqfoundation/dill/blob/master/LICENSE """ Pickle and restore the intepreter session or a module's state. 
+ +The functions :py:func:`dump_module`, :py:func:`load_module` and +:py:func:`load_module_asdict` are capable of saving and restoring, as long as +objects are pickleable, the complete state of a module. For imported modules +that are pickled, `dill` assumes that they are importable when unpickling. + +Contrary to using :py:func:`dill.dump` and :py:func:`dill.load` to save and load +a module object, :py:func:`dill.dump_module` always tries to pickle the module by +value (including built-in modules). Also, options like +``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its +behavior. + +However, if a module contains references to objects originating from other +modules, that would prevent it from pickling or drastically increase its disk +size, they can be saved by reference instead of by value using the option +``refimported``. + +With :py:func:`dump_module`, namespace filters may be used to restrict the list +of variables pickled to a subset of those in the module, based on their names or +values. Also, using :py:func:`load_module_asdict` allows one to load the +variables from different saved states of the same module into dictionaries. 
""" + __all__ = [ 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', 'dump_session', 'load_session' # backward compatibility From a9ea8830fa5d567fe69cf574b1cf7a41b8a134b7 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Mon, 1 Aug 2022 19:38:37 -0300 Subject: [PATCH 19/30] add Python License and copyright notice for modified code as specified by the licensce --- LICENSE | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ dill/_dill.py | 9 +++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 6e9cde5a..a76105cd 100644 --- a/LICENSE +++ b/LICENSE @@ -33,3 +33,56 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--------------------------------------------------------------------------- + +Copyright (c) 2001-2022 Python Software Foundation. +All Rights Reserved. + +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python alone +or in any derivative version, provided, however, that PSF's License +Agreement and PSF's notice of copyright, i.e., "Copyright (c) 2001, +2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, +2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022 Python +Software Foundation; All Rights Reserved" are retained in Python alone +or in any derivative version prepared by Licensee. + +3. 
In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. diff --git a/dill/_dill.py b/dill/_dill.py index 9e2568e3..cafa0d94 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -460,9 +460,14 @@ def _save_module_dict(self, obj): """ Use object name in the module namespace as a last resource to try to save it by reference when pickling fails. - - Modified from Pickler.save_dict() and Pickler._batch_setitems(). """ + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). 
+ # + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # Changes summary: use SETITEM for all pickle protocols and + # conditionally pass an extra argument to a custom implementation of + # the method 'save'. if not self._refonfail: super().save_dict(obj) return From b7224313604ee2b3fca8a80479b7219c7ca8c3ec Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 2 Aug 2022 13:52:36 -0300 Subject: [PATCH 20/30] revert addition of PSF license; add link to license --- LICENSE | 53 --------------------------------------------------- dill/_dill.py | 3 ++- 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/LICENSE b/LICENSE index a76105cd..6e9cde5a 100644 --- a/LICENSE +++ b/LICENSE @@ -33,56 +33,3 @@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------------------- - -Copyright (c) 2001-2022 Python Software Foundation. -All Rights Reserved. - -PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 - -1. This LICENSE AGREEMENT is between the Python Software Foundation -("PSF"), and the Individual or Organization ("Licensee") accessing and -otherwise using this software ("Python") in source or binary form and -its associated documentation. - -2. 
Subject to the terms and conditions of this License Agreement, PSF -hereby grants Licensee a nonexclusive, royalty-free, world-wide -license to reproduce, analyze, test, perform and/or display publicly, -prepare derivative works, distribute, and otherwise use Python alone -or in any derivative version, provided, however, that PSF's License -Agreement and PSF's notice of copyright, i.e., "Copyright (c) 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, -2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022 Python -Software Foundation; All Rights Reserved" are retained in Python alone -or in any derivative version prepared by Licensee. - -3. In the event Licensee prepares a derivative work that is based on -or incorporates Python or any part thereof, and wants to make -the derivative work available to others as provided herein, then -Licensee hereby agrees to include in any such work a brief summary of -the changes made to Python. - -4. PSF is making Python available to Licensee on an "AS IS" -basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR -IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND -DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS -FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT -INFRINGE ANY THIRD PARTY RIGHTS. - -5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON -FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS -A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, -OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. - -6. This License Agreement will automatically terminate upon a material -breach of its terms and conditions. - -7. Nothing in this License Agreement shall be deemed to create any -relationship of agency, partnership, or joint venture between PSF and -Licensee. 
This License Agreement does not grant permission to use PSF -trademarks or trade name in a trademark sense to endorse or promote -products or services of Licensee, or any third party. - -8. By copying, installing or otherwise using Python, Licensee -agrees to be bound by the terms and conditions of this License -Agreement. diff --git a/dill/_dill.py b/dill/_dill.py index cafa0d94..564d7fe6 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -465,7 +465,8 @@ def _save_module_dict(self, obj): # and pickle._Pickler._batch_setitems(). # # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # Changes summary: use SETITEM for all pickle protocols and + # License Agreement: https://opensource.org/licenses/Python-2.0 + # Summary of changes: use SETITEM for all pickle protocols and # conditionally pass an extra argument to a custom implementation of # the method 'save'. if not self._refonfail: From 2a7e984873e1ffb4f8802d3e4885881d20c88ab5 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 3 Aug 2022 16:31:00 -0300 Subject: [PATCH 21/30] _open(): cover all the possible file opening modes --- dill/session.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dill/session.py b/dill/session.py index 72e8c2c2..fb8de0af 100644 --- a/dill/session.py +++ b/dill/session.py @@ -84,11 +84,13 @@ def peek(self, n): def _open(file, mode, *, peekable=False): """return a context manager with an opened file-like object""" import io - attr = 'write' if 'w' in mode else 'read' - was_open = hasattr(file, attr) + readonly = ('r' in mode and '+' not in mode) + if peekable and not readonly: + raise ValueError("the 'peekable' option is invalid for writable files") + was_open = hasattr(file, 'read' if readonly else 'write') if not was_open: file = open(file, mode) - if attr == 'read' and peekable and not hasattr(file, 'peek'): + if readonly and peekable and not hasattr(file, 'peek'): # Try our best to return the stream as an object with a peek() 
method. if hasattr(file, 'tell') and hasattr(file, 'seek'): file = _PeekableReader(file) From fa4fa85c7a00e03b3972d53f3f6a0ef3419117bd Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 3 Aug 2022 16:31:22 -0300 Subject: [PATCH 22/30] grammar --- dill/session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dill/session.py b/dill/session.py index fb8de0af..c8acbe41 100644 --- a/dill/session.py +++ b/dill/session.py @@ -229,7 +229,7 @@ def dump_module( be saved by reference. If this also fails, saving their parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference. Note: - The file-like object must be seekable and truncable with this + The file-like object must be seekable and truncatable with this option set. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. From 92318a7314bbd0bbed2fdc98398954c456387ad3 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Wed, 3 Aug 2022 17:32:42 -0300 Subject: [PATCH 23/30] better document Pickler.save() and Pickler._save_module_dict() --- dill/_dill.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 564d7fe6..0e7a1b20 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -419,7 +419,15 @@ def save_numpy_array(pickler, obj): dump.__doc__ = StockPickler.dump.__doc__ def save(self, obj, save_persistent_id=True, *, name=None): - """If self._refonfail is True, try to save object by reference if pickling fails.""" + # This method overrides StockPickler.save() and is called for every + # object pickled. When 'refonfail' is True, it tries to save the object + # by reference if pickling it fails with a common pickling error, as + # defined by the constant UNPICKLEABLE_ERRORS. 
If that also fails, then + # the exception is risen and, if this was called indirectly from another + # Pickler.save() call, the parent objects will try to be saved by + # reference recursively, until it succeeds or the exception propagates + # beyond the topmost save() call. The extra 'name' argument is passed + # to StockPickler.save_global(). if not self._refonfail: super().save(obj, save_persistent_id) return @@ -430,7 +438,7 @@ def save(self, obj, save_persistent_id=True, *, name=None): memo_size = len(self.memo) try: super().save(obj, save_persistent_id) - except UNPICKLEABLE_ERRORS + (AttributeError,) as error_stack: + except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: # AttributeError may happen in the save_global() call from a child object. if (type(error_stack) == AttributeError and "no attribute '__name__'" not in error_stack.args[0]): @@ -441,6 +449,7 @@ def save(self, obj, save_persistent_id=True, *, name=None): # Roll back memo. for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 + # Try to save object by reference. try: if isinstance(obj, ModuleType) and \ (_is_builtin_module(obj) or obj is sys.modules['dill']): @@ -457,21 +466,22 @@ def save(self, obj, save_persistent_id=True, *, name=None): type(obj).__name__, id(obj), obj=obj) def _save_module_dict(self, obj): + """Save a module's dictionary. + + If an object doesn't have a '__name__' attribute, pass the object's name + in the module's namespace to save(), so that it can be used with + save_global() to increase the chances of finding the object for saving + it by reference in the event of a failed serialization. """ - Use object name in the module namespace as a last resource to try to - save it by reference when pickling fails. - """ - # Modified from Python Standard Library's pickle._Pickler.save_dict() - # and pickle._Pickler._batch_setitems(). 
- # - # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - # Summary of changes: use SETITEM for all pickle protocols and - # conditionally pass an extra argument to a custom implementation of - # the method 'save'. if not self._refonfail: super().save_dict(obj) return + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). Summary of changes: use + # 'SETITEM' for all pickle protocols and conditionally pass an extra + # argument to a custom implementation of the method 'save'. + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # License Agreement: https://opensource.org/licenses/Python-2.0 if self.bin: self.write(EMPTY_DICT) else: # proto 0 -- can't use EMPTY_DICT From 0e365f5c91a8ddbc0dd91a5c3fb9da3d41e442eb Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 02:01:25 -0300 Subject: [PATCH 24/30] move session settings to session.py; changes to refonfail --- dill/_dill.py | 41 ++++++++++++++++++++++++++++++----------- dill/session.py | 31 +++++++++++++++++++------------ dill/settings.py | 4 ---- 3 files changed, 49 insertions(+), 27 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 635e9c12..d039be5e 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -241,7 +241,7 @@ def __reduce_ex__(self, protocol): #: Pickles the entire file (handle and contents), preserving mode and position. FILE_FMODE = 2 -# Exceptions commonly raised by unpicklable objects in the Standard Library. +# Exceptions commonly raised by unpickleable objects in the Standard Library. UNPICKLEABLE_ERRORS = (PicklingError, TypeError, ValueError, NotImplementedError) ### Shorthands (modified from python2.5/lib/pickle.py) @@ -438,13 +438,25 @@ def save(self, obj, save_persistent_id=True, *, name=None): # Store initial state. 
position = self._file_tell() memo_size = len(self.memo) + saved_as_global = False try: super().save(obj, save_persistent_id) except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: # AttributeError may happen in the save_global() call from a child object. - if (type(error_stack) == AttributeError - and "no attribute '__name__'" not in error_stack.args[0]): + if type(error_stack) == AttributeError \ + and "no attribute '__name__'" not in error_stack.args[0]: raise + if self._session and obj is self._main: + warnings.warn( + "module %r being saved by reference due to unpickleable" + " objects in its namespace" % self._main.__name__, + PicklingWarning, + stacklevel=5, + ) + message = ( + "# X: fallback to save as global: <%s object at %#012x>" + % (type(obj).__name__, id(obj)) + ) # Roll back the stream. self._file_seek(position) self._file_truncate() @@ -452,20 +464,27 @@ def save(self, obj, save_persistent_id=True, *, name=None): for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 # Try to save object by reference. 
+ if isinstance(obj, ModuleType) and \ + (_is_builtin_module(obj) or obj is sys.modules['dill']): + self.save_reduce(_import_module, (obj.__name__,), obj=obj) + logger.trace(self, message, obj=obj) + return + if self._session: + if name is None and not (hasattr(obj, '__name__') or hasattr(obj, '__qualname__')): + name = self._id_to_name.get(id(obj)) + if name is not None and self._main.__name__ not in {'__main__', '__main_mp__'}: + self.save_reduce(getattr, (self._main, name), obj=obj) + logger.trace(self, message, obj=obj) + return try: - if isinstance(obj, ModuleType) and \ - (_is_builtin_module(obj) or obj is sys.modules['dill']): - self.save_reduce(_import_module, (obj.__name__,), obj=obj) - else: - self.save_global(obj, name) + self.save_global(obj, name) + logger.trace(self, message, obj=obj) except (AttributeError, PicklingError) as error: if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: # Roll back trace state. self._trace_stack.pop() self._size_stack.pop() raise error from error_stack - logger.trace(self, "# X: fallback to save_global: <%s object at %#012x>", - type(obj).__name__, id(obj), obj=obj) def _save_module_dict(self, obj): """Save a module's dictionary. @@ -562,7 +581,7 @@ def use_diff(on=True): Reduces size of pickles by only including object which have changed. Decreases pickle size but increases CPU time needed. - Also helps avoid some unpicklable objects. + Also helps avoid some unpickleable objects. MUST be called at start of script, otherwise changes will not be recorded. 
""" global _use_diff, diff diff --git a/dill/session.py b/dill/session.py index c8acbe41..38dcc626 100644 --- a/dill/session.py +++ b/dill/session.py @@ -57,6 +57,11 @@ TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) +settings = { + 'refimported': False, + 'refonfail' : True, +} + class _PeekableReader: """lightweight readable stream wrapper that implements peek()""" def __init__(self, stream): @@ -225,12 +230,13 @@ def dump_module( similar but independent from ``dill.settings[`byref`]``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. - refonfail: if `True`, objects that fail to pickle by value will try to - be saved by reference. If this also fails, saving their parent - objects by reference will be attempted recursively. In the worst - case scenario, the module itself may be saved by reference. Note: - The file-like object must be seekable and truncatable with this - option set. + refonfail: if `True` (the default), objects that fail to pickle by value + will try to be saved by reference. If this also fails, saving their + parent objects by reference will be attempted recursively. In the + worst case scenario, the module itself may be saved by reference, + with a warning. Note: this option disables framing for pickle + protocol >= 4. Turning this off may improve unpickling speed, but + may cause a module to fail pickling. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. 
Raises: @@ -305,12 +311,12 @@ def dump_module( refimported = kwds.pop('byref', refimported) module = kwds.pop('main', module) - from .settings import settings - protocol = settings['protocol'] + from .settings import settings as dill_settings + protocol = dill_settings['protocol'] if refimported is None: - refimported = settings['dump_module']['refimported'] + refimported = settings['refimported'] if refonfail is None: - refonfail = settings['dump_module']['refonfail'] + refonfail = settings['refonfail'] main = module if main is None: main = _main_module @@ -339,6 +345,7 @@ def dump_module( if pickler._file_seek is None or pickler._file_truncate is None: raise TypeError("file must have 'tell', 'seek' and 'truncate'" " attributes if the 'refonfail' option is set.") + pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return @@ -367,8 +374,8 @@ def _identify_module(file, main=None): module_name = arg if not ( next(opcodes)[0] in ('TUPLE1', 'TUPLE') and - next(opcodes)[0] == 'REDUCE' and - next(opcodes)[0] in ('EMPTY_DICT', 'DICT') + next(opcodes)[0] == 'REDUCE' #and + #next(opcodes)[0] in ('EMPTY_DICT', 'DICT') ): raise ValueError return module_name diff --git a/dill/settings.py b/dill/settings.py index df1d30a4..140bfb5d 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -21,10 +21,6 @@ 'fmode' : 0, #HANDLE_FMODE 'recurse' : False, 'ignore' : False, - 'dump_module' : { - 'refimported': False, - 'refonfail' : False, - }, } del DEFAULT_PROTOCOL From 9c54e34c52b78db71c3b67dab6da6a5a523241aa Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 19:06:25 -0300 Subject: [PATCH 25/30] add _TruncatableWriter to handle 'refonfail' with non-seekable streams --- dill/session.py | 73 +++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 26 deletions(-) diff --git a/dill/session.py b/dill/session.py index 38dcc626..cb77a40d 100644 --- a/dill/session.py +++ b/dill/session.py @@ -37,10 +37,11 @@ 
'dump_session', 'load_session' # backward compatibility ] -import contextlib +import io import re import sys import warnings +from contextlib import AbstractContextManager, nullcontext, suppress from dill import _dill, Pickler, Unpickler, UnpicklingError from ._dill import ( @@ -62,10 +63,14 @@ 'refonfail' : True, } -class _PeekableReader: +class _PeekableReader(AbstractContextManager): """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream): + def __init__(self, stream, closing=True): self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + if self.closing: + self.stream.close() def read(self, n): return self.stream.read(n) def readline(self): @@ -86,31 +91,52 @@ def peek(self, n): except (AttributeError, OSError): raise NotImplementedError("stream is not peekable: %r", stream) from None -def _open(file, mode, *, peekable=False): +class _TruncatableWriter(io.BytesIO, AbstractContextManager): + """works as an unlimited buffer, writes to file on close""" + def __init__(self, stream, closing=True, *args, **kwds): + super().__init__(*args, **kwds) + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + self.close() + def close(self): + self.stream.write(self.getvalue()) + with suppress(AttributeError): + self.stream.flush() + super().close() + if self.closing: + self.stream.close() + +def _open(file, mode, *, peekable=False, truncatable=False): """return a context manager with an opened file-like object""" - import io readonly = ('r' in mode and '+' not in mode) - if peekable and not readonly: + if not readonly and peekable: raise ValueError("the 'peekable' option is invalid for writable files") - was_open = hasattr(file, 'read' if readonly else 'write') - if not was_open: + if readonly and truncatable: + raise ValueError("the 'truncatable' option is invalid for read-only files") + should_close = not hasattr(file, 'read' if readonly else 'write') + if should_close: file = 
open(file, mode) - if readonly and peekable and not hasattr(file, 'peek'): - # Try our best to return the stream as an object with a peek() method. + # Wrap stream in a helper class if necessary. + if peekable and not hasattr(file, 'peek'): + # Try our best to return it as an object with a peek() method. if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file) + file = _PeekableReader(file, closing=should_close) else: try: file = io.BufferedReader(file) except Exception: - # Stream won't be peekable, but will fail gracefully in _identify_module(). - file = _PeekableReader(file) - if was_open: # should not close at exit - return contextlib.nullcontext(file) - elif type(file) == _PeekableReader: - return contextlib.closing(file) - else: + # It won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file, closing=should_close) + elif truncatable and ( + not hasattr(file, 'truncate') + or (hasattr(file, 'seekable') and not file.seekable()) + ): + file = _TruncatableWriter(file, closing=should_close) + if should_close or isinstance(file, (_PeekableReader, _TruncatableWriter)): return file + else: + return nullcontext(file) def _module_map(): """get map of imported modules""" @@ -327,7 +353,7 @@ def dump_module( original_main = main if refimported: main = _stash_modules(main) - with _open(filename, 'wb') as file: + with _open(filename, 'wb', truncatable=True) as file: pickler = Pickler(file, protocol, **kwds) if main is not original_main: pickler._original_main = original_main @@ -338,13 +364,8 @@ def dump_module( pickler._first_pass = True if refonfail: pickler._refonfail = True # False by default - pickler._file_seek = getattr(file, 'seek', None) - pickler._file_truncate = getattr(file, 'truncate', None) - if hasattr(file, 'seekable') and not file.seekable(): - pickler._file_seek = None - if pickler._file_seek is None or pickler._file_truncate is None: - raise TypeError("file must have 'tell', 'seek' and 
'truncate'" - " attributes if the 'refonfail' option is set.") + pickler._file_seek = file.seek + pickler._file_truncate = file.truncate pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return From ffdd180476b53a9b2e3244c2a781ab67acf7c010 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Thu, 4 Aug 2022 19:30:50 -0300 Subject: [PATCH 26/30] update 'refonfail' example --- dill/_dill.py | 1 - dill/session.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index d039be5e..0ab1ec81 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -438,7 +438,6 @@ def save(self, obj, save_persistent_id=True, *, name=None): # Store initial state. position = self._file_tell() memo_size = len(self.memo) - saved_as_global = False try: super().save(obj, save_persistent_id) except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: diff --git a/dill/session.py b/dill/session.py index cb77a40d..a63bede0 100644 --- a/dill/session.py +++ b/dill/session.py @@ -298,9 +298,9 @@ def dump_module( >>> import dill >>> import os >>> os.altsep = '\\' - >>> dill.dump_module('os_session.pkl', module=os) + >>> dill.dump_module('os_session.pkl', module=os, refonfail=False) PicklingError: ... - >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) + >>> dill.dump_module('os_session.pkl', module=os, refonfail=True) # the default - Restore the state of the saved modules: @@ -410,7 +410,7 @@ def _identify_module(file, main=None): raise UnpicklingError("unable to identify module") from error def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if a file is a module state pickle file. + """Check if a file is a pickle file readable by :py:func:`load_module`. Parameters: filename: a path-like object or a readable stream. 
From f60d239156295876699221b77c9c13f8b0a625a5 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 22:14:01 -0300 Subject: [PATCH 27/30] merge the two save() methods and save_module_dict() with _save_module_dict() --- dill/_dill.py | 123 ++++++++++++++++++++++++------------------------ dill/session.py | 1 - 2 files changed, 61 insertions(+), 63 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index 91a0b51f..bfcd6a67 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -326,8 +326,11 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) - _refonfail = False + _refimported = False + _refonfail = False # True in session.settings _session = False + _first_pass = False + _original_main = None from .settings import settings def __init__(self, file, *args, **kwds): @@ -346,12 +349,23 @@ def __init__(self, file, *args, **kwds): self._postproc = OrderedDict() self._file_tell = getattr(file, 'tell', None) # for logger and refonfail - def save(self, obj, save_persistent_id=True): - # register if the object is a numpy ufunc - # thanks to Paul Kienzle for pointing out ufuncs didn't pickle + def save(self, obj, save_persistent_id=True, *, name=None): + # This method overrides StockPickler.save() and is called for every + # object pickled. When 'refonfail' is True, it tries to save the object + # by reference if pickling it fails with a common pickling error, as + # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then + # the exception is risen and, if this was called indirectly from another + # Pickler.save() call, the parent objects will try to be saved by + # reference recursively, until it succeeds or the exception propagates + # beyond the topmost save() call. The extra 'name' argument is passed + # to StockPickler.save_global(). 
+ + # numpy hack obj_type = type(obj) if NumpyArrayType and not (obj_type is type or obj_type in Pickler.dispatch): - if NumpyUfuncType and numpyufunc(obj_type): + # register if the object is a numpy ufunc + # thanks to Paul Kienzle for pointing out ufuncs didn't pickle + if numpyufunc(obj_type): @register(obj_type) def save_numpy_ufunc(pickler, obj): logger.trace(pickler, "Nu: %s", obj) @@ -365,7 +379,7 @@ def save_numpy_ufunc(pickler, obj): # def uload(name): return getattr(numpy, name) # copy_reg.pickle(NumpyUfuncType, udump, uload) # register if the object is a numpy dtype - if NumpyDType and numpydtype(obj_type): + if numpydtype(obj_type): @register(obj_type) def save_numpy_dtype(pickler, obj): logger.trace(pickler, "Dt: %s", obj) @@ -378,7 +392,7 @@ def save_numpy_dtype(pickler, obj): # def udump(f): return uload, (f.type,) # copy_reg.pickle(NumpyDTypeType, udump, uload) # register if the object is a subclassed numpy array instance - if NumpyArrayType and ndarraysubclassinstance(obj_type): + if ndarraysubclassinstance(obj_type): @register(obj_type) def save_numpy_array(pickler, obj): logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) @@ -387,32 +401,17 @@ def save_numpy_array(pickler, obj): pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) logger.trace(pickler, "# Nu") return - # end hack - if GENERATOR_FAIL and type(obj) == GeneratorType: + # end numpy hack + + if GENERATOR_FAIL and obj_type is GeneratorType: msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType raise PicklingError(msg) - StockPickler.save(self, obj, save_persistent_id) - - save.__doc__ = StockPickler.save.__doc__ - - def dump(self, obj): #NOTE: if settings change, need to update attributes - logger.trace_setup(self) - StockPickler.dump(self, obj) - dump.__doc__ = StockPickler.dump.__doc__ - def save(self, obj, save_persistent_id=True, *, name=None): - # This method overrides StockPickler.save() and is called for every - # 
object pickled. When 'refonfail' is True, it tries to save the object - # by reference if pickling it fails with a common pickling error, as - # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then - # the exception is risen and, if this was called indirectly from another - # Pickler.save() call, the parent objects will try to be saved by - # reference recursively, until it succeeds or the exception propagates - # beyond the topmost save() call. The extra 'name' argument is passed - # to StockPickler.save_global(). if not self._refonfail: super().save(obj, save_persistent_id) return + + # Save with 'refonfail'. # Disable framing (right after the framer.init_framing() call at dump()). self.framer.current_frame = None # Store initial state. @@ -464,36 +463,13 @@ def save(self, obj, save_persistent_id=True, *, name=None): self._trace_stack.pop() self._size_stack.pop() raise error from error_stack + return + save.__doc__ = StockPickler.save.__doc__ - def _save_module_dict(self, obj): - """Save a module's dictionary. - - If an object doesn't have a '__name__' attribute, pass the object's name - in the module's namespace to save(), so that it can be used with - save_global() to increase the chances of finding the object for saving - it by reference in the event of a failed serialization. - """ - if not self._refonfail: - super().save_dict(obj) - return - # Modified from Python Standard Library's pickle._Pickler.save_dict() - # and pickle._Pickler._batch_setitems(). Summary of changes: use - # 'SETITEM' for all pickle protocols and conditionally pass an extra - # argument to a custom implementation of the method 'save'. 
- # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - if self.bin: - self.write(EMPTY_DICT) - else: # proto 0 -- can't use EMPTY_DICT - self.write(MARK + DICT) - self.memoize(obj) - for k, v in obj.items(): - self.save(k) - if hasattr(v, '__name__') or hasattr(v, '__qualname__'): - self.save(v) - else: - self.save(v, name=k) - self.write(SETITEM) + def dump(self, obj): #NOTE: if settings change, need to update attributes + logger.trace_setup(self) + StockPickler.dump(self, obj) + dump.__doc__ = StockPickler.dump.__doc__ class Unpickler(StockUnpickler): """python's Unpickler extended to interpreter sessions and more types""" @@ -1279,16 +1255,39 @@ def save_module_dict(pickler, obj): logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) logger.trace(pickler, "# D4") - elif pickler_is_dill and pickler._session and pickler._first_pass: + elif not (pickler_is_dill and pickler._session and pickler._first_pass and pickler._refonfail): # we only care about session the first pass thru - pickler._first_pass = False - logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) - pickler._save_module_dict(obj) - logger.trace(pickler, "# D5") - else: + if pickler_is_dill and pickler._first_pass: + pickler._first_pass = False logger.trace(pickler, "D2: %s", _repr_dict(obj), obj=obj) StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") + else: + # If an object doesn't have a '__name__' attribute, pass the object's name + # in the module's namespace to save(), so that it can be used with + # save_global() to increase the chances of finding the object for saving + # it by reference in the event of a failed serialization. 
+ pickler._first_pass = False + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + # Modified from Python Standard Library's pickle._Pickler.save_dict() + # and pickle._Pickler._batch_setitems(). Summary of changes: use + # 'SETITEM' for all pickle protocols and conditionally pass an extra + # argument to a custom implementation of the method 'save'. + # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved + # License Agreement: https://opensource.org/licenses/Python-2.0 + if pickler.bin: + pickler.write(EMPTY_DICT) + else: # proto 0 -- can't use EMPTY_DICT + pickler.write(MARK + DICT) + pickler.memoize(obj) + for k, v in obj.items(): + pickler.save(k) + if hasattr(v, '__name__') or hasattr(v, '__qualname__'): + pickler.save(v) + else: + pickler.save(v, name=k) + pickler.write(SETITEM) + logger.trace(pickler, "# D5") return diff --git a/dill/session.py b/dill/session.py index feecc147..608abbcb 100644 --- a/dill/session.py +++ b/dill/session.py @@ -366,7 +366,6 @@ def dump_module( pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate - pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) return From 4fe577b6c3eb693a870bc7dc932b78c7ac63f3dc Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 22:21:00 -0300 Subject: [PATCH 28/30] minor --- dill/_dill.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dill/_dill.py b/dill/_dill.py index bfcd6a67..938f4728 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -39,8 +39,7 @@ #XXX: get types from .objtypes ? 
import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import DICT, EMPTY_DICT, MARK, SETITEM -from struct import pack +from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, SETITEM from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -174,8 +173,6 @@ def get_file_type(*args, **kwargs): import dataclasses import typing -from pickle import GLOBAL - ### Shims for different versions of Python and dill class Sentinel(object): From d059d842fb37f1c7b2b55fd2d60709f59d4cd390 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Sat, 13 Aug 2022 23:09:56 -0300 Subject: [PATCH 29/30] grammar; keep __weakref__ attribute in docs --- dill/session.py | 30 +++++++++++++++--------------- docs/source/conf.py | 2 +- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/dill/session.py b/dill/session.py index 608abbcb..36964eb7 100644 --- a/dill/session.py +++ b/dill/session.py @@ -15,18 +15,18 @@ that are pickled, `dill` assumes that they are importable when unpickling. Contrary of using :py:func:`dill.dump` and :py:func:`dill.load` to save and load -a module object, :py:func:`dill.dump_module` always try to pickle the module by -value (including built-in modules). Also, options like +a module object, :py:func:`dill.dump_module` always tries to pickle the module +by value (including built-in modules). Also, options like ``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its behavior. However, if a module contains references to objects originating from other modules, that would prevent it from pickling or drastically increase its disk -size, they can be saved by reference instead of by value using the option +size, they can be saved by reference instead of by value, using the option ``refimported``. 
With :py:func:`dump_module`, namespace filters may be used to restrict the list -of variables pickled to a subset of those in the module, based on their names or +of pickled variables to a subset of those in the module, based on their names or values. Also, using :py:func:`load_module_asdict` allows one to load the variables from different saved states of the same module into dictionaries. """ @@ -261,8 +261,8 @@ def dump_module( parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference, with a warning. Note: this option disables framing for pickle - protocol >= 4. Turning this off may improve unpickling speed, but - may cause a module to fail pickling. + protocol >= 4. Turning it off may improve unpickling speed, but may + cause a module to fail pickling. **kwds: extra keyword arguments passed to :py:class:`Pickler()`. Raises: @@ -470,7 +470,7 @@ def load_module( value. Each case and behavior is exemplified below: 1. `module`: ``None`` --- This call loads a previously saved state of - the module ``math`` and returns this at the end: + the module ``math`` and returns it (the module object) at the end: >>> import dill >>> # load module -> restore state -> return module @@ -478,7 +478,7 @@ def load_module( 2. `module`: ``str`` --- Passing the module name does the same as above, - but also verifies that the module loaded, restored and returned is + but also verifies that the module being loaded, restored and returned is indeed ``math``: >>> import dill @@ -489,7 +489,7 @@ def load_module( ValueError: can't update module 'cmath' with the saved state of module 'math' 3. 
`module`: ``ModuleType`` --- Passing the module itself instead of its - name have the additional effect of supressing the return value (and the + name has the additional effect of suppressing the return value (and the module is already loaded at this point): >>> import dill @@ -715,22 +715,22 @@ def load_module_asdict( raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") with _open(filename, 'rb', peekable=True) as file: main_name = _identify_module(file) - old_main = sys.modules.get(main_name) + original_main = sys.modules.get(main_name) main = ModuleType(main_name) if update: - if old_main is None: - old_main = _import_module(main_name) - main.__dict__.update(old_main.__dict__) + if original_main is None: + original_main = _import_module(main_name) + main.__dict__.update(original_main.__dict__) else: main.__builtins__ = __builtin__ try: sys.modules[main_name] = main load_module(file, **kwds) finally: - if old_main is None: + if original_main is None: del sys.modules[main_name] else: - sys.modules[main_name] = old_main + sys.modules[main_name] = original_main main.__session__ = str(filename) return main.__dict__ diff --git a/docs/source/conf.py b/docs/source/conf.py index ff34cd55..72c6fdfe 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,7 +72,7 @@ 'private-members': True, 'special-members': True, 'show-inheritance': True, - 'exclude-members': '__dict__, __module__, __slots__, __weakref__', + 'exclude-members': '__dict__, __module__, __slots__', } autodoc_typehints = 'description' napoleon_include_init_with_doc = True From 8bf6157ee40220c1336b74ca99317fdcba8c2103 Mon Sep 17 00:00:00 2001 From: Leonardo Gama Date: Tue, 4 Oct 2022 11:53:19 -0300 Subject: [PATCH 30/30] Add option 'refonfail' for dump_module (enabled by default) Other relevant changes in this commit: - Complement and improve the documentation of _dill.py and session.py - Rename the submodule 'logger' back to 'logging'. 
- Now, the submodule 'logging' is meant to be used also as a proxy to the StdLib module of the same name. - The level for pickling tracing is now 'logging.TRACE', a custom level between INFO and DEBUG - New: if logging level is set to INFO or lower, the variables saved by reference, either by 'refimported' or by 'refonfail', are listed. - New internal functions: _is_imported_module(), _is_stdlib_module() and _module_package() - New private submodule '_utils': new helper _open() context manager for opening streams - More tests added to test_session.py - Etc. --- dill/__init__.py | 19 +- dill/_dill.py | 395 +++++++++----- dill/_utils.py | 122 +++++ dill/detect.py | 2 +- dill/{logger.py => logging.py} | 138 +++-- dill/session.py | 495 ++++++++---------- dill/settings.py | 3 +- .../tests/{test_logger.py => test_logging.py} | 2 +- dill/tests/test_session.py | 214 ++++++-- dill/tests/test_stdlib_modules.py | 136 +++++ dill/tests/test_utils.py | 73 +++ docs/source/conf.py | 8 - docs/source/dill.rst | 83 ++- 13 files changed, 1159 insertions(+), 531 deletions(-) create mode 100644 dill/_utils.py rename dill/{logger.py => logging.py} (68%) rename dill/tests/{test_logger.py => test_logging.py} (97%) create mode 100644 dill/tests/test_stdlib_modules.py create mode 100644 dill/tests/test_utils.py diff --git a/dill/__init__.py b/dill/__init__.py index 3571f54e..a97e973d 100644 --- a/dill/__init__.py +++ b/dill/__init__.py @@ -11,10 +11,10 @@ from .__info__ import __version__, __author__, __doc__, __license__ except: # pragma: no cover import os - import sys + import sys parent = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) sys.path.append(parent) - # get distribution meta info + # get distribution meta info from version import (__version__, __author__, get_license_text, get_readme_as_rst) __license__ = get_license_text(os.path.join(parent, 'LICENSE')) @@ -24,23 +24,25 @@ from ._dill import ( - dump, dumps, load, loads, copy, - Pickler, Unpickler, register, 
pickle, pickles, check, - DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, HANDLE_FMODE, CONTENTS_FMODE, FILE_FMODE, + Pickler, Unpickler, + check, copy, dump, dumps, load, loads, pickle, pickles, register, + DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, CONTENTS_FMODE, FILE_FMODE, HANDLE_FMODE, PickleError, PickleWarning, PicklingError, PicklingWarning, UnpicklingError, UnpicklingWarning, ) from .session import ( - dump_module, load_module, load_module_asdict, is_pickled_module, + dump_module, load_module, load_module_asdict, dump_session, load_session # backward compatibility ) -from . import detect, logger, session, source, temp +from . import detect, logging, session, source, temp # get global settings from .settings import settings # make sure "trace" is turned off -logger.trace(False) +logging.trace(False) + +from importlib import reload objects = {} # local import of dill._objects @@ -66,7 +68,6 @@ def load_types(pickleable=True, unpickleable=True): Returns: None """ - from importlib import reload # local import of dill.objects from . import _objects if pickleable: diff --git a/dill/_dill.py b/dill/_dill.py index 938f4728..85504c44 100644 --- a/dill/_dill.py +++ b/dill/_dill.py @@ -16,9 +16,9 @@ Test against CH16+ Std. Lib. ... TBD. 
""" __all__ = [ - 'dump','dumps','load','loads','copy', - 'Pickler','Unpickler','register','pickle','pickles','check', - 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','HANDLE_FMODE','CONTENTS_FMODE','FILE_FMODE', + 'Pickler','Unpickler', + 'check','copy','dump','dumps','load','loads','pickle','pickles','register', + 'DEFAULT_PROTOCOL','HIGHEST_PROTOCOL','CONTENTS_FMODE','FILE_FMODE','HANDLE_FMODE', 'PickleError','PickleWarning','PicklingError','PicklingWarning','UnpicklingError', 'UnpicklingWarning', ] @@ -26,8 +26,10 @@ __module__ = 'dill' import warnings -from .logger import adapter as logger -from .logger import trace as _trace +from dill import logging +from .logging import adapter as logger +from .logging import trace as _trace +_logger = logging.getLogger(__name__) import os import sys @@ -39,7 +41,7 @@ #XXX: get types from .objtypes ? import builtins as __builtin__ from pickle import _Pickler as StockPickler, Unpickler as StockUnpickler -from pickle import DICT, EMPTY_DICT, GLOBAL, MARK, SETITEM +from pickle import DICT, GLOBAL, MARK, POP, SETITEM from _thread import LockType from _thread import RLock as RLockType #from io import IOBase @@ -59,13 +61,13 @@ import marshal import gc # import zlib +import weakref from weakref import ReferenceType, ProxyType, CallableProxyType from collections import OrderedDict -from functools import partial +from functools import partial, wraps from operator import itemgetter, attrgetter GENERATOR_FAIL = False import importlib.machinery -EXTENSION_SUFFIXES = tuple(importlib.machinery.EXTENSION_SUFFIXES) try: import ctypes HAS_CTYPES = True @@ -323,46 +325,44 @@ class UnpicklingWarning(PickleWarning, UnpicklingError): class Pickler(StockPickler): """python's Pickler extended to interpreter sessions""" dispatch = MetaCatchingDict(StockPickler.dispatch.copy()) + from .settings import settings + # Flags set by dump_module() is dill.session: _refimported = False - _refonfail = False # True in session.settings + _refonfail = False _session 
= False _first_pass = False - _original_main = None - from .settings import settings def __init__(self, file, *args, **kwds): settings = Pickler.settings _byref = kwds.pop('byref', None) + #_strictio = kwds.pop('strictio', None) _fmode = kwds.pop('fmode', None) _recurse = kwds.pop('recurse', None) - #_strictio = kwds.pop('strictio', None) StockPickler.__init__(self, file, *args, **kwds) self._main = _main_module self._diff_cache = {} self._byref = settings['byref'] if _byref is None else _byref + self._strictio = False #_strictio self._fmode = settings['fmode'] if _fmode is None else _fmode self._recurse = settings['recurse'] if _recurse is None else _recurse - self._strictio = False #_strictio self._postproc = OrderedDict() self._file_tell = getattr(file, 'tell', None) # for logger and refonfail - def save(self, obj, save_persistent_id=True, *, name=None): + def save(self, obj, save_persistent_id=True): # This method overrides StockPickler.save() and is called for every # object pickled. When 'refonfail' is True, it tries to save the object # by reference if pickling it fails with a common pickling error, as # defined by the constant UNPICKLEABLE_ERRORS. If that also fails, then - # the exception is risen and, if this was called indirectly from another - # Pickler.save() call, the parent objects will try to be saved by - # reference recursively, until it succeeds or the exception propagates - # beyond the topmost save() call. The extra 'name' argument is passed - # to StockPickler.save_global(). + # the exception is raised and, if this method was called indirectly from + # another Pickler.save() call, the parent objects will try to be saved + # by reference recursively, until it succeeds or the exception + # propagates beyond the topmost save() call. 
- # numpy hack + # register if the object is a numpy ufunc + # thanks to Paul Kienzle for pointing out ufuncs didn't pickle obj_type = type(obj) if NumpyArrayType and not (obj_type is type or obj_type in Pickler.dispatch): - # register if the object is a numpy ufunc - # thanks to Paul Kienzle for pointing out ufuncs didn't pickle - if numpyufunc(obj_type): + if NumpyUfuncType and numpyufunc(obj_type): @register(obj_type) def save_numpy_ufunc(pickler, obj): logger.trace(pickler, "Nu: %s", obj) @@ -376,7 +376,7 @@ def save_numpy_ufunc(pickler, obj): # def uload(name): return getattr(numpy, name) # copy_reg.pickle(NumpyUfuncType, udump, uload) # register if the object is a numpy dtype - if numpydtype(obj_type): + if NumpyDType and numpydtype(obj_type): @register(obj_type) def save_numpy_dtype(pickler, obj): logger.trace(pickler, "Dt: %s", obj) @@ -389,7 +389,7 @@ def save_numpy_dtype(pickler, obj): # def udump(f): return uload, (f.type,) # copy_reg.pickle(NumpyDTypeType, udump, uload) # register if the object is a subclassed numpy array instance - if ndarraysubclassinstance(obj_type): + if NumpyArrayType and ndarraysubclassinstance(obj_type): @register(obj_type) def save_numpy_array(pickler, obj): logger.trace(pickler, "Nu: (%s, %s)", obj.shape, obj.dtype, obj=obj) @@ -398,74 +398,71 @@ def save_numpy_array(pickler, obj): pickler.save_reduce(_create_array, (f,args,state,npdict), obj=obj) logger.trace(pickler, "# Nu") return - # end numpy hack - - if GENERATOR_FAIL and obj_type is GeneratorType: + # end hack + if GENERATOR_FAIL and type(obj) == GeneratorType: msg = "Can't pickle %s: attribute lookup builtins.generator failed" % GeneratorType raise PicklingError(msg) if not self._refonfail: - super().save(obj, save_persistent_id) + StockPickler.save(self, obj, save_persistent_id) return - # Save with 'refonfail'. - # Disable framing (right after the framer.init_framing() call at dump()). + ## Save with 'refonfail' ## + + # Disable framing. 
This must be set right after the + # framer.init_framing() call at StockPickler.dump()). self.framer.current_frame = None # Store initial state. position = self._file_tell() memo_size = len(self.memo) try: - super().save(obj, save_persistent_id) - except (AttributeError, *UNPICKLEABLE_ERRORS) as error_stack: - # AttributeError may happen in the save_global() call from a child object. - if type(error_stack) == AttributeError \ - and "no attribute '__name__'" not in error_stack.args[0]: - raise - if self._session and obj is self._main: - warnings.warn( - "module %r being saved by reference due to unpickleable" - " objects in its namespace" % self._main.__name__, - PicklingWarning, - stacklevel=5, - ) - message = ( + StockPickler.save(self, obj, save_persistent_id) + except UNPICKLEABLE_ERRORS as error_stack: + trace_message = ( "# X: fallback to save as global: <%s object at %#012x>" % (type(obj).__name__, id(obj)) ) - # Roll back the stream. + # Roll back the stream. Note: truncate(position) doesn't always work. self._file_seek(position) self._file_truncate() # Roll back memo. for _ in range(len(self.memo) - memo_size): self.memo.popitem() # LIFO order is guaranteed since 3.7 + # Handle session main. + if self._session and obj is self._main: + if self._main is _main_module or not _is_imported_module(self._main): + raise + # Save an empty dict as state to distinguish from modules saved with dump(). + self.save_reduce(_import_module, (obj.__name__,), obj=obj, state={}) + logger.trace(self, trace_message, obj=obj) + warnings.warn( + "module %r saved by reference due to the unpickleable " + "variable %r. No changes to the module were saved." + % (self._main.__name__, error_stack.name), + PicklingWarning, + stacklevel=5, + ) # Try to save object by reference. 
- if isinstance(obj, ModuleType) and \ - (_is_builtin_module(obj) or obj is sys.modules['dill']): - self.save_reduce(_import_module, (obj.__name__,), obj=obj) - logger.trace(self, message, obj=obj) - return - if self._session: - if name is None and not (hasattr(obj, '__name__') or hasattr(obj, '__qualname__')): - name = self._id_to_name.get(id(obj)) - if name is not None and self._main.__name__ not in {'__main__', '__main_mp__'}: - self.save_reduce(getattr, (self._main, name), obj=obj) - logger.trace(self, message, obj=obj) - return - try: - self.save_global(obj, name) - logger.trace(self, message, obj=obj) - except (AttributeError, PicklingError) as error: - if getattr(self, '_trace_stack', None) and id(obj) == self._trace_stack[-1]: + elif hasattr(obj, '__name__') or hasattr(obj, '__qualname__'): + try: + self.save_global(obj) + logger.trace(self, trace_message, obj=obj) + return True # for _saved_byref, ignored otherwise + except PicklingError as error: # Roll back trace state. - self._trace_stack.pop() - self._size_stack.pop() - raise error from error_stack + logger.roll_back(self, obj) + raise error from error_stack + else: + # Roll back trace state. + logger.roll_back(self, obj) + raise return save.__doc__ = StockPickler.save.__doc__ def dump(self, obj): #NOTE: if settings change, need to update attributes logger.trace_setup(self) StockPickler.dump(self, obj) + dump.__doc__ = StockPickler.dump.__doc__ class Unpickler(StockUnpickler): @@ -533,7 +530,7 @@ def use_diff(on=True): Reduces size of pickles by only including object which have changed. Decreases pickle size but increases CPU time needed. - Also helps avoid some unpickleable objects. + Also helps avoid some unpicklable objects. MUST be called at start of script, otherwise changes will not be recorded. 
""" global _use_diff, diff @@ -1230,61 +1227,160 @@ def save_code(pickler, obj): logger.trace(pickler, "# Co") return +def _module_map(main_module): + """get map of imported modules""" + from collections import defaultdict + from types import SimpleNamespace + modmap = SimpleNamespace( + by_name = defaultdict(list), + by_id = defaultdict(list), + top_level = {}, # top-level modules + module = main_module.__name__, + package = _module_package(main_module), + ) + for modname, module in sys.modules.items(): + if (modname in ('__main__', '__mp_main__') or module is main_module + or not isinstance(module, ModuleType)): + continue + if '.' not in modname: + modmap.top_level[id(module)] = modname + for objname, modobj in module.__dict__.items(): + modmap.by_name[objname].append((modobj, modname)) + modmap.by_id[id(modobj)].append((objname, modname)) + return modmap + +def _lookup_module(modmap, name, obj, lookup_by_id=True) -> typing.Tuple[str, str, bool]: + """Lookup name or id of obj if module is imported. + + Lookup for objects identical to 'obj' at modules in 'modmpap'. If multiple + copies are found in different modules, return the one from the module with + higher probability of being available at unpickling time, according to the + hierarchy: + + 1. Standard Library modules + 2. modules of the same top-level package as the module being saved (if it's part of a package) + 3. installed modules in general + 4. non-installed modules + + Returns: + A 3-tuple containing the module's name, the object's name in the module, + and a boolean flag, which is `True` if the module falls under categories + (1) to (3) from the hierarchy, or `False` if it's in category (4). + """ + not_found = None, None, None + # Don't look for objects likely related to the module itself. 
+ obj_module = getattr(obj, '__module__', type(obj).__module__) + if obj_module == modmap.module: + return not_found + obj_package = _module_package(_import_module(obj_module, safe=True)) + + for map, by_id in [(modmap.by_name, False), (modmap.by_id, True)]: + if by_id and not lookup_by_id: + break + _2nd_choice = _3rd_choice = _4th_choice = None + key = id(obj) if by_id else name + for other, modname in map[key]: + if by_id or other is obj: + other_name = other if by_id else name + other_module = sys.modules[modname] + other_package = _module_package(other_module) + # Don't return a reference to a module of another package + # if the object is likely from the same top-level package. + if (modmap.package and obj_package == modmap.package + and other_package != modmap.package): + continue + # Prefer modules imported earlier (the first found). + if _is_stdlib_module(other_module): + return modname, other_name, True + elif modmap.package and modmap.package == other_package: + if _2nd_choice: continue + _2nd_choice = modname, other_name, True + elif not _2nd_choice: + # Don't call _is_builtin_module() unnecessarily. 
+ if _is_builtin_module(other_module): + if _3rd_choice: continue + _3rd_choice = modname, other_name, True + else: + if _4th_choice: continue + _4th_choice = modname, other_name, False # unsafe + found = _2nd_choice or _3rd_choice or _4th_choice + if found: + return found + return not_found + +def _global_string(modname, name): + return GLOBAL + bytes('%s\n%s\n' % (modname, name), 'UTF-8') + +def _save_module_dict(pickler, main_dict): + """Save a module's dictionary, saving unpickleable variables by referece.""" + main = getattr(pickler, '_original_main', pickler._main) + modmap = getattr(pickler, '_modmap', None) # cached from _stash_modules() + is_builtin = _is_builtin_module(main) + pickler.write(MARK + DICT) # don't need to memoize + for name, value in main_dict.items(): + _logger.debug("Pickling %r (%s)", name, type(value).__name__) + pickler.save(name) + try: + if pickler.save(value): + global_name = getattr(value, '__qualname__', value.__name__) + pickler._saved_byref.append((name, value.__module__, global_name)) + except UNPICKLEABLE_ERRORS as error_stack: + if modmap is None: + modmap = _module_map(main) + modname, objname, installed = _lookup_module(modmap, name, value) + if modname and (installed or not is_builtin): + pickler.write(_global_string(modname, objname)) + pickler._saved_byref.append((name, modname, objname)) + elif is_builtin: + pickler.write(_global_string(main.__name__, name)) + pickler._saved_byref.append((name, main.__name__, name)) + else: + error = PicklingError("can't save variable %r as global" % name) + error.name = name + raise error from error_stack + pickler.memoize(value) + pickler.write(SETITEM) + def _repr_dict(obj): - """make a short string representation of a dictionary""" + """Make a short string representation of a dictionary.""" return "<%s object at %#012x>" % (type(obj).__name__, id(obj)) @register(dict) def save_module_dict(pickler, obj): - pickler_is_dill = is_dill(pickler, child=False) - if pickler_is_dill and obj == 
pickler._main.__dict__ and \ - not (pickler._session and pickler._first_pass): + is_pickler_dill = is_dill(pickler, child=False) + if (is_pickler_dill + and obj is pickler._main.__dict__ + and not (pickler._session and pickler._first_pass)): logger.trace(pickler, "D1: %s", _repr_dict(obj), obj=obj) - pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8')) + pickler.write(GLOBAL + b'__builtin__\n__main__\n') logger.trace(pickler, "# D1") - elif (not pickler_is_dill) and (obj == _main_module.__dict__): + elif not is_pickler_dill and obj is _main_module.__dict__: #prama: no cover logger.trace(pickler, "D3: %s", _repr_dict(obj), obj=obj) - pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8')) #XXX: works in general? + pickler.write(GLOBAL + b'__main__\n__dict__\n') #XXX: works in general? logger.trace(pickler, "# D3") - elif '__name__' in obj and obj != _main_module.__dict__ \ - and type(obj['__name__']) is str \ - and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None): + elif (is_pickler_dill + and pickler._session + and pickler._refonfail + and obj is pickler._main_dict_copy): + logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) + # we only care about session the first pass thru + pickler.first_pass = False + _save_module_dict(pickler, obj) + logger.trace(pickler, "# D5") + elif ('__name__' in obj + and obj is not _main_module.__dict__ + and type(obj['__name__']) is str + and obj is getattr(_import_module(obj['__name__'], safe=True), '__dict__', None)): logger.trace(pickler, "D4: %s", _repr_dict(obj), obj=obj) - pickler.write(bytes('c%s\n__dict__\n' % obj['__name__'], 'UTF-8')) + pickler.write(_global_string(obj['__name__'], '__dict__')) logger.trace(pickler, "# D4") - elif not (pickler_is_dill and pickler._session and pickler._first_pass and pickler._refonfail): - # we only care about session the first pass thru - if pickler_is_dill and pickler._first_pass: - pickler._first_pass = False + else: logger.trace(pickler, "D2: %s", 
_repr_dict(obj), obj=obj) + if is_pickler_dill: + # we only care about session the first pass thru + pickler._first_pass = False StockPickler.save_dict(pickler, obj) logger.trace(pickler, "# D2") - else: - # If an object doesn't have a '__name__' attribute, pass the object's name - # in the module's namespace to save(), so that it can be used with - # save_global() to increase the chances of finding the object for saving - # it by reference in the event of a failed serialization. - pickler._first_pass = False - logger.trace(pickler, "D5: %s", _repr_dict(obj), obj=obj) - # Modified from Python Standard Library's pickle._Pickler.save_dict() - # and pickle._Pickler._batch_setitems(). Summary of changes: use - # 'SETITEM' for all pickle protocols and conditionally pass an extra - # argument to a custom implementation of the method 'save'. - # Copyright (c) 2001-2022 Python Software Foundation; All Rights Reserved - # License Agreement: https://opensource.org/licenses/Python-2.0 - if pickler.bin: - pickler.write(EMPTY_DICT) - else: # proto 0 -- can't use EMPTY_DICT - pickler.write(MARK + DICT) - pickler.memoize(obj) - for k, v in obj.items(): - pickler.save(k) - if hasattr(v, '__name__') or hasattr(v, '__qualname__'): - pickler.save(v) - else: - pickler.save(v, name=k) - pickler.write(SETITEM) - logger.trace(pickler, "# D5") return @@ -1683,18 +1779,72 @@ def save_weakproxy(pickler, obj): logger.trace(pickler, "# R2") return +def _weak_cache(func=None, *, defaults=None): + if defaults is None: + defaults = {} + if func is None: + return partial(_weak_cache, defaults=defaults) + cache = weakref.WeakKeyDictionary() + @wraps(func) + def wrapper(referent): + try: + return defaults[referent] + except KeyError: + try: + return cache[referent] + except KeyError: + value = func(referent) + cache[referent] = value + return value + return wrapper + +@_weak_cache(defaults={None: False}) +def _is_imported_module(module): + return getattr(module, '__loader__', None) is not None or 
module in sys.modules.values() + +PYTHONPATH_PREFIXES = {getattr(sys, attr) for attr in ( + 'base_prefix', 'prefix', 'base_exec_prefix', 'exec_prefix', + 'real_prefix', # for old virtualenv versions + ) if hasattr(sys, attr)} +PYTHONPATH_PREFIXES = tuple(os.path.realpath(path) for path in PYTHONPATH_PREFIXES) +EXTENSION_SUFFIXES = tuple(importlib.machinery.EXTENSION_SUFFIXES) +if OLD310: + STDLIB_PREFIX = os.path.dirname(os.path.realpath(os.__file__)) + +@_weak_cache(defaults={None: True}) #XXX: shouldn't return False for None? def _is_builtin_module(module): - if not hasattr(module, "__file__"): return True + if module.__name__ in ('__main__', '__mp_main__'): + return False + mod_path = getattr(module, '__file__', None) + if not mod_path: + return _is_imported_module(module) # If a module file name starts with prefix, it should be a builtin # module, so should always be pickled as a reference. - names = ["base_prefix", "base_exec_prefix", "exec_prefix", "prefix", "real_prefix"] - return any(os.path.realpath(module.__file__).startswith(os.path.realpath(getattr(sys, name))) - for name in names if hasattr(sys, name)) or \ - module.__file__.endswith(EXTENSION_SUFFIXES) or \ - 'site-packages' in module.__file__ + mod_path = os.path.realpath(mod_path) + return ( + any(mod_path.startswith(prefix) for prefix in PYTHONPATH_PREFIXES) + or mod_path.endswith(EXTENSION_SUFFIXES) + or 'site-packages' in mod_path + ) -def _is_imported_module(module): - return getattr(module, '__loader__', None) is not None or module in sys.modules.values() +@_weak_cache(defaults={None: False}) +def _is_stdlib_module(module): + first_level = module.__name__.partition('.')[0] + if OLD310: + if first_level in sys.builtin_module_names: + return True + mod_path = getattr(module, '__file__', '') + if mod_path: + mod_path = os.path.realpath(mod_path) + return mod_path.startswith(STDLIB_PREFIX) + else: + return first_level in sys.stdlib_module_names + +@_weak_cache(defaults={None: None}) +def 
_module_package(module): + """get the top-level package of a module, if any""" + package = getattr(module, '__package__', None) + return package.partition('.')[0] if package else None @register(ModuleType) def save_module(pickler, obj): @@ -1706,7 +1856,7 @@ def save_module(pickler, obj): pass else: logger.trace(pickler, "M2: %s with diff", obj) - logger.info("Diff: %s", changed.keys()) + logger.trace(pickler, "Diff: %s", changed.keys()) pickler.save_reduce(_import_module, (obj.__name__,), obj=obj, state=changed) logger.trace(pickler, "# M2") @@ -1717,13 +1867,16 @@ def save_module(pickler, obj): logger.trace(pickler, "# M1") else: builtin_mod = _is_builtin_module(obj) - if obj.__name__ not in ("builtins", "dill", "dill._dill") and not builtin_mod or \ - is_dill(pickler, child=True) and obj is pickler._main: + is_session_main = is_dill(pickler, child=True) and obj is pickler._main + if (obj.__name__ not in ("builtins", "dill", "dill._dill") and not builtin_mod + or is_session_main): logger.trace(pickler, "M1: %s", obj) _main_dict = obj.__dict__.copy() #XXX: better no copy? option to copy? 
[_main_dict.pop(item, None) for item in singletontypes + ["__builtins__", "__loader__"]] mod_name = obj.__name__ if _is_imported_module(obj) else '__runtime__.%s' % obj.__name__ + if is_session_main: + pickler._main_dict_copy = _main_dict pickler.save_reduce(_import_module, (mod_name,), obj=obj, state=_main_dict) logger.trace(pickler, "# M1") @@ -1761,7 +1914,7 @@ def save_type(pickler, obj, postproc_list=None): elif obj is type(None): logger.trace(pickler, "T7: %s", obj) #XXX: pickler.save_reduce(type, (None,), obj=obj) - pickler.write(bytes('c__builtin__\nNoneType\n', 'UTF-8')) + pickler.write(GLOBAL + b'__builtin__\nNoneType\n') logger.trace(pickler, "# T7") elif obj is NotImplementedType: logger.trace(pickler, "T7: %s", obj) @@ -1881,7 +2034,7 @@ def save_function(pickler, obj): # If the globals is the __dict__ from the module being saved as a # session, substitute it by the dictionary being actually saved. if _original_main is not None and globs_copy is _original_main.__dict__: - globs_copy = getattr(pickler, '_main', _original_main).__dict__ + globs_copy = pickler._main.__dict__ globs = globs_copy # If the globals is a module __dict__, do not save it in the pickle. elif globs_copy is not None and obj.__module__ is not None and \ diff --git a/dill/_utils.py b/dill/_utils.py new file mode 100644 index 00000000..912a2e8e --- /dev/null +++ b/dill/_utils.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE +""" +Auxiliary classes and functions used in more than one module, defined here to +avoid circular import problems. +""" + +import contextlib +import io +import math +from contextlib import suppress + +#NOTE: dill._dill is not completely loaded at this point, can't import from it. +from dill import _dill + +# Type hints. 
+from typing import Tuple, Union + +def _format_bytes_size(size: Union[int, float]) -> Tuple[int, str]: + """Return bytes size text representation in human-readable form.""" + unit = "B" + power_of_2 = math.trunc(size).bit_length() - 1 + magnitude = min(power_of_2 - power_of_2 % 10, 80) # 2**80 == 1 YiB + if magnitude: + # Rounding trick: 1535 (1024 + 511) -> 1K; 1536 -> 2K + size = ((size >> magnitude-1) + 1) >> 1 + unit = "%siB" % "KMGTPEZY"[(magnitude // 10) - 1] + return size, unit + + +## File-related utilities ## + +class _PeekableReader(contextlib.AbstractContextManager): + """lightweight readable stream wrapper that implements peek()""" + def __init__(self, stream, closing=True): + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + if self.closing: + self.stream.close() + def read(self, n): + return self.stream.read(n) + def readline(self): + return self.stream.readline() + def tell(self): + return self.stream.tell() + def close(self): + return self.stream.close() + def peek(self, n): + stream = self.stream + try: + if hasattr(stream, 'flush'): + stream.flush() + position = stream.tell() + stream.seek(position) # assert seek() works before reading + chunk = stream.read(n) + stream.seek(position) + return chunk + except (AttributeError, OSError): + raise NotImplementedError("stream is not peekable: %r", stream) from None + +class _SeekableWriter(io.BytesIO, contextlib.AbstractContextManager): + """works as an unlimited buffer, writes to file on close""" + def __init__(self, stream, closing=True, *args, **kwds): + super().__init__(*args, **kwds) + self.stream = stream + self.closing = closing + def __exit__(self, *exc_info): + self.close() + def close(self): + self.stream.write(self.getvalue()) + with suppress(AttributeError): + self.stream.flush() + super().close() + if self.closing: + self.stream.close() + +def _open(file, mode, *, peekable=False, seekable=False): + """return a context manager with an opened file-like object"""
+ readonly = ('r' in mode and '+' not in mode) + if not readonly and peekable: + raise ValueError("the 'peekable' option is invalid for writable files") + if readonly and seekable: + raise ValueError("the 'seekable' option is invalid for read-only files") + should_close = not hasattr(file, 'read' if readonly else 'write') + if should_close: + file = open(file, mode) + # Wrap stream in a helper class if necessary. + if peekable and not hasattr(file, 'peek'): + # Try our best to return it as an object with a peek() method. + if hasattr(file, 'seekable'): + file_seekable = file.seekable() + elif hasattr(file, 'seek') and hasattr(file, 'tell'): + try: + file.seek(file.tell()) + file_seekable = True + except Exception: + file_seekable = False + else: + file_seekable = False + if file_seekable: + file = _PeekableReader(file, closing=should_close) + else: + try: + file = io.BufferedReader(file) + except Exception: + # It won't be peekable, but will fail gracefully in _identify_module(). + file = _PeekableReader(file, closing=should_close) + elif seekable and ( + not hasattr(file, 'seek') + or not hasattr(file, 'truncate') + or (hasattr(file, 'seekable') and not file.seekable()) + ): + file = _SeekableWriter(file, closing=should_close) + if should_close or isinstance(file, (_PeekableReader, _SeekableWriter)): + return file + else: + return contextlib.nullcontext(file) diff --git a/dill/detect.py b/dill/detect.py index b6a6cb76..e6149d15 100644 --- a/dill/detect.py +++ b/dill/detect.py @@ -13,7 +13,7 @@ from inspect import ismethod, isfunction, istraceback, isframe, iscode from .pointers import parent, reference, at, parents, children -from .logger import trace +from .logging import trace __all__ = ['baditems','badobjects','badtypes','code','errors','freevars', 'getmodule','globalvars','nestedcode','nestedglobals','outermost', diff --git a/dill/logger.py b/dill/logging.py similarity index 68% rename from dill/logger.py rename to dill/logging.py index 98f8ff49..92386e0c 
100644 --- a/dill/logger.py +++ b/dill/logging.py @@ -11,37 +11,45 @@ The 'logger' object is dill's top-level logger. The 'adapter' object wraps the logger and implements a 'trace()' method that -generates a detailed tree-style trace for the pickling call at log level INFO. +generates a detailed tree-style trace for the pickling call at log level +:const:`dill.logging.TRACE`, which has an intermediary value between +:const:`logging.INFO` and :const:`logging.DEBUG`. The 'trace()' function sets and resets dill's logger log level, enabling and disabling the pickling trace. The trace shows a tree structure depicting the depth of each object serialized *with dill save functions*, but not the ones that use save functions from -'pickle._Pickler.dispatch'. If the information is available, it also displays +``pickle._Pickler.dispatch``. If the information is available, it also displays the size in bytes that the object contributed to the pickle stream (including its child objects). Sample trace output: - >>> import dill, dill.tests - >>> dill.detect.trace(True) - >>> dill.dump_session(main=dill.tests) - ┬ M1: - ├┬ F2: + >>> import dill + >>> import keyword + >>> with dill.detect.trace(): + ...
dill.dump_module(module=keyword) + ┬ M1: + ├┬ F2: │└ # F2 [32 B] - ├┬ D2: + ├┬ D5: │├┬ T4: ││└ # T4 [35 B] - │├┬ D2: + │├┬ D2: ││├┬ T4: │││└ # T4 [50 B] - ││├┬ D2: - │││└ # D2 [84 B] - ││└ # D2 [413 B] - │└ # D2 [763 B] - └ # M1 [813 B] + ││├┬ D2: + │││└ # D2 [47 B] + ││└ # D2 [280 B] + │└ # D5 [1 KiB] + └ # M1 [1 KiB] """ -__all__ = ['adapter', 'logger', 'trace'] +from __future__ import annotations + +__all__ = [ + 'adapter', 'logger', 'trace', 'getLogger', + 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'TRACE', 'DEBUG', 'NOTSET', +] import codecs import contextlib @@ -49,10 +57,21 @@ import logging import math import os +from contextlib import suppress +from logging import getLogger, CRITICAL, ERROR, WARNING, INFO, DEBUG, NOTSET from functools import partial -from typing import TextIO, Union +from typing import Optional, TextIO, Union import dill +from ._utils import _format_bytes_size + +# Intermediary logging level for tracing. +TRACE = (INFO + DEBUG) // 2 + +_nameOrBoolToLevel = logging._nameToLevel.copy() +_nameOrBoolToLevel['TRACE'] = TRACE +_nameOrBoolToLevel[False] = WARNING +_nameOrBoolToLevel[True] = TRACE # Tree drawing characters: Unicode to ASCII map. ASCII_MAP = str.maketrans({"│": "|", "├": "|", "┬": "+", "└": "`"}) @@ -105,13 +124,24 @@ class TraceAdapter(logging.LoggerAdapter): creates extra values to be added in the LogRecord from it, then calls 'info()'. - Usage of logger with 'trace()' method: + Examples: - >>> from dill.logger import adapter as logger #NOTE: not dill.logger.logger - >>> ... - >>> def save_atype(pickler, obj): - >>> logger.trace(pickler, "Message with %s and %r etc. placeholders", 'text', obj) - >>> ... + In the first call to `trace()`, before pickling an object, it must be passed + to `trace()` as the last positional argument or as the keyword argument + `obj`. 
Note how, in the second example, the object is not passed as a + positional argument, and therefore won't be substituted in the message: + + >>> from dill.logging import adapter as logger #NOTE: not dill.logging.logger + >>> ... + >>> def save_atype(pickler, obj): + >>> logger.trace(pickler, "X: Message with %s and %r placeholders", 'text', obj) + >>> ... + >>> logger.trace(pickler, "# X") + >>> def save_weakproxy(pickler, obj): + >>> trace_message = "W: This works even with a broken weakproxy: %r" % obj + >>> logger.trace(pickler, trace_message, obj=obj) + >>> ... + >>> logger.trace(pickler, "# W") """ def __init__(self, logger): self.logger = logger @@ -128,7 +158,7 @@ def trace_setup(self, pickler): # Called by Pickler.dump(). if not dill._dill.is_dill(pickler, child=False): return - if self.isEnabledFor(logging.INFO): + elif self.isEnabledFor(TRACE): pickler._trace_stack = [] pickler._size_stack = [] else: @@ -137,16 +167,22 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): if not hasattr(pickler, '_trace_stack'): logger.info(msg, *args, **kwargs) return - if pickler._trace_stack is None: + elif pickler._trace_stack is None: return extra = kwargs.get('extra', {}) pushed_obj = msg.startswith('#') if not pushed_obj: + if obj is None and (not args or type(args[-1]) is str): + raise TypeError( + "the pickled object must be passed as the last positional " + "argument (being substituted in the message) or as the " + "'obj' keyword argument." + ) if obj is None: obj = args[-1] pickler._trace_stack.append(id(obj)) size = None - try: + with suppress(AttributeError, TypeError): # Streams are not required to be tellable.
size = pickler._file_tell() frame = pickler.framer.current_frame @@ -155,11 +191,12 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): except AttributeError: # PyPy may use a BytesBuilder as frame size += len(frame) - except (AttributeError, TypeError): - pass if size is not None: if not pushed_obj: pickler._size_stack.append(size) + if len(pickler._size_stack) == 3: # module > dict > variable + with suppress(AttributeError, KeyError): + extra['varname'] = pickler._id_to_name.pop(id(obj)) else: size -= pickler._size_stack.pop() extra['size'] = size @@ -168,6 +205,10 @@ def trace(self, pickler, msg, *args, obj=None, **kwargs): self.info(msg, *args, **kwargs) if pushed_obj: pickler._trace_stack.pop() + def roll_back(self, pickler, obj): + if pickler._trace_stack and id(obj) == pickler._trace_stack[-1]: + pickler._trace_stack.pop() + pickler._size_stack.pop() class TraceFormatter(logging.Formatter): """ @@ -202,24 +243,26 @@ def format(self, record): if not self.is_utf8: prefix = prefix.translate(ASCII_MAP) + "-" fields['prefix'] = prefix + " " - if hasattr(record, 'size'): - # Show object size in human-redable form. 
- power = int(math.log(record.size, 2)) // 10 - size = record.size >> power*10 - fields['suffix'] = " [%d %sB]" % (size, "KMGTP"[power] + "i" if power else "") + if hasattr(record, 'varname'): + fields['suffix'] = " as %r" % record.varname + elif hasattr(record, 'size'): + fields['suffix'] = " [%d %s]" % _format_bytes_size(record.size) vars(record).update(fields) return super().format(record) -logger = logging.getLogger('dill') +logger = getLogger('dill') logger.propagate = False adapter = TraceAdapter(logger) stderr_handler = logging._StderrHandler() adapter.addHandler(stderr_handler) -def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') -> None: +def trace( + arg: Union[bool, str, TextIO, os.PathLike] = None, *, mode: str = 'a' + ) -> Optional[TraceManager]: """print a trace through the stack when pickling; useful for debugging - With a single boolean argument, enable or disable the tracing. + With a single boolean argument, enable or disable the tracing. Or, with a + logging level name (not ``int``), set the logging level of the dill logger. Example usage: @@ -229,10 +272,10 @@ def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') Alternatively, ``trace()`` can be used as a context manager. With no arguments, it just takes care of restoring the tracing state on exit. - Either a file handle, or a file name and (optionally) a file mode may be - specitfied to redirect the tracing output in the ``with`` block context. A - log function is yielded by the manager so the user can write extra - information to the file. + Either a file handle, or a file name and a file mode (optional) may be + specified to redirect the tracing output in the ``with`` block. A ``log()`` + function is yielded by the manager so the user can write extra information + to the file. 
Example usage: @@ -251,13 +294,18 @@ def trace(arg: Union[bool, TextIO, str, os.PathLike] = None, *, mode: str = 'a') >>> log("> squared = %r", squared) >>> dumps(squared) - Arguments: - arg: a boolean value, or an optional file-like or path-like object for the context manager - mode: mode string for ``open()`` if a file name is passed as the first argument + Parameters: + arg: a boolean value, the name of a logging level (including "TRACE") + or an optional file-like or path-like object for the context manager + mode: mode string for ``open()`` if a file name is passed as the first + argument """ - if not isinstance(arg, bool): + level = _nameOrBoolToLevel.get(arg) if isinstance(arg, (bool, str)) else None + if level is not None: + logger.setLevel(level) + return + else: return TraceManager(file=arg, mode=mode) - logger.setLevel(logging.INFO if arg else logging.WARNING) class TraceManager(contextlib.AbstractContextManager): """context manager version of trace(); can redirect the trace to a file""" @@ -276,7 +324,7 @@ def __enter__(self): adapter.removeHandler(stderr_handler) adapter.addHandler(self.handler) self.old_level = adapter.getEffectiveLevel() - adapter.setLevel(logging.INFO) + adapter.setLevel(TRACE) return adapter.info def __exit__(self, *exc_info): adapter.setLevel(self.old_level) diff --git a/dill/session.py b/dill/session.py index 36964eb7..a993407a 100644 --- a/dill/session.py +++ b/dill/session.py @@ -9,14 +9,14 @@ """ Pickle and restore the intepreter session or a module's state. -The functions :py:func:`dump_module`, :py:func:`load_module` and -:py:func:`load_module_asdict` are capable of saving and restoring, as long as +The functions :func:`dump_module`, :func:`load_module` and +:func:`load_module_asdict` are capable of saving and restoring, as long as objects are pickleable, the complete state of a module. For imported modules that are pickled, `dill` assumes that they are importable when unpickling. 
-Contrary of using :py:func:`dill.dump` and :py:func:`dill.load` to save and load -a module object, :py:func:`dill.dump_module` always tries to pickle the module -by value (including built-in modules). Also, options like +Contrary of using :func:`dill.dump` and :func:`dill.load` to save and load a +module object, :func:`dill.dump_module` always tries to pickle the module by +value (including built-in modules). Also, options like ``dill.settings['byref']`` and ``dill.settings['recurse']`` don't affect its behavior. @@ -24,180 +24,74 @@ modules, that would prevent it from pickling or drastically increase its disk size, they can be saved by reference instead of by value, using the option ``refimported``. - -With :py:func:`dump_module`, namespace filters may be used to restrict the list -of pickled variables to a subset of those in the module, based on their names or -values. Also, using :py:func:`load_module_asdict` allows one to load the -variables from different saved states of the same module into dictionaries. """ - __all__ = [ - 'dump_module', 'load_module', 'load_module_asdict', 'is_pickled_module', + 'dump_module', 'load_module', 'load_module_asdict', 'dump_session', 'load_session' # backward compatibility ] -import io import re import sys import warnings -from contextlib import AbstractContextManager, nullcontext, suppress -from dill import _dill, Pickler, Unpickler, UnpicklingError +from dill import _dill, logging +from dill import Pickler, Unpickler, UnpicklingError from ._dill import ( BuiltinMethodType, FunctionType, MethodType, ModuleType, TypeType, - _import_module, _is_builtin_module, _is_imported_module, _main_module, - _reverse_typemap, __builtin__, + _import_module, _is_builtin_module, _is_imported_module, + _lookup_module, _main_module, _module_map, _reverse_typemap, __builtin__, ) +from ._utils import _open + +logger = logging.getLogger(__name__) # Type hints. 
-from typing import Optional, Union +from typing import Any, Dict, Optional, Union import pathlib import tempfile TEMPDIR = pathlib.PurePath(tempfile.gettempdir()) -settings = { - 'refimported': False, - 'refonfail' : True, -} - -class _PeekableReader(AbstractContextManager): - """lightweight readable stream wrapper that implements peek()""" - def __init__(self, stream, closing=True): - self.stream = stream - self.closing = closing - def __exit__(self, *exc_info): - if self.closing: - self.stream.close() - def read(self, n): - return self.stream.read(n) - def readline(self): - return self.stream.readline() - def tell(self): - return self.stream.tell() - def close(self): - return self.stream.close() - def peek(self, n): - stream = self.stream - try: - if hasattr(stream, 'flush'): stream.flush() - position = stream.tell() - stream.seek(position) # assert seek() works before reading - chunk = stream.read(n) - stream.seek(position) - return chunk - except (AttributeError, OSError): - raise NotImplementedError("stream is not peekable: %r", stream) from None - -class _TruncatableWriter(io.BytesIO, AbstractContextManager): - """works as an unlimited buffer, writes to file on close""" - def __init__(self, stream, closing=True, *args, **kwds): - super().__init__(*args, **kwds) - self.stream = stream - self.closing = closing - def __exit__(self, *exc_info): - self.close() - def close(self): - self.stream.write(self.getvalue()) - with suppress(AttributeError): - self.stream.flush() - super().close() - if self.closing: - self.stream.close() - -def _open(file, mode, *, peekable=False, truncatable=False): - """return a context manager with an opened file-like object""" - readonly = ('r' in mode and '+' not in mode) - if not readonly and peekable: - raise ValueError("the 'peekable' option is invalid for writable files") - if readonly and truncatable: - raise ValueError("the 'truncatable' option is invalid for read-only files") - should_close = not hasattr(file, 'read' if readonly 
else 'write') - if should_close: - file = open(file, mode) - # Wrap stream in a helper class if necessary. - if peekable and not hasattr(file, 'peek'): - # Try our best to return it as an object with a peek() method. - if hasattr(file, 'tell') and hasattr(file, 'seek'): - file = _PeekableReader(file, closing=should_close) - else: - try: - file = io.BufferedReader(file) - except Exception: - # It won't be peekable, but will fail gracefully in _identify_module(). - file = _PeekableReader(file, closing=should_close) - elif truncatable and ( - not hasattr(file, 'truncate') - or (hasattr(file, 'seekable') and not file.seekable()) - ): - file = _TruncatableWriter(file, closing=should_close) - if should_close or isinstance(file, (_PeekableReader, _TruncatableWriter)): - return file - else: - return nullcontext(file) - -def _module_map(): - """get map of imported modules""" - from collections import defaultdict - from types import SimpleNamespace - modmap = SimpleNamespace( - by_name=defaultdict(list), - by_id=defaultdict(list), - top_level={}, - ) - for modname, module in sys.modules.items(): - if modname in ('__main__', '__mp_main__') or not isinstance(module, ModuleType): - continue - if '.' not in modname: - modmap.top_level[id(module)] = modname - for objname, modobj in module.__dict__.items(): - modmap.by_name[objname].append((modobj, modname)) - modmap.by_id[id(modobj)].append((modobj, objname, modname)) - return modmap - +# Unique objects (with no duplicates) that may be imported with "import as". IMPORTED_AS_TYPES = (ModuleType, TypeType, FunctionType, MethodType, BuiltinMethodType) if 'PyCapsuleType' in _reverse_typemap: IMPORTED_AS_TYPES += (_reverse_typemap['PyCapsuleType'],) +# For unique objects of various types that have a '__module__' attribute. IMPORTED_AS_MODULES = [re.compile(x) for x in ( 'ctypes', 'typing', 'subprocess', 'threading', r'concurrent\.futures(\.\w+)?', r'multiprocessing(\.\w+)?' 
)] -def _lookup_module(modmap, name, obj, main_module): - """lookup name or id of obj if module is imported""" - for modobj, modname in modmap.by_name[name]: - if modobj is obj and sys.modules[modname] is not main_module: - return modname, name - __module__ = getattr(obj, '__module__', None) - if isinstance(obj, IMPORTED_AS_TYPES) or (__module__ is not None - and any(regex.fullmatch(__module__) for regex in IMPORTED_AS_MODULES)): - for modobj, objname, modname in modmap.by_id[id(obj)]: - if sys.modules[modname] is not main_module: - return modname, objname - return None, None - -def _stash_modules(main_module): - modmap = _module_map() - newmod = ModuleType(main_module.__name__) +BUILTIN_CONSTANTS = (None, False, True, NotImplemented) +def _stash_modules(main_module, original_main): + """pop imported variables to be saved by reference in the __dill_imported* attributes""" + modmap = _module_map(original_main) + newmod = ModuleType(main_module.__name__) + original = {} imported = [] imported_as = [] imported_top_level = [] # keep separated for backward compatibility - original = {} + for name, obj in main_module.__dict__.items(): - if obj is main_module: - original[name] = newmod # self-reference - elif obj is main_module.__dict__: - original[name] = newmod.__dict__ - # Avoid incorrectly matching a singleton value in another package (ex.: __doc__). - elif any(obj is singleton for singleton in (None, False, True)) \ - or isinstance(obj, ModuleType) and _is_builtin_module(obj): # always saved by ref + # Avoid incorrectly matching a singleton value in another package (e.g. __doc__ == None). 
+ if (any(obj is constant for constant in BUILTIN_CONSTANTS) # must compare by identity + or type(obj) is str and obj == '' # internalized, for cases like: __package__ == '' + or type(obj) is int and -128 <= obj <= 256 # possibly cached by compiler/interpreter + or isinstance(obj, ModuleType) and _is_builtin_module(obj) # always saved by ref + or obj is main_module or obj is main_module.__dict__): original[name] = obj else: - source_module, objname = _lookup_module(modmap, name, obj, main_module) + modname = getattr(obj, '__module__', None) + lookup_by_id = ( + isinstance(obj, IMPORTED_AS_TYPES) + or modname is not None + and any(regex.fullmatch(modname) for regex in IMPORTED_AS_MODULES) + ) + source_module, objname, _ = _lookup_module(modmap, name, obj, lookup_by_id) if source_module is not None: if objname == name: imported.append((source_module, name)) @@ -214,23 +108,50 @@ def _stash_modules(main_module): newmod.__dill_imported = imported newmod.__dill_imported_as = imported_as newmod.__dill_imported_top_level = imported_top_level - if getattr(newmod, '__loader__', None) is None and _is_imported_module(main_module): - # Trick _is_imported_module() to force saving as an imported module. 
- newmod.__loader__ = True # will be discarded by save_module() - return newmod + _discard_added_variables(newmod, main_module.__dict__) + + if logger.isEnabledFor(logging.INFO): + refimported = [(name, "%s.%s" % (mod, name)) for mod, name in imported] + refimported += [(name, "%s.%s" % (mod, objname)) for mod, objname, name in imported_as] + refimported += [(name, mod) for mod, name in imported_top_level] + message = "[dump_module] Variables saved by reference (refimported):\n" + logger.info(message + _format_log_dict(dict(refimported))) + logger.debug("main namespace after _stash_modules(): %s", dir(newmod)) + + return newmod, modmap else: - return main_module + return main_module, modmap def _restore_modules(unpickler, main_module): - try: - for modname, name in main_module.__dict__.pop('__dill_imported'): - main_module.__dict__[name] = unpickler.find_class(modname, name) - for modname, objname, name in main_module.__dict__.pop('__dill_imported_as'): - main_module.__dict__[name] = unpickler.find_class(modname, objname) - for modname, name in main_module.__dict__.pop('__dill_imported_top_level'): - main_module.__dict__[name] = __import__(modname) - except KeyError: - pass + for modname, name in main_module.__dict__.pop('__dill_imported', ()): + main_module.__dict__[name] = unpickler.find_class(modname, name) + for modname, objname, name in main_module.__dict__.pop('__dill_imported_as', ()): + main_module.__dict__[name] = unpickler.find_class(modname, objname) + for modname, name in main_module.__dict__.pop('__dill_imported_top_level', ()): + main_module.__dict__[name] = _import_module(modname) + +def _format_log_dict(dict): + return pprint.pformat(dict, compact=True, sort_dicts=True).replace("'", "") + +def _discard_added_variables(main, original_namespace): + # Some empty attributes like __doc__ may have been added by ModuleType(). 
+ added_names = set(main.__dict__) + added_names.discard('__name__') # required + added_names.difference_update(original_namespace) + added_names.difference_update('__dill_imported%s' % s for s in ('', '_as', '_top_level')) + for name in added_names: + delattr(main, name) + +def _fix_module_namespace(main, original_main): + # Self-references. + for name, obj in main.__dict__.items(): + if obj is original_main: + setattr(main, name, main) + elif obj is original_main.__dict__: + setattr(main, name, main.__dict__) + # Trick _is_imported_module(), forcing main to be saved as an imported module. + if getattr(main, '__loader__', None) is None and _is_imported_module(original_main): + main.__loader__ = True # will be discarded by _dill.save_module() def dump_module( filename = str(TEMPDIR/'session.pkl'), @@ -240,33 +161,38 @@ def dump_module( refonfail: Optional[bool] = None, **kwds ) -> None: - R"""Pickle the current state of :py:mod:`__main__` or another module to a file. + """Pickle the current state of :mod:`__main__` or another module to a file. - Save the contents of :py:mod:`__main__` (e.g. from an interactive + Save the contents of :mod:`__main__` (e.g. from an interactive interpreter session), an imported module, or a module-type object (e.g. - built with :py:class:`~types.ModuleType`), to a file. The pickled - module can then be restored with the function :py:func:`load_module`. + built with :class:`~types.ModuleType`), to a file. The pickled + module can then be restored with the function :func:`load_module`. Parameters: filename: a path-like object or a writable stream. module: a module object or the name of an importable module. If `None` - (the default), :py:mod:`__main__` is saved. + (the default), :mod:`__main__` is saved. refimported: if `True`, all objects identified as having been imported into the module's namespace are saved by reference. 
*Note:* this is - similar but independent from ``dill.settings[`byref`]``, as + similar but independent from ``dill.settings['byref']``, as ``refimported`` refers to virtually all imported objects, while ``byref`` only affects select objects. refonfail: if `True` (the default), objects that fail to pickle by value will try to be saved by reference. If this also fails, saving their parent objects by reference will be attempted recursively. In the worst case scenario, the module itself may be saved by reference, - with a warning. Note: this option disables framing for pickle - protocol >= 4. Turning it off may improve unpickling speed, but may - cause a module to fail pickling. - **kwds: extra keyword arguments passed to :py:class:`Pickler()`. + with a warning. *Note:* this has the side effect of disabling framing + for pickle protocol ≥ 4. Turning this option off may improve + unpickling speed, but may cause a module to fail pickling. + **kwds: extra keyword arguments passed to :class:`Pickler()`. Raises: - :py:exc:`PicklingError`: if pickling fails. + :exc:`PicklingError`: if pickling fails. + :exc:`PicklingWarning`: if the module itself ends being saved by + reference due to unpickleable objects in its namespace. + + Default values for keyword-only arguments can be set in + `dill.session.settings`. 
Examples: @@ -291,7 +217,7 @@ def dump_module( >>> foo.values = [1,2,3] >>> import math >>> foo.sin = math.sin - >>> dill.dump_module('foo_session.pkl', module=foo, refimported=True) + >>> dill.dump_module('foo_session.pkl', module=foo) - Save the state of a module with unpickleable objects: @@ -316,7 +242,23 @@ def dump_module( [0.8414709848078965, 0.9092974268256817, 0.1411200080598672] >>> os = dill.load_module('os_session.pkl') >>> print(os.altsep.join('path')) - p\a\t\h + p\\a\\t\\h + + - Use `refimported` to save imported objects by reference: + + >>> import dill + >>> from html.entities import html5 + >>> type(html5), len(html5) + (dict, 2231) + >>> import io + >>> buf = io.BytesIO() + >>> dill.dump_module(buf) # saves __main__, with html5 saved by value + >>> len(buf.getvalue()) # pickle size in bytes + 71665 + >>> buf = io.BytesIO() + >>> dill.dump_module(buf, refimported=True) # html5 saved by reference + >>> len(buf.getvalue()) + 438 *Changed in version 0.3.6:* Function ``dump_session()`` was renamed to ``dump_module()``. Parameters ``main`` and ``byref`` were renamed to @@ -324,7 +266,7 @@ def dump_module( Note: Currently, ``dill.settings['byref']`` and ``dill.settings['recurse']`` - don't apply to this function. 
+ don't apply to this function. """ for old_par, par in [('main', 'module'), ('byref', 'refimported')]: if old_par in kwds: @@ -339,10 +281,9 @@ def dump_module( from .settings import settings as dill_settings protocol = dill_settings['protocol'] - if refimported is None: - refimported = settings['refimported'] - if refonfail is None: - refonfail = settings['refonfail'] + if refimported is None: refimported = settings['refimported'] + if refonfail is None: refonfail = settings['refonfail'] + main = module if main is None: main = _main_module @@ -351,22 +292,36 @@ def dump_module( if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) original_main = main + + logger.debug("original main namespace: %s", dir(main)) if refimported: - main = _stash_modules(main) - with _open(filename, 'wb', truncatable=True) as file: + main, modmap = _stash_modules(main, original_main) + + with _open(filename, 'wb', seekable=True) as file: pickler = Pickler(file, protocol, **kwds) - if main is not original_main: - pickler._original_main = original_main pickler._main = main #FIXME: dill.settings are disabled pickler._byref = False # disable pickling by name reference pickler._recurse = False # disable pickling recursion for globals pickler._session = True # is best indicator of when pickling a session pickler._first_pass = True + if main is not original_main: + pickler._original_main = original_main + _fix_module_namespace(main, original_main) if refonfail: pickler._refonfail = True # False by default pickler._file_seek = file.seek pickler._file_truncate = file.truncate + pickler._saved_byref = [] + if refimported: + # Cache modmap for refonfail.
+ pickler._modmap = modmap + if logger.isEnabledFor(logging.TRACE): + pickler._id_to_name = {id(v): k for k, v in main.__dict__.items()} pickler.dump(main) + if refonfail and pickler._saved_byref and logger.isEnabledFor(logging.INFO): + saved_byref = {var: "%s.%s" % (mod, obj) for var, mod, obj in pickler._saved_byref} + message = "[dump_module] Variables saved by reference (refonfail):\n" + logger.info(message + _format_log_dict(saved_byref)) return # Backward compatibility. @@ -377,97 +332,64 @@ def dump_session(filename=str(TEMPDIR/'session.pkl'), main=None, byref=False, ** def _identify_module(file, main=None): """identify the name of the module stored in the given file-type object""" - import pickletools - NEUTRAL = {'PROTO', 'FRAME', 'PUT', 'BINPUT', 'MEMOIZE', 'MARK', 'STACK_GLOBAL'} - opcodes = ((opcode.name, arg) for opcode, arg, pos in pickletools.genops(file.peek(256)) - if opcode.name not in NEUTRAL) + from pickletools import genops + UNICODE = {'UNICODE', 'BINUNICODE', 'SHORT_BINUNICODE'} + found_import = False try: - opcode, arg = next(opcodes) - if (opcode, arg) == ('SHORT_BINUNICODE', 'dill._dill'): - # The file uses STACK_GLOBAL instead of GLOBAL. 
- opcode, arg = next(opcodes) - if not (opcode in ('SHORT_BINUNICODE', 'GLOBAL') and arg.split()[-1] == '_import_module'): - raise ValueError - opcode, arg = next(opcodes) - if not opcode in ('SHORT_BINUNICODE', 'BINUNICODE', 'UNICODE'): - raise ValueError - module_name = arg - if not ( - next(opcodes)[0] in ('TUPLE1', 'TUPLE') and - next(opcodes)[0] == 'REDUCE' #and - #next(opcodes)[0] in ('EMPTY_DICT', 'DICT') - ): - raise ValueError - return module_name - except StopIteration: - raise UnpicklingError("reached STOP without finding module") from None + for opcode, arg, pos in genops(file.peek(256)): + if not found_import: + if opcode.name in ('GLOBAL', 'SHORT_BINUNICODE') and \ + arg.endswith('_import_module'): + found_import = True + else: + if opcode.name in UNICODE: + return arg + else: + raise UnpicklingError("reached STOP without finding main module") except (NotImplementedError, ValueError) as error: # ValueError occours when the end of the chunk is reached (without a STOP). if isinstance(error, NotImplementedError) and main is not None: # file is not peekable, but we have main. return None - raise UnpicklingError("unable to identify module") from error - -def is_pickled_module(filename, importable: bool = True) -> bool: - """Check if a file is a pickle file readable by :py:func:`load_module`. - - Parameters: - filename: a path-like object or a readable stream. - importable: expected kind of the file's saved module. Use `True` for - importable modules (the default) or `False` for module-type objects. - - Returns: - `True` if the pickle file at ``filename`` was generated with - :py:func:`dump_module` **AND** the module whose state is saved in it is - of the kind specified by the ``importable`` argument. `False` otherwise. 
- """ - with _open(filename, 'rb', peekable=True) as file: - try: - pickle_main = _identify_module(file) - except UnpicklingError: - return False - else: - is_runtime_mod = pickle_main.startswith('__runtime__.') - return importable ^ is_runtime_mod + raise UnpicklingError("unable to identify main module") from error def load_module( filename = str(TEMPDIR/'session.pkl'), module: Optional[Union[ModuleType, str]] = None, **kwds ) -> Optional[ModuleType]: - """Update the selected module (default is :py:mod:`__main__`) with - the state saved at ``filename``. + """Update the selected module with the state saved at ``filename``. - Restore a module to the state saved with :py:func:`dump_module`. The - saved module can be :py:mod:`__main__` (e.g. an interpreter session), + Restore a module to the state saved with :func:`dump_module`. The + saved module can be :mod:`__main__` (e.g. an interpreter session), an imported module, or a module-type object (e.g. created with - :py:class:`~types.ModuleType`). + :class:`~types.ModuleType`). - When restoring the state of a non-importable module-type object, the + When restoring the state of a non-importable, module-type object, the current instance of this module may be passed as the argument ``module``. - Otherwise, a new instance is created with :py:class:`~types.ModuleType` + Otherwise, a new instance is created with :class:`~types.ModuleType` and returned. Parameters: filename: a path-like object or a readable stream. module: a module object or the name of an importable module; - the module name and kind (i.e. imported or non-imported) must + the module's name and kind (i.e. imported or non-imported) must match the name and kind of the module stored at ``filename``. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()`. + **kwds: extra keyword arguments passed to :class:`Unpickler()`. Raises: - :py:exc:`UnpicklingError`: if unpickling fails. 
- :py:exc:`ValueError`: if the argument ``module`` and module saved - at ``filename`` are incompatible. + :exc:`UnpicklingError`: if unpickling fails. + :exc:`ValueError`: if the argument ``module`` and the module + saved at ``filename`` are incompatible. Returns: - A module object, if the saved module is not :py:mod:`__main__` or + A module object, if the saved module is not :mod:`__main__` and a module instance wasn't provided with the argument ``module``. Passing an argument to ``module`` forces `dill` to verify that the module being loaded is compatible with the argument value. Additionally, if the - argument is a module (instead of a module name), it supresses the return - value. Each case and behavior is exemplified below: + argument is a module instance (instead of a module name), it suppresses the + return value. Each case and behavior is exemplified below: 1. `module`: ``None`` --- This call loads a previously saved state of the module ``math`` and returns it (the module object) at the end: @@ -580,10 +502,6 @@ def load_module( *Changed in version 0.3.6:* Function ``load_session()`` was renamed to ``load_module()``. Parameter ``main`` was renamed to ``module``. - - See also: - :py:func:`load_module_asdict` to load the contents of module saved - with :py:func:`dump_module` into a dictionary. """ if 'main' in kwds: warnings.warn( @@ -593,13 +511,10 @@ def load_module( if module is not None: raise TypeError("both 'module' and 'main' arguments were used") module = kwds.pop('main') + main = module with _open(filename, 'rb', peekable=True) as file: - #FIXME: dill.settings are disabled - unpickler = Unpickler(file, **kwds) - unpickler._session = True - - # Resolve unpickler._main + # Resolve main.
pickle_main = _identify_module(file, main) if main is None: main = pickle_main @@ -611,7 +526,6 @@ def load_module( main = _import_module(main) if not isinstance(main, ModuleType): raise TypeError("%r is not a module" % main) - unpickler._main = main # Check against the pickle's main. is_main_imported = _is_imported_module(main) @@ -622,17 +536,21 @@ def load_module( error_msg = "can't update{} module{} %r with the saved state of{} module{} %r" if main.__name__ != pickle_main: raise ValueError(error_msg.format("", "", "", "") % (main.__name__, pickle_main)) - if is_runtime_mod and is_main_imported: + elif is_runtime_mod and is_main_imported: raise ValueError( error_msg.format(" imported", "", "", "-type object") % (main.__name__, main.__name__) ) - if not is_runtime_mod and not is_main_imported: + elif not is_runtime_mod and not is_main_imported: raise ValueError( error_msg.format("", "-type object", " imported", "") % (main.__name__, main.__name__) ) + # Load the module's state. + #FIXME: dill.settings are disabled + unpickler = Unpickler(file, **kwds) + unpickler._session = True try: if not is_main_imported: # This is for find_class() to be able to locate it. @@ -658,9 +576,8 @@ def load_session(filename=str(TEMPDIR/'session.pkl'), main=None, **kwds): def load_module_asdict( filename = str(TEMPDIR/'session.pkl'), - update: bool = False, **kwds -) -> dict: +) -> Dict[str, Any]: """ Load the contents of a saved module into a dictionary. @@ -668,27 +585,22 @@ def load_module_asdict( lambda filename: vars(dill.load_module(filename)).copy() - however, does not alter the original module. Also, the path of - the loaded module is stored in the ``__session__`` attribute. + however, it does not alter the original module. Also, the path of + the loaded file is stored with the key ``'__session__'``. 
Parameters: filename: a path-like object or a readable stream - update: if `True`, initialize the dictionary with the current state - of the module prior to loading the state stored at filename. - **kwds: extra keyword arguments passed to :py:class:`Unpickler()` + **kwds: extra keyword arguments passed to :class:`Unpickler()` Raises: - :py:exc:`UnpicklingError`: if unpickling fails + :exc:`UnpicklingError`: if unpickling fails Returns: A copy of the restored module's dictionary. Note: - If ``update`` is True, the corresponding module may first be imported - into the current namespace before the saved state is loaded from - filename to the dictionary. Note that any module that is imported into - the current namespace as a side-effect of using ``update`` will not be - modified by loading the saved module in filename to a dictionary. + Even if not changed, the module referred to in the file is always loaded + before its saved state is restored from `filename` to the dictionary. Example: >>> import dill @@ -708,37 +620,52 @@ def load_module_asdict( False >>> main['anum'] == anum # changed after the session was saved False - >>> new_var in main # would be True if the option 'update' was set - False + >>> new_var in main # it was initialized with the current state of __main__ + True """ if 'module' in kwds: raise TypeError("'module' is an invalid keyword argument for load_module_asdict()") + with _open(filename, 'rb', peekable=True) as file: - main_name = _identify_module(file) - original_main = sys.modules.get(main_name) - main = ModuleType(main_name) - if update: - if original_main is None: - original_main = _import_module(main_name) - main.__dict__.update(original_main.__dict__) - else: - main.__builtins__ = __builtin__ + main_qualname = _identify_module(file) + main = _import_module(main_qualname) + main_copy = ModuleType(main_qualname) + main_copy.__dict__.clear() + main_copy.__dict__.update(main.__dict__) + + parent_name, _, main_name = main_qualname.rpartition('.') +
if parent_name: + parent = sys.modules[parent_name] try: - sys.modules[main_name] = main + sys.modules[main_qualname] = main_copy + if parent_name and getattr(parent, main_name, None) is main: + setattr(parent, main_name, main_copy) load_module(file, **kwds) finally: - if original_main is None: - del sys.modules[main_name] - else: - sys.modules[main_name] = original_main - main.__session__ = str(filename) - return main.__dict__ + sys.modules[main_qualname] = main + if parent_name and getattr(parent, main_name, None) is main_copy: + setattr(parent, main_name, main) + + if isinstance(getattr(filename, 'name', None), str): + main_copy.__session__ = filename.name + else: + main_copy.__session__ = str(filename) + return main_copy.__dict__ + + +## Session settings ## + +settings = { + 'refimported': False, + 'refonfail': True, +} + +## Variables set in this module to avoid circular import problems ## # Internal exports for backward compatibility with dill v0.3.5.1 -# Can't be placed in dill._dill because of circular import problems. 
for name in ( - '_lookup_module', '_module_map', '_restore_modules', '_stash_modules', + '_restore_modules', '_stash_modules', 'dump_session', 'load_session' # backward compatibility functions ): setattr(_dill, name, globals()[name]) diff --git a/dill/settings.py b/dill/settings.py index 140bfb5d..b105d2e8 100644 --- a/dill/settings.py +++ b/dill/settings.py @@ -9,8 +9,6 @@ global settings for Pickler """ -__all__ = ['settings'] - from pickle import DEFAULT_PROTOCOL settings = { @@ -24,3 +22,4 @@ } del DEFAULT_PROTOCOL + diff --git a/dill/tests/test_logger.py b/dill/tests/test_logging.py similarity index 97% rename from dill/tests/test_logger.py rename to dill/tests/test_logging.py index b4e4881a..ed33e6c4 100644 --- a/dill/tests/test_logger.py +++ b/dill/tests/test_logging.py @@ -11,7 +11,7 @@ import dill from dill import detect -from dill.logger import stderr_handler, adapter as logger +from dill.logging import stderr_handler, adapter as logger try: from StringIO import StringIO diff --git a/dill/tests/test_session.py b/dill/tests/test_session.py index 6a6ce22e..52e9cdd0 100644 --- a/dill/tests/test_session.py +++ b/dill/tests/test_session.py @@ -14,6 +14,7 @@ from types import ModuleType import dill +from dill import _dill session_file = os.path.join(os.path.dirname(__file__), 'session-refimported-%s.pkl') @@ -21,7 +22,7 @@ # Child process # ################### -def _error_line(error, obj, refimported): +def _error_line(obj, refimported): import traceback line = traceback.format_exc().splitlines()[-2].replace('[obj]', '['+repr(obj)+']') return "while testing (with refimported=%s): %s" % (refimported, line.lstrip()) @@ -53,7 +54,7 @@ def test_modules(refimported): assert __main__.complex_log is cmath.log except AssertionError as error: - error.args = (_error_line(error, obj, refimported),) + error.args = (_error_line(obj, refimported),) raise test_modules(refimported) @@ -92,6 +93,7 @@ def weekdays(self): return [day_name[i] for i in self.iterweekdays()] cal = 
CalendarSubclass() selfref = __main__ +self_dict = __main__.__dict__ # Setup global namespace for session saving tests. class TestNamespace: @@ -121,7 +123,7 @@ def _clean_up_cache(module): def _test_objects(main, globals_copy, refimported): try: main_dict = __main__.__dict__ - global Person, person, Calendar, CalendarSubclass, cal, selfref + global Person, person, Calendar, CalendarSubclass, cal, selfref, self_dict for obj in ('json', 'url', 'local_mod', 'sax', 'dom'): assert globals()[obj].__name__ == globals_copy[obj].__name__ @@ -142,9 +144,10 @@ def _test_objects(main, globals_copy, refimported): assert cal.weekdays() == globals_copy['cal'].weekdays() assert selfref is __main__ + assert self_dict is __main__.__dict__ except AssertionError as error: - error.args = (_error_line(error, obj, refimported),) + error.args = (_error_line(obj, refimported),) raise def test_session_main(refimported): @@ -193,12 +196,12 @@ def test_session_other(): assert module.selfref is module def test_runtime_module(): - modname = '__runtime__' - runtime = ModuleType(modname) - runtime.x = 42 + modname = 'runtime' + runtime_mod = ModuleType(modname) + runtime_mod.x = 42 - mod = dill.session._stash_modules(runtime) - if mod is not runtime: + mod, _ = dill.session._stash_modules(runtime_mod, runtime_mod) + if mod is not runtime_mod: print("There are objects to save by referenece that shouldn't be:", mod.__dill_imported, mod.__dill_imported_as, mod.__dill_imported_top_level, file=sys.stderr) @@ -207,46 +210,23 @@ def test_runtime_module(): # without imported objects in the namespace. It's a contrived example because # even dill can't be in it. This should work after fixing #462. session_buffer = BytesIO() - dill.dump_module(session_buffer, module=runtime, refimported=True) + dill.dump_module(session_buffer, module=runtime_mod, refimported=True) session_dump = session_buffer.getvalue() # Pass a new runtime created module with the same name. 
- runtime = ModuleType(modname) # empty - return_val = dill.load_module(BytesIO(session_dump), module=runtime) + runtime_mod = ModuleType(modname) # empty + return_val = dill.load_module(BytesIO(session_dump), module=runtime_mod) assert return_val is None - assert runtime.__name__ == modname - assert runtime.x == 42 - assert runtime not in sys.modules.values() + assert runtime_mod.__name__ == modname + assert runtime_mod.x == 42 + assert runtime_mod not in sys.modules.values() # Pass nothing as main. load_module() must create it. session_buffer.seek(0) - runtime = dill.load_module(BytesIO(session_dump)) - assert runtime.__name__ == modname - assert runtime.x == 42 - assert runtime not in sys.modules.values() - -def test_refimported_imported_as(): - import collections - import concurrent.futures - import types - import typing - mod = sys.modules['__test__'] = ModuleType('__test__') - dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) - mod.Dict = collections.UserDict # select by type - mod.AsyncCM = typing.AsyncContextManager # select by __module__ - mod.thread_exec = dill.executor # select by __module__ with regex - - session_buffer = BytesIO() - dill.dump_module(session_buffer, mod, refimported=True) - session_buffer.seek(0) - mod = dill.load(session_buffer) - del sys.modules['__test__'] - - assert set(mod.__dill_imported_as) == { - ('collections', 'UserDict', 'Dict'), - ('typing', 'AsyncContextManager', 'AsyncCM'), - ('dill', 'executor', 'thread_exec'), - } + runtime_mod = dill.load_module(BytesIO(session_dump)) + assert runtime_mod.__name__ == modname + assert runtime_mod.x == 42 + assert runtime_mod not in sys.modules.values() def test_load_module_asdict(): with TestNamespace(): @@ -268,13 +248,155 @@ def test_load_module_asdict(): assert main_vars['names'] == names assert main_vars['names'] is not names assert main_vars['x'] != x - assert 'y' not in main_vars + assert 'y' in main_vars assert 'empty' in main_vars + # Test a submodule. 
+ import html + from html import entities + entitydefs = entities.entitydefs + + session_buffer = BytesIO() + dill.dump_module(session_buffer, entities) + session_buffer.seek(0) + entities_vars = dill.load_module_asdict(session_buffer) + + assert entities is html.entities # restored + assert entities is sys.modules['html.entities'] # restored + assert entitydefs is entities.entitydefs # unchanged + assert entitydefs is not entities_vars['entitydefs'] # saved by value + assert entitydefs == entities_vars['entitydefs'] + +def test_lookup_module(): + assert not _dill._is_builtin_module(local_mod) and local_mod.__package__ == '' + + def lookup(mod, name, obj, lookup_by_name=True): + from dill._dill import _lookup_module, _module_map + return _lookup_module(_module_map(mod), name, obj, lookup_by_name) + + name = '__unpickleable' + obj = object() + setattr(dill, name, obj) + assert lookup(dill, name, obj) == (None, None, None) + + # 4th level: non-installed module + setattr(local_mod, name, obj) + sys.modules[local_mod.__name__] = sys.modules.pop(local_mod.__name__) # put at the end + assert lookup(dill, name, obj) == (local_mod.__name__, name, False) # not installed + try: + import pox + # 3rd level: installed third-party module + setattr(pox, name, obj) + sys.modules['pox'] = sys.modules.pop('pox') + assert lookup(dill, name, obj) == ('pox', name, True) + except ModuleNotFoundError: + pass + # 2nd level: module of same package + setattr(dill.session, name, obj) + sys.modules['dill.session'] = sys.modules.pop('dill.session') + assert lookup(dill, name, obj) == ('dill.session', name, True) + # 1st level: stdlib module + setattr(os, name, obj) + sys.modules['os'] = sys.modules.pop('os') + assert lookup(dill, name, obj) == ('os', name, True) + + # Lookup by id. 
+ name2 = name + '2' + setattr(dill, name2, obj) + assert lookup(dill, name2, obj) == ('os', name, True) + assert lookup(dill, name2, obj, lookup_by_name=False) == (None, None, None) + setattr(local_mod, name2, obj) + assert lookup(dill, name2, obj) == (local_mod.__name__, name2, False) + +def test_refimported(): + import collections + import concurrent.futures + import types + import typing + + mod = sys.modules['__test__'] = ModuleType('__test__') + mod.builtin_module_names = sys.builtin_module_names + dill.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1) + mod.Dict = collections.UserDict # select by type + mod.AsyncCM = typing.AsyncContextManager # select by __module__ + mod.thread_exec = dill.executor # select by __module__ with regex + mod.local_mod = local_mod + + session_buffer = BytesIO() + dill.dump_module(session_buffer, mod, refimported=True) + session_buffer.seek(0) + mod = dill.load(session_buffer) + + assert mod.__dill_imported == [('sys', 'builtin_module_names')] + assert set(mod.__dill_imported_as) == { + ('collections', 'UserDict', 'Dict'), + ('typing', 'AsyncContextManager', 'AsyncCM'), + ('dill', 'executor', 'thread_exec'), + } + assert mod.__dill_imported_top_level == [(local_mod.__name__, 'local_mod')] + + session_buffer.seek(0) + dill.load_module(session_buffer, mod) + del sys.modules['__test__'] + assert mod.builtin_module_names is sys.builtin_module_names + assert mod.Dict is collections.UserDict + assert mod.AsyncCM is typing.AsyncContextManager + assert mod.thread_exec is dill.executor + assert mod.local_mod is local_mod + +def test_unpickleable_var(): + global local_mod + import keyword as builtin_mod + from dill._dill import _global_string + refonfail_default = dill.session.settings['refonfail'] + dill.session.settings['refonfail'] = True + name = '__unpickleable' + obj = memoryview(b'') + assert _dill._is_builtin_module(builtin_mod) + assert not _dill._is_builtin_module(local_mod) + # assert not dill.pickles(obj) + try: + 
dill.dumps(obj) + except _dill.UNPICKLEABLE_ERRORS: + pass + else: + raise Exception("test object should be unpickleable") + + def dump_with_ref(mod, other_mod): + setattr(other_mod, name, obj) + buf = BytesIO() + dill.dump_module(buf, mod) + return buf.getvalue() + + # "user" modules + _local_mod = local_mod + del local_mod # remove from __main__'s namespace + try: + dump_with_ref(__main__, __main__) + except dill.PicklingError: + pass # success + else: + raise Exception("saving with a reference to the module itself should fail for '__main__'") + assert _global_string(_local_mod.__name__, name) in dump_with_ref(__main__, _local_mod) + assert _global_string('os', name) in dump_with_ref(__main__, os) + local_mod = _local_mod + del _local_mod, __main__.__unpickleable, local_mod.__unpickleable, os.__unpickleable + + # "builtin" or "installed" modules + assert _global_string(builtin_mod.__name__, name) in dump_with_ref(builtin_mod, builtin_mod) + assert _global_string(builtin_mod.__name__, name) in dump_with_ref(builtin_mod, local_mod) + assert _global_string('os', name) in dump_with_ref(builtin_mod, os) + del builtin_mod.__unpickleable, local_mod.__unpickleable, os.__unpickleable + + dill.session.settings['refonfail'] = refonfail_default + if __name__ == '__main__': - test_session_main(refimported=False) - test_session_main(refimported=True) + if os.getenv('COVERAGE') != 'true': + test_session_main(refimported=False) + test_session_main(refimported=True) test_session_other() test_runtime_module() - test_refimported_imported_as() test_load_module_asdict() + test_lookup_module() + test_refimported() + test_unpickleable_var() diff --git a/dill/tests/test_stdlib_modules.py b/dill/tests/test_stdlib_modules.py new file mode 100644 index 00000000..15cb0767 --- /dev/null +++ b/dill/tests/test_stdlib_modules.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. 
+# License: 3-clause BSD. The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +import io +import itertools +import logging +import multiprocessing +import os +import sys +import warnings + +import dill + +if not dill._dill.OLD310: + STDLIB_MODULES = list(sys.stdlib_module_names) + STDLIB_MODULES += [ + # From https://docs.python.org/3.11/library/ + 'collections.abc', 'concurrent.futures', 'curses.ascii', 'curses.panel', 'curses.textpad', + 'html.entities', 'html.parser', 'http.client', 'http.cookiejar', 'http.cookies', 'http.server', + 'importlib.metadata', 'importlib.resources', 'importlib.resources.abc', 'logging.config', + 'logging.handlers', 'multiprocessing.shared_memory', 'os.path', 'test.support', + 'test.support.bytecode_helper', 'test.support.import_helper', 'test.support.os_helper', + 'test.support.script_helper', 'test.support.socket_helper', 'test.support.threading_helper', + 'test.support.warnings_helper', 'tkinter.colorchooser', 'tkinter.dnd', 'tkinter.font', + 'tkinter.messagebox', 'tkinter.scrolledtext', 'tkinter.tix', 'tkinter.ttk', 'unittest.mock', + 'urllib.error', 'urllib.parse', 'urllib.request', 'urllib.response', 'urllib.robotparser', + 'xml.dom', 'xml.dom.minidom', 'xml.dom.pulldom', 'xml.etree.ElementTree', 'xml.parsers.expat', + 'xml.sax', 'xml.sax.handler', 'xml.sax.saxutils', 'xml.sax.xmlreader', 'xmlrpc.client', + 'xmlrpc.server', + ] + STDLIB_MODULES.sort() +else: + STDLIB_MODULES = [ + # From https://docs.python.org/3.9/library/ + '__future__', '_thread', 'abc', 'aifc', 'argparse', 'array', 'ast', 'asynchat', 'asyncio', + 'asyncore', 'atexit', 'audioop', 'base64', 'bdb', 'binascii', 'binhex', 'bisect', 'builtins', + 'bz2', 'calendar', 'cgi', 'cgitb', 'chunk', 'cmath', 'cmd', 'code', 'codecs', 'codeop', + 'collections', 'collections.abc', 'colorsys', 'compileall', 'concurrent', 'concurrent.futures', + 'configparser', 'contextlib', 'contextvars', 'copy', 'copyreg', 'crypt', 'csv', 
'ctypes', + 'curses', 'curses.ascii', 'curses.panel', 'curses.textpad', 'dataclasses', 'datetime', 'dbm', + 'decimal', 'difflib', 'dis', 'distutils', 'doctest', 'email', 'ensurepip', 'enum', 'errno', + 'faulthandler', 'fcntl', 'filecmp', 'fileinput', 'fnmatch', 'formatter', 'fractions', 'ftplib', + 'functools', 'gc', 'getopt', 'getpass', 'gettext', 'glob', 'graphlib', 'grp', 'gzip', 'hashlib', + 'heapq', 'hmac', 'html', 'html.entities', 'html.parser', 'http', 'http.client', + 'http.cookiejar', 'http.cookies', 'http.server', 'imaplib', 'imghdr', 'imp', 'importlib', + 'importlib.metadata', 'inspect', 'io', 'ipaddress', 'itertools', 'json', 'keyword', 'linecache', + 'locale', 'logging', 'logging.config', 'logging.handlers', 'lzma', 'mailbox', 'mailcap', + 'marshal', 'math', 'mimetypes', 'mmap', 'modulefinder', 'msilib', 'msvcrt', 'multiprocessing', + 'multiprocessing.shared_memory', 'netrc', 'nis', 'nntplib', 'numbers', 'operator', 'optparse', + 'os', 'os.path', 'ossaudiodev', 'parser', 'pathlib', 'pdb', 'pickle', 'pickletools', 'pipes', + 'pkgutil', 'platform', 'plistlib', 'poplib', 'posix', 'pprint', 'pty', 'pwd', 'py_compile', + 'pyclbr', 'pydoc', 'queue', 'quopri', 'random', 're', 'readline', 'reprlib', 'resource', + 'rlcompleter', 'runpy', 'sched', 'secrets', 'select', 'selectors', 'shelve', 'shlex', 'shutil', + 'signal', 'site', 'site', 'smtpd', 'smtplib', 'sndhdr', 'socket', 'socketserver', 'spwd', + 'sqlite3', 'ssl', 'stat', 'statistics', 'string', 'stringprep', 'struct', 'subprocess', 'sunau', + 'symbol', 'symtable', 'sys', 'sysconfig', 'syslog', 'tabnanny', 'tarfile', 'telnetlib', + 'tempfile', 'termios', 'test', 'test.support', 'test.support.bytecode_helper', + 'test.support.script_helper', 'test.support.socket_helper', 'textwrap', 'threading', 'time', + 'timeit', 'tkinter', 'tkinter.colorchooser', 'tkinter.dnd', 'tkinter.font', + 'tkinter.messagebox', 'tkinter.scrolledtext', 'tkinter.tix', 'tkinter.ttk', 'token', 'tokenize', + 'trace', 'traceback', 
'tracemalloc', 'tty', 'turtle', 'types', 'typing', 'unicodedata', + 'unittest', 'unittest.mock', 'urllib', 'urllib.error', 'urllib.parse', 'urllib.request', + 'urllib.response', 'urllib.robotparser', 'uu', 'uuid', 'venv', 'warnings', 'wave', 'weakref', + 'webbrowser', 'winreg', 'winsound', 'wsgiref', 'xdrlib', 'xml.dom', 'xml.dom.minidom', + 'xml.dom.pulldom', 'xml.etree.ElementTree', 'xml.parsers.expat', 'xml.sax', 'xml.sax.handler', + 'xml.sax.saxutils', 'xml.sax.xmlreader', 'xmlrpc', 'xmlrpc.client', 'xmlrpc.server', 'zipapp', + 'zipfile', 'zipimport', 'zlib', 'zoneinfo', +] + +def _dump_load_module(module_name, refonfail): + try: + __import__(module_name) + except ImportError: + return None, None + success_load = None + buf = io.BytesIO() + try: + dill.dump_module(buf, module_name, refonfail=refonfail) + except Exception: + print("F", end="") + success_dump = False + return success_dump, success_load + print(":", end="") + success_dump = True + buf.seek(0) + try: + module = dill.load_module(buf) + except Exception: + success_load = False + return success_dump, success_load + success_load = True + return success_dump, success_load + +def test_stdlib_modules(): + modules = [x for x in STDLIB_MODULES if + not x.startswith('_') + and not x.startswith('test') + and x not in ('antigravity', 'this')] + + + print("\nTesting pickling and unpickling of Standard Library modules...") + message = "Success rate (%s_module, refonfail=%s): %.1f%% [%d/%d]" + with multiprocessing.Pool(maxtasksperchild=1) as pool: + for refonfail in (False, True): + args = zip(modules, itertools.repeat(refonfail)) + result = pool.starmap(_dump_load_module, args, chunksize=1) + dump_successes = sum(dumped for dumped, loaded in result if dumped is not None) + load_successes = sum(loaded for dumped, loaded in result if loaded is not None) + dump_failures = sum(not dumped for dumped, loaded in result if dumped is not None) + load_failures = sum(not loaded for dumped, loaded in result if loaded is not 
None) + dump_total = dump_successes + dump_failures + load_total = load_successes + load_failures + dump_percent = 100 * dump_successes / dump_total + load_percent = 100 * load_successes / load_total + if logging.getLogger().isEnabledFor(logging.INFO): print() + logging.info(message, "dump", refonfail, dump_percent, dump_successes, dump_total) + logging.info(message, "load", refonfail, load_percent, load_successes, load_total) + if refonfail: + failed_dump = [mod for mod, (dumped, _) in zip(modules, result) if dumped is False] + failed_load = [mod for mod, (_, loaded) in zip(modules, result) if loaded is False] + if failed_dump: + logging.info("dump_module() FAILURES: %s", str(failed_dump).replace("'", "")[1:-1]) + if failed_load: + logging.info("load_module() FAILURES: %s", str(failed_load).replace("'", "")[1:-1]) + assert dump_percent > 99 + assert load_percent > 85 #FIXME: many important modules fail to unpickle + print() + +if __name__ == '__main__': + logging.basicConfig(level=os.environ.get('PYTHONLOGLEVEL', 'WARNING')) + warnings.simplefilter('ignore') + test_stdlib_modules() diff --git a/dill/tests/test_utils.py b/dill/tests/test_utils.py new file mode 100644 index 00000000..8da0ac99 --- /dev/null +++ b/dill/tests/test_utils.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# Author: Leonardo Gama (@leogama) +# Copyright (c) 2022 The Uncertainty Quantification Foundation. +# License: 3-clause BSD. 
The full license text is available at: +# - https://github.com/uqfoundation/dill/blob/master/LICENSE + +"""test general utilities in _utils.py""" + +import io +import os +import sys + +from dill import _utils + +def test_format_bytes(): + formatb = _utils._format_bytes_size + assert formatb(1000) == (1000, 'B') + assert formatb(1024) == (1, 'KiB') + assert formatb(1024 + 511) == (1, 'KiB') + assert formatb(1024 + 512) == (2, 'KiB') + assert formatb(10**9) == (954, 'MiB') + +def test_open(): + file_unpeekable = open(__file__, 'rb', buffering=0) + assert not hasattr(file_unpeekable, 'peek') + + content = file_unpeekable.read() + peeked_chars = content[:10] + first_line = content[:100].partition(b'\n')[0] + b'\n' + file_unpeekable.seek(0) + + # Test _PeekableReader for seekable stream + with _utils._open(file_unpeekable, 'r', peekable=True) as file: + assert isinstance(file, _utils._PeekableReader) + assert file.peek(10)[:10] == peeked_chars + assert file.readline() == first_line + assert not file_unpeekable.closed + file_unpeekable.close() + + _pipe_r, _pipe_w = os.pipe() + pipe_r = io.FileIO(_pipe_r, closefd=False) + pipe_w = io.FileIO(_pipe_w, mode='w') + assert not hasattr(pipe_r, 'peek') + assert not pipe_r.seekable() + assert not pipe_w.seekable() + + # Test io.BufferedReader for unseekable stream + with _utils._open(pipe_r, 'r', peekable=True) as file: + assert isinstance(file, io.BufferedReader) + pipe_w.write(content[:100]) + assert file.peek(10)[:10] == peeked_chars + assert file.readline() == first_line + assert not pipe_r.closed + + # Test _SeekableWriter for unseekable stream + with _utils._open(pipe_w, 'w', seekable=True) as file: + # pipe_r is closed here for some reason... 
+ assert isinstance(file, _utils._SeekableWriter) + file.write(content) + file.flush() + file.seek(0) + file.truncate() + file.write(b'a line of text\n') + assert not pipe_w.closed + pipe_r = io.FileIO(_pipe_r) + assert pipe_r.readline() == b'a line of text\n' + pipe_r.close() + pipe_w.close() + +if __name__ == '__main__': + test_format_bytes() + test_open() diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c6fdfe..ead9ed06 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -66,14 +66,6 @@ # extension config github_project_url = "https://github.com/uqfoundation/dill" autoclass_content = 'both' -autodoc_default_options = { - 'members': True, - 'undoc-members': True, - 'private-members': True, - 'special-members': True, - 'show-inheritance': True, - 'exclude-members': '__dict__, __module__, __slots__', -} autodoc_typehints = 'description' napoleon_include_init_with_doc = True napoleon_include_private_with_doc = False diff --git a/docs/source/dill.rst b/docs/source/dill.rst index e18607db..e1ca2344 100644 --- a/docs/source/dill.rst +++ b/docs/source/dill.rst @@ -5,52 +5,107 @@ dill module ----------- .. automodule:: dill._dill -.. :exclude-members: + + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: detect module ------------- .. automodule:: dill.detect -.. :exclude-members: +ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children - -logger module -------------- - -.. automodule:: dill.logger + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: ismethod, isfunction, istraceback, isframe, iscode, parent, reference, at, parents, children + +logging module +-------------- + +.. 
automodule:: dill.logging + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: :exclude-members: +trace objtypes module --------------- .. automodule:: dill.objtypes -.. :exclude-members: + + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: pointers module --------------- .. automodule:: dill.pointers -.. :exclude-members: + + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: session module ---------------- +-------------- .. automodule:: dill.session - :exclude-members: +dump_session, load_session + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: + :exclude-members: dump_session, load_session settings module --------------- .. automodule:: dill.settings -.. :exclude-members: + + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: source module ------------- .. automodule:: dill.source -.. :exclude-members: + + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: temp module ----------- .. automodule:: dill.temp -.. :exclude-members: + + :members: + :undoc-members: + :private-members: + :special-members: + :show-inheritance: + :imported-members: +.. :exclude-members: +