From 225c34b3c37d8f608308b449bc69616565e0da4f Mon Sep 17 00:00:00 2001 From: srikris Date: Thu, 18 Feb 2016 02:05:37 -0800 Subject: [PATCH 1/5] Make GLPickle more extensible. Summary of changes ------------------ - Made GLPickle more extensible - Used the extensibility mechanism to implement things for Model, SGraph, SFrame, and SArray. - Added some additional tests for the same. Details ------- Previously, GLPickle could not be extended without making changes directly to it. Now, we added a mechanism to extend it by implementing 2 simple methods: 1. __gl_pickle_save__: Save your object to a filename (not handle) given to you. 2. __gl_pickle_load__: Load your object from a filename (not handle) given to you. As an example, here is a simple class that was extended to be pickled via GLPickle. class SampleClass(object): def __init__(self, member): self.member = member def __gl_pickle_save__(self, filename): with open(filename, 'w') as f: f.write(self.member) @staticmethod def __gl_pickle_load__(filename): with open(filename, 'r') as f: member = f.read().split() return SampleClass(member) --- oss_src/unity/python/sframe/__init__.py | 2 + oss_src/unity/python/sframe/_gl_pickle.py | 239 ++++++++++-------- .../python/sframe/data_structures/sarray.py | 7 + .../python/sframe/data_structures/sframe.py | 9 +- .../python/sframe/data_structures/sgraph.py | 27 +- .../python/sframe/test/test_gl_pickler.py | 63 ++++- .../unity/python/sframe/toolkits/_model.py | 7 + 7 files changed, 230 insertions(+), 124 deletions(-) diff --git a/oss_src/unity/python/sframe/__init__.py b/oss_src/unity/python/sframe/__init__.py index e3e2643b..1f5d2be7 100644 --- a/oss_src/unity/python/sframe/__init__.py +++ b/oss_src/unity/python/sframe/__init__.py @@ -62,6 +62,8 @@ from .version_info import version from .version_info import __VERSION__ +from _gl_pickle import GLPickler +from _gl_pickle import GLUnpickler class DeprecationHelper(object): def __init__(self, new_target): diff --git 
a/oss_src/unity/python/sframe/_gl_pickle.py b/oss_src/unity/python/sframe/_gl_pickle.py index ef8787a3..f3f74657 100644 --- a/oss_src/unity/python/sframe/_gl_pickle.py +++ b/oss_src/unity/python/sframe/_gl_pickle.py @@ -29,86 +29,44 @@ def _get_temp_filename(): def _get_tmp_file_location(): return _util._make_temp_directory(prefix='gl_pickle_') -def _is_not_pickle_safe_gl_model_class(obj_class): +def _is_gl_pickle_extensible(obj): """ - Check if a GraphLab create model is pickle safe. - - The function does it by checking that _CustomModel is the base class. - - Parameters - ---------- - obj_class : Class to be checked. - - Returns - ---------- - True if the GLC class is a model and is pickle safe. - - """ - if issubclass(obj_class, _toolkits._model.CustomModel): - return not obj_class._is_gl_pickle_safe() - return False - -def _is_not_pickle_safe_gl_class(obj_class): - """ - Check if class is a GraphLab create model. - - The function does it by checking the method resolution order (MRO) of the - class and verifies that _Model is the base class. + Check if an object has an external serialization prototol. We do so by + checking if the object has the methods __gl_pickle_load__ and + __gl_pickle_save__. Parameters ---------- - obj_class : Class to be checked. + obj: An object Returns ---------- - True if the class is a GLC Model. + True (if usable by gl_pickle) """ - gl_ds = [_SFrame, _SArray, _SGraph] - - # Object is GLC-DS or GLC-Model - return (obj_class in gl_ds) or _is_not_pickle_safe_gl_model_class(obj_class) - -def _get_gl_class_type(obj_class): - """ - Internal util to get the type of the GLC class. The pickle file stores - this name so that it knows how to construct the object on unpickling. - - Parameters - ---------- - obj_class : Class which has to be categoriized. - - Returns - ---------- - A class type for the pickle file to save. 
- - """ - - if obj_class == _SFrame: - return "SFrame" - elif obj_class == _SGraph: - return "SGraph" - elif obj_class == _SArray: - return "SArray" - elif _is_not_pickle_safe_gl_model_class(obj_class): - return "Model" + obj_class = None if not hasattr(obj, '__class__') else obj.__class__ + if obj_class is None: + return False else: - return None + return hasattr(obj_class, '__gl_pickle_load__') and \ + hasattr(obj_class, '__gl_pickle_save__') def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path): """ - Internal util to get a GLC object from a persistent ID in the pickle file. + (GLPickle Version 1.0). + + Get an object from a persistent ID. Parameters ---------- - type_tag : The name of the glc class as saved in the GLC pickler. + type_tag : The name of the class as saved in the GLPickler. - gl_archive_abs_path: An absolute path to the GLC archive where the - object was saved. + gl_archive_abs_path: An absolute path to the archive where the + object was saved. Returns ---------- - The GLC object. + object: The deserialized object. """ if type_tag == "SFrame": @@ -121,8 +79,10 @@ def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path): from . import load_model as _load_model obj = _load_model(gl_archive_abs_path) else: - raise _pickle.UnpicklingError("GraphLab pickling Error: Unspported object." - " Only SFrames, SGraphs, SArrays, and Models are supported.") + raise _pickle.UnpicklingError("Pickling Error: Unspported object." + " Implement the methods __gl_pickle_load__ and" + " __gl_pickle_save__ to use GLPickle. 
See the docstrings" + " for examples.") return obj class GLPickler(_cloudpickle.CloudPickler): @@ -131,15 +91,11 @@ def _to_abs_path_set(self, l): return set([_os.path.abspath(x) for x in l]) """ - - # GLC pickle works with: + # GLPickle works with: # # (1) Regular python objects - # (2) SArray - # (3) SFrame - # (4) SGraph - # (5) Models - # (6) Any combination of (1) - (5) + # (2) Any object with __gl_pickle_save__ and __gl_pickle_load__ + # (3) Any combination of (1) - (2) Examples -------- @@ -155,7 +111,7 @@ def _to_abs_path_set(self, l): 'bar': gl.SArray([1,2,3]), 'foo-bar': ['foo-and-bar', gl.SFrame()]} - # Setup the GLC pickler + # Setup the GLPickler pickler = gl_pickle.GLPickler(filename = 'foo-bar') pickler.dump(obj) @@ -171,9 +127,9 @@ def _to_abs_path_set(self, l): unpickler.close() print obj - The GLC pickler needs a temporary working directory to manage GLC objects. - This temporary working path must be a local path to the file system. It - can also be a relative path in the FS. + The GLPickler needs a temporary working directory to manage GLC objects. + This temporary working path must be a local path to the file system. It can + also be a relative path in the filesystem. .. sourcecode:: python @@ -237,7 +193,9 @@ def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): # Directory: # ---------- # Version 1: GLC 1.4: 1 + # Version 2: SFrame 1.8.2+ (new gl_pickle extensibility mechanism) + VERSION = "2.0" self.archive_filename = None self.gl_temp_storage_path = _get_tmp_file_location() self.gl_object_memo = set() @@ -289,7 +247,7 @@ def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): # Write the version number. with open(_os.path.join(self.gl_temp_storage_path, 'version'), 'w') as f: - f.write("1.0") + f.write(VERSION) def _set_hdfs_exec_dir(self, exec_dir): self.hdfs_exec_dir= exec_dir @@ -308,56 +266,96 @@ def persistent_id(self, obj): obj: Name of the object whose persistant ID is extracted. 
Returns - -------- + ------- None if the object is not a GLC object. (ClassName, relative path) if the object is a GLC object. - Notes - ----- + Examples + -------- + For the benefit of object persistence, the pickle module supports the + notion of a reference to an object outside the pickled data stream. To + pickle objects that have an external persistent id, the pickler must + have a custom persistent_id() method that takes an object as an + argument and returns either None or the persistent id for that object. + + For extended objects, the persistent_id is merely a relative file path + (within the ZIP archive) to the archive where the object is saved. For + example: - Borrowed from pickle docs (https://docs.python.org/2/library/_pickle.html) + (load_sframe, 'sframe-save-path') + (load_sgraph, 'sgraph-save-path') + (load_model, 'model-save-path') - For the benefit of object persistence, the pickle module supports the - notion of a reference to an object outside the pickled data stream. + To extend your object to work with gl_pickle you need to implement two + simple functions __gl_pickle_load__ and __gl_pickle_save__. + (1) __gl_pickle_save__: A member method to save your object to a + filepath (not file handle) given. + (2) __gl_pickle_load__: A static method that lets you load your object + from a filepath (not file handle). - To pickle objects that have an external persistent id, the pickler must - have a custom persistent_id() method that takes an object as an argument and - returns either None or the persistent id for that object. + A simple example is provided below: - For GLC objects, the persistent_id is merely a relative file path (within - the ZIP archive) to the GLC archive where the GLC object is saved. For - example: + .. 
sourcecode:: python - (SFrame, 'sframe-save-path') - (SGraph, 'sgraph-save-path') - (Model, 'model-save-path') + class SampleClass(object): + def __init__(self, member): + self.member = member - """ + def __gl_pickle_save__(self, filename): + with open(filename, 'w') as f: + f.write(self.member) - # Get the class of the object (if it can be done) - obj_class = None if not hasattr(obj, '__class__') else obj.__class__ - if obj_class is None: - return None + @staticmethod + def __gl_pickle_load__(filename): + with open(filename, 'r') as f: + member = f.read().split() + return SampleClass(member) + + WARNING: Version 1.0 and before of GLPickle only supported the + following extended objects. - # If the object is a GLC class. - if _is_not_pickle_safe_gl_class(obj_class): + - SFrame + - SGraph + - Model + + For these objects, the persistent_id was also a relative file path + (within the ZIP archive) to the archive where the object is saved. For + example: + + ("SFrame", 'sframe-save-path') + ("SGraph", 'sgraph-save-path') + ("Model", 'model-save-path') + + Note that the key difference between version 1.0 and 2.0 is that 2.0 of + GLPickle is that version 2.0 saves the load_sframe method while 1.0 + saves the string name for the class (which was hard-coded in) + + References + ---------- + - Python Pickle Docs(https://docs.python.org/2/library/_pickle.html) + """ + # If the object is a GL class. + if _is_gl_pickle_extensible(obj): if (id(obj) in self.gl_object_memo): # has already been pickled return (None, None, id(obj)) else: - # Save the location of the GLC object's archive to the pickle file. + # Save the location of the object's archive to the pickle file. 
relative_filename = str(_uuid.uuid4()) - filename = _os.path.join(self.gl_temp_storage_path, relative_filename) + filename = _os.path.join(self.gl_temp_storage_path, + relative_filename) self.mark_for_delete -= set([filename]) - # Save the GLC object - obj.save(filename) + # Save the object + print "Obj = %s" % obj + print "Type = %s" % type(obj) + obj.__gl_pickle_save__(filename) # Memoize. self.gl_object_memo.add(id(obj)) - # Return the tuple (class_name, relative_filename) in archive. - return (_get_gl_class_type(obj.__class__), relative_filename, id(obj)) + # Return the tuple (load_func, relative_filename) in archive. + return (obj.__gl_pickle_load__, relative_filename, id(obj)) # Not a GLC object. Default to cloud pickle else: @@ -387,12 +385,9 @@ def close(self): for f in self.mark_for_delete: error = [False] - def register_error(*args): error[0] = True - _shutil.rmtree(f, onerror = register_error) - if error[0]: _atexit.register(_shutil.rmtree, f, ignore_errors=True) @@ -438,6 +433,7 @@ def __init__(self, filename): self.tmp_file = None self.file = None self.gl_temp_storage_path = _get_tmp_file_location() + self.version = None # GLC 1.3 used Zipfiles for storing the objects. 
self.directory_mode = True @@ -447,11 +443,13 @@ def __init__(self, filename): # GLC 1.3 uses zipfiles if _file_util._is_valid_s3_key(filename): _file_util.download_from_s3(filename, self.tmp_file, \ - aws_credentials = _get_aws_credentials(), is_dir=False, silent=True) + aws_credentials = _get_aws_credentials(), is_dir=False, + silent=True) # GLC 1.4 uses directories else: _file_util.download_from_s3(filename, self.tmp_file, \ - aws_credentials = _get_aws_credentials(), is_dir=True, silent=True) + aws_credentials = _get_aws_credentials(), is_dir=True, + silent=True) filename = self.tmp_file elif _file_util.is_hdfs_path(filename): @@ -495,9 +493,18 @@ def __init__(self, filename): self.directory_mode = True pickle_filename = _os.path.join(filename, "pickle_archive") if not _os.path.exists(pickle_filename): - raise IOError("Corrupted archive: Missing pickle file %s." % pickle_filename) + raise IOError("Corrupted archive: Missing pickle file %s." \ + % pickle_filename) if not _os.path.exists(_os.path.join(filename, "version")): raise IOError("Corrupted archive: Missing version file.") + try: + version_filename = _os.path.join(filename, "version") + self.version = open(version_filename).read().strip() + except: + raise IOError("Corrupted archive: Corrupted version file.") + if self.version not in ["1.0", "2.0"]: + raise Exception( + "Corrupted archive: Version string must be in [1.0, 2.0]") self.pickle_filename = pickle_filename self.gl_temp_storage_path = _os.path.abspath(filename) @@ -514,7 +521,8 @@ def persistent_load(self, pid): """ Reconstruct a GLC object using the persistent ID. - This method should not be used externally. It is required by the unpickler super class. + This method should not be used externally. It is required by the + unpickler super class. Parameters ---------- @@ -525,18 +533,25 @@ def persistent_load(self, pid): The GLC object. 
""" if len(pid) == 2: - # Pre GLC-1.3 release behavior, without memorization + # Pre GLC-1.3 release behavior, without memoization type_tag, filename = pid abs_path = _os.path.join(self.gl_temp_storage_path, filename) return _get_gl_object_from_persistent_id(type_tag, abs_path) else: - # Post GLC-1.3 release behavior, with memorization + # Post GLC-1.3 release behavior, with memoization type_tag, filename, object_id = pid if object_id in self.gl_object_memo: return self.gl_object_memo[object_id] else: abs_path = _os.path.join(self.gl_temp_storage_path, filename) - obj = _get_gl_object_from_persistent_id(type_tag, abs_path) + if self.version in ["1.0", None]: + obj = _get_gl_object_from_persistent_id(type_tag, abs_path) + elif self.version == "2.0": + obj = type_tag(abs_path) + else: + raise Exception( + "Unknown version %s: Expected version in [1.0, 2.0]" \ + % self.version) self.gl_object_memo[object_id] = obj return obj diff --git a/oss_src/unity/python/sframe/data_structures/sarray.py b/oss_src/unity/python/sframe/data_structures/sarray.py index 1be965ca..0b9d3598 100644 --- a/oss_src/unity/python/sframe/data_structures/sarray.py +++ b/oss_src/unity/python/sframe/data_structures/sarray.py @@ -3917,3 +3917,10 @@ def cumulative_var(self): from .. 
import extensions agg_op = "__builtin__cum_var__" return SArray(_proxy = self.__proxy__.builtin_cumulative_aggregate(agg_op)) + + def __gl_pickle_save__(self, filename): + self.save(filename) + + @staticmethod + def __gl_pickle_load__(filename): + return SArray(filename) diff --git a/oss_src/unity/python/sframe/data_structures/sframe.py b/oss_src/unity/python/sframe/data_structures/sframe.py index dfbd1b90..75d0678c 100644 --- a/oss_src/unity/python/sframe/data_structures/sframe.py +++ b/oss_src/unity/python/sframe/data_structures/sframe.py @@ -2428,7 +2428,7 @@ def to_sql(self, conn, table_name, dbapi_module=None, } get_sql_param = sql_param[mod_info['paramstyle']] - + # form insert string ins_str = "INSERT INTO " + str(table_name) value_str = " VALUES (" @@ -6180,3 +6180,10 @@ def __proxy__(self, value): self._cache = None self._proxy = value self._cache = None + + def __gl_pickle_save__(self, filename): + self.save(filename) + + @staticmethod + def __gl_pickle_load__(filename): + return load_sframe(filename) diff --git a/oss_src/unity/python/sframe/data_structures/sgraph.py b/oss_src/unity/python/sframe/data_structures/sgraph.py index 96035014..19b3896f 100644 --- a/oss_src/unity/python/sframe/data_structures/sgraph.py +++ b/oss_src/unity/python/sframe/data_structures/sgraph.py @@ -1166,19 +1166,19 @@ def show(self, vlabel=None, vlabel_hover=False, vcolor=[0.522, 0.741, 0.], >>> g.show(highlight=[2, 3], vlabel='id', arrows=True) """ from ..visualization.show import show - show(self, - vlabel=vlabel, - vlabel_hover=vlabel_hover, + show(self, + vlabel=vlabel, + vlabel_hover=vlabel_hover, vcolor=vcolor, - highlight=highlight, - highlight_color=highlight_color, + highlight=highlight, + highlight_color=highlight_color, node_size=node_size, - elabel=elabel, - elabel_hover=elabel_hover, + elabel=elabel, + elabel_hover=elabel_hover, ecolor=ecolor, - ewidth=ewidth, - v_offset=v_offset, - h_offset=h_offset, + ewidth=ewidth, + v_offset=v_offset, + h_offset=h_offset, 
arrows=arrows, vertex_positions=vertex_positions) @@ -1266,6 +1266,13 @@ def get_neighborhood(self, ids, radius=1, full_subgraph=True): g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id') return g + def __gl_pickle_save__(self, filename): + self.save(filename) + + @staticmethod + def __gl_pickle_load__(filename): + return load_sgraph(filename) + #/**************************************************************************/ #/* */ diff --git a/oss_src/unity/python/sframe/test/test_gl_pickler.py b/oss_src/unity/python/sframe/test/test_gl_pickler.py index 3aa3ad1b..448e86b4 100644 --- a/oss_src/unity/python/sframe/test/test_gl_pickler.py +++ b/oss_src/unity/python/sframe/test/test_gl_pickler.py @@ -5,6 +5,7 @@ import uuid import shutil import sys +from nose.tools import nottest import pickle from ..util import cloudpickle @@ -238,7 +239,8 @@ def test_save_to_s3(self): del os.environ['GRAPHLAB_UNIT_TEST'] - def _test_backward_compatibility(self): + @nottest + def test_backward_compatibility(self): # Arrange file_name = 's3://gl-internal-datasets/models/1.3/gl-pickle.gl' @@ -256,6 +258,26 @@ def _test_backward_compatibility(self): assert_sframe_equal(obj['foo-bar'][1], obj_ret['foo-bar'][1]) self.assertEqual(obj['foo-bar'][0], obj_ret['foo-bar'][0]) + @nottest + def test_backward_compatibility_v1(self): + + # Arrange + file_name = 's3://gl-internal-datasets/archives/gl-pickle-1.0.gl' + obj = {'foo': SFrame([1,2,3]), + 'bar': SFrame(), + 'foo-bar': ['foo-and-bar', SFrame([1])]} + + # Act + unpickler = gl_pickle.GLUnpickler(file_name) + obj_ret = unpickler.load() + + # Assert + assert_sframe_equal(obj['foo'], obj_ret['foo']) + assert_sframe_equal(obj['bar'], obj_ret['bar']) + assert_sframe_equal(obj['foo-bar'][1], obj_ret['foo-bar'][1]) + self.assertEqual(obj['foo-bar'][0], obj_ret['foo-bar'][0]) + + def test_save_over_previous(self): sarray_list = [ @@ -273,4 +295,43 @@ def test_save_over_previous(self): pickler.dump(obj) pickler.close() + def 
test_extensibility(self): + + class SampleClass: + def __init__(self, member): + self.member = member + + def __gl_pickle_save__(self, filename): + with open(filename, 'w') as f: + f.write(self.member) + + @staticmethod + def __gl_pickle_save__(filename): + with open(filename, 'r') as f: + member = f.read().split() + return SampleClass(member) + def __eq__(self, other): + return self.member == other.member + + test_list = [ + 1, + SFrame([1,2,3]), + SampleClass("Obj-1"), + [SampleClass("Obj-1"), SampleClass("Obj-22")], + {'one': SampleClass("Obj-1"), 'two': SampleClass("Obj-22")} + ] + for obj in test_list: + pickler = gl_pickle.GLPickler(self.filename) + pickler.dump(obj) + pickler.close() + + obj_ret = gl_pickle.GLUnpickler(self.filename).load() + if type(obj) == SFrame: + assert_sframe_equal(obj, obj_ret) + else: + self.assertEqual(obj, obj_ret) + + pickler = gl_pickle.GLPickler(self.filename) + pickler.dump(obj) + pickler.close() diff --git a/oss_src/unity/python/sframe/toolkits/_model.py b/oss_src/unity/python/sframe/toolkits/_model.py index 4b613c62..fe227ad1 100644 --- a/oss_src/unity/python/sframe/toolkits/_model.py +++ b/oss_src/unity/python/sframe/toolkits/_model.py @@ -587,3 +587,10 @@ def _is_gl_pickle_safe(cls): contain elements that are written using Python + GraphLab objects. """ return False + + def __gl_pickle_save__(self, filename): + self.save(filename) + + @staticmethod + def __gl_pickle_load__(filename): + return load_model(filename) From a314262946db113f4ebb21eb453dc4d873852ee7 Mon Sep 17 00:00:00 2001 From: srikris Date: Thu, 18 Feb 2016 11:00:14 -0800 Subject: [PATCH 2/5] Fixed a print and import issue. 
--- oss_src/unity/python/sframe/__init__.py | 4 ++-- oss_src/unity/python/sframe/_gl_pickle.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/oss_src/unity/python/sframe/__init__.py b/oss_src/unity/python/sframe/__init__.py index 1f5d2be7..bf9bb90c 100644 --- a/oss_src/unity/python/sframe/__init__.py +++ b/oss_src/unity/python/sframe/__init__.py @@ -62,8 +62,8 @@ from .version_info import version from .version_info import __VERSION__ -from _gl_pickle import GLPickler -from _gl_pickle import GLUnpickler +from ._gl_pickle import GLPickler +from ._gl_pickle import GLUnpickler class DeprecationHelper(object): def __init__(self, new_target): diff --git a/oss_src/unity/python/sframe/_gl_pickle.py b/oss_src/unity/python/sframe/_gl_pickle.py index f3f74657..8bf1ceac 100644 --- a/oss_src/unity/python/sframe/_gl_pickle.py +++ b/oss_src/unity/python/sframe/_gl_pickle.py @@ -347,8 +347,6 @@ def __gl_pickle_load__(filename): self.mark_for_delete -= set([filename]) # Save the object - print "Obj = %s" % obj - print "Type = %s" % type(obj) obj.__gl_pickle_save__(filename) # Memoize. From 154bb78c7b71e0184c8d7c67a4845f550c675c02 Mon Sep 17 00:00:00 2001 From: srikris Date: Mon, 22 Feb 2016 20:38:28 -0800 Subject: [PATCH 3/5] Modified the code to make sure imports work. 
--- oss_src/unity/python/sframe/_gl_pickle.py | 37 ++++++++++++++++------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/oss_src/unity/python/sframe/_gl_pickle.py b/oss_src/unity/python/sframe/_gl_pickle.py index 8bf1ceac..b1e492c0 100644 --- a/oss_src/unity/python/sframe/_gl_pickle.py +++ b/oss_src/unity/python/sframe/_gl_pickle.py @@ -29,6 +29,17 @@ def _get_temp_filename(): def _get_tmp_file_location(): return _util._make_temp_directory(prefix='gl_pickle_') + +def _get_class_from_name(module_name, class_name): + import importlib + + # load the module, will raise ImportError if module cannot be loaded + m = importlib.import_module(module_name) + + # get the class, will raise AttributeError if class cannot be found + c = getattr(m, class_name) + return c + def _is_gl_pickle_extensible(obj): """ Check if an object has an external serialization prototol. We do so by @@ -188,14 +199,13 @@ def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): """ # Zipfile # -------- - # Version 1: GLC 1.2.1 + # Version None: GLC 1.2.1 # # Directory: # ---------- # Version 1: GLC 1.4: 1 - # Version 2: SFrame 1.8.2+ (new gl_pickle extensibility mechanism) - VERSION = "2.0" + VERSION = "1.0" self.archive_filename = None self.gl_temp_storage_path = _get_tmp_file_location() self.gl_object_memo = set() @@ -305,11 +315,11 @@ def __gl_pickle_save__(self, filename): with open(filename, 'w') as f: f.write(self.member) - @staticmethod - def __gl_pickle_load__(filename): + @classmethod + def __gl_pickle_load__(cls, filename): with open(filename, 'r') as f: member = f.read().split() - return SampleClass(member) + return cls(member) WARNING: Version 1.0 and before of GLPickle only supported the following extended objects. 
@@ -500,9 +510,9 @@ def __init__(self, filename): self.version = open(version_filename).read().strip() except: raise IOError("Corrupted archive: Corrupted version file.") - if self.version not in ["1.0", "2.0"]: + if self.version not in ["1.0"]: raise Exception( - "Corrupted archive: Version string must be in [1.0, 2.0]") + "Corrupted archive: Version string must be 1.0.") self.pickle_filename = pickle_filename self.gl_temp_storage_path = _os.path.abspath(filename) @@ -543,9 +553,14 @@ def persistent_load(self, pid): else: abs_path = _os.path.join(self.gl_temp_storage_path, filename) if self.version in ["1.0", None]: - obj = _get_gl_object_from_persistent_id(type_tag, abs_path) - elif self.version == "2.0": - obj = type_tag(abs_path) + if type_tag in ["SFrame", "SGraph", "SArray", "Model"]: + obj = _get_gl_object_from_persistent_id(type_tag, + abs_path) + else: + module_name, class_name = type_tag + type_class = _get_class_from_name(module_name, + class_name) + obj = type_class.__gl_pickle_load__(abs_path) else: raise Exception( "Unknown version %s: Expected version in [1.0, 2.0]" \ From 322241773776c397628ac990b96d70063781c96d Mon Sep 17 00:00:00 2001 From: srikris Date: Tue, 23 Feb 2016 01:22:08 -0800 Subject: [PATCH 4/5] Addressed the following issues (from comments): - No bump of version number. - Loading module and then class name. --- oss_src/unity/python/sframe/_gl_pickle.py | 22 +++++++++---------- .../python/sframe/test/test_gl_pickler.py | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/oss_src/unity/python/sframe/_gl_pickle.py b/oss_src/unity/python/sframe/_gl_pickle.py index b1e492c0..25063926 100644 --- a/oss_src/unity/python/sframe/_gl_pickle.py +++ b/oss_src/unity/python/sframe/_gl_pickle.py @@ -5,9 +5,9 @@ This software may be modified and distributed under the terms of the BSD license. See the LICENSE file for details. ''' -from . 
import util as _util, toolkits as _toolkits, SFrame as _SFrame, SArray as _SArray, \ SGraph as _SGraph, load_graph as _load_graph - +from . import util as _util, toolkits as _toolkits, SFrame as \ + _SFrame, SArray as _SArray, SGraph as _SGraph, load_graph as\ + _load_graph from .util import _get_aws_credentials as _util_get_aws_credentials, \ cloudpickle as _cloudpickle, file_util as _file_util @@ -29,7 +29,6 @@ def _get_temp_filename(): def _get_tmp_file_location(): return _util._make_temp_directory(prefix='gl_pickle_') - def _get_class_from_name(module_name, class_name): import importlib @@ -184,11 +183,11 @@ def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): Parameters ---------- - filename : Name of the file to write to. This file is all you need to pickle - all objects (including GLC objects). + filename : Name of the file to write to. This file is all you need to + pickle all objects (including GLC objects). - protocol : Pickle protocol (see pickle docs). Note that all pickle protocols - may not be compatable with GLC objects. + protocol : Pickle protocol (see pickle docs). Note: All pickle + protocols may not be compatible with GLC objects. min_bytes_to_save : Cloud pickle option (see cloud pickle docs). @@ -358,12 +357,13 @@ def __gl_pickle_load__(cls, filename): # Save the object obj.__gl_pickle_save__(filename) + type_tag = (obj.__module__, obj.__class__.__name__) # Memoize. self.gl_object_memo.add(id(obj)) # Return the tuple (load_func, relative_filename) in archive. - return (obj.__gl_pickle_load__, relative_filename, id(obj)) + return (type_tag, relative_filename, id(obj)) # Not a GLC object.
Default to cloud pickle else: @@ -562,9 +562,7 @@ def persistent_load(self, pid): class_name) obj = type_class.__gl_pickle_load__(abs_path) else: - raise Exception( - "Unknown version %s: Expected version in [1.0, 2.0]" \ - % self.version) + raise Exception("Unknown version %s" % self.version) self.gl_object_memo[object_id] = obj return obj diff --git a/oss_src/unity/python/sframe/test/test_gl_pickler.py b/oss_src/unity/python/sframe/test/test_gl_pickler.py index 448e86b4..4db88ac2 100644 --- a/oss_src/unity/python/sframe/test/test_gl_pickler.py +++ b/oss_src/unity/python/sframe/test/test_gl_pickler.py @@ -239,7 +239,7 @@ def test_save_to_s3(self): del os.environ['GRAPHLAB_UNIT_TEST'] - @nottest + #@unittest.skip("Can be run locally for testing.") def test_backward_compatibility(self): # Arrange @@ -258,7 +258,7 @@ def test_backward_compatibility(self): assert_sframe_equal(obj['foo-bar'][1], obj_ret['foo-bar'][1]) self.assertEqual(obj['foo-bar'][0], obj_ret['foo-bar'][0]) - @nottest + #@unittest.skip("Can be run locally for testing.") def test_backward_compatibility_v1(self): # Arrange From 4ccfd09ad9b3575bc3ce3fa4aa2d84c254aaa52e Mon Sep 17 00:00:00 2001 From: srikris Date: Thu, 28 Apr 2016 08:32:08 -0700 Subject: [PATCH 5/5] Rebased with master. Should now work! --- oss_src/unity/python/sframe/_gl_pickle.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/oss_src/unity/python/sframe/_gl_pickle.py b/oss_src/unity/python/sframe/_gl_pickle.py index 25063926..34afe4ed 100644 --- a/oss_src/unity/python/sframe/_gl_pickle.py +++ b/oss_src/unity/python/sframe/_gl_pickle.py @@ -114,15 +114,15 @@ def _to_abs_path_set(self, l): .. 
sourcecode:: python - from graphlab.util import gl_pickle - import graphlab as gl + import sframe as gl + from sframe import GLPickler, GLUnpickler obj = {'foo': gl.SFrame([1,2,3]), 'bar': gl.SArray([1,2,3]), 'foo-bar': ['foo-and-bar', gl.SFrame()]} # Setup the GLPickler - pickler = gl_pickle.GLPickler(filename = 'foo-bar') + pickler = GLPickler(filename = 'foo-bar') pickler.dump(obj) # The pickler has to be closed to make sure the files get closed. @@ -132,7 +132,7 @@ def _to_abs_path_set(self, l): .. sourcecode:: python - unpickler = gl_pickle.GLUnpickler(filename = 'foo-bar') + unpickler = GLUnpickler(filename = 'foo-bar') obj = unpickler.load() unpickler.close() print obj @@ -143,14 +143,13 @@ def _to_abs_path_set(self, l): .. sourcecode:: python - unpickler = gl_pickle.GLUnpickler('foo-bar') + unpickler = GLUnpickler('foo-bar') obj = unpickler.load() unpickler.close() print obj - Notes - -------- + ----- The GLC pickler saves the files into single zip archive with the following file layout. @@ -173,8 +172,6 @@ def _to_abs_path_set(self, l): "gl_archive_dir_N" - - """ def __init__(self, filename, protocol = -1, min_bytes_to_save = 0): """ @@ -335,10 +332,6 @@ def __gl_pickle_load__(cls, filename): ("SGraph", 'sgraph-save-path') ("Model", 'model-save-path') - Note that the key difference between version 1.0 and 2.0 is that 2.0 of - GLPickle is that version 2.0 saves the load_sframe method while 1.0 - saves the string name for the class (which was hard-coded in) - References ---------- - Python Pickle Docs(https://docs.python.org/2/library/_pickle.html)