From 1b560508097d737379694885017acfa467208bc5 Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Sat, 17 Jun 2023 12:31:56 +1000 Subject: [PATCH 1/9] Add failing test for #364 --- pooch/tests/test_processors.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 578b478d..cbe85a2d 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -255,3 +255,28 @@ def _unpacking_expected_paths_and_logs(archive, members, path, name): log_lines.append(f"Extracting '{member}'") true_paths = set(true_paths) return true_paths, log_lines + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_members_then_no_members(processor_class, extension): + """Test that calling with members then without them works. + + https://github.com/fatiando/pooch/issues/364 + """ + + with TemporaryDirectory() as local_store: + pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) + + # Do a first fetch with incorrect member + processor1 = processor_class(members=["i don't exist"]) + filenames1 = pup.fetch("store" + extension, processor=processor1) + assert len(filenames1) == 0 + + # Do a second fetch with no members + processor2 = processor_class() + filenames2 = pup.fetch("store" + extension, processor=processor2) + assert len(filenames2) > 0 From b6d53d30fae737180f96747e15c6b44953c3bd02 Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Sat, 17 Jun 2023 12:57:26 +1000 Subject: [PATCH 2/9] Add _all_members method for when members=None --- pooch/processors.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index dbe1a910..b5530a40 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -44,6 +44,13 @@ def __init__(self, members=None, extract_dir=None): self.members = members self.extract_dir = extract_dir + def _all_members(self, fname): + """Return all the members in the archive. + + In the base class, this returns None. + """ + return None + def __call__(self, fname, action, pooch): """ Extract all files from the given archive. @@ -79,17 +86,14 @@ def __call__(self, fname, action, pooch): else: archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0] self.extract_dir = os.path.join(archive_dir, self.extract_dir) + if self.members is None: + self.members = self._all_members(fname) if ( (action in ("update", "download")) or (not os.path.exists(self.extract_dir)) - or ( - (self.members is not None) - and ( - not all( - os.path.exists(os.path.join(self.extract_dir, m)) - for m in self.members - ) - ) + or not all( + os.path.exists(os.path.join(self.extract_dir, m)) + for m in self.members ) ): # Make sure that the folder with the extracted files exists @@ -148,6 +152,11 @@ class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods suffix = ".unzip" + def _all_members(self, fname): + """Return all members from a given archive.""" + with ZipFile(fname, "r") as zip_file: + return zip_file.namelist() + def _extract_file(self, fname, extract_dir): """ This method receives an argument for the archive to extract and the @@ -209,6 +218,11 @@ class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods suffix = ".untar" + def _all_members(self, fname): + """Return all members from a given archive.""" + with TarFile.open(fname, "r") as tar_file: + return [info.name for info in tar_file.getmembers()] + def _extract_file(self, fname, extract_dir): """ This method receives an argument for the archive to extract and the From 8a9e3bc6925ee6874a86e2ee2c53b68fd0bbf1d1 Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Mon, 30 Oct 2023 19:19:40 +1100 Subject: [PATCH 3/9] Use local members variable instead of modifying attribute --- pooch/processors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index b5530a40..0f82e513 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -86,14 +86,13 @@ def __call__(self, fname, action, pooch): else: archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0] self.extract_dir = os.path.join(archive_dir, self.extract_dir) - if self.members is None: - self.members = self._all_members(fname) + members = self.members or self._all_members(fname) if ( (action in ("update", "download")) or (not os.path.exists(self.extract_dir)) or not all( os.path.exists(os.path.join(self.extract_dir, m)) - for m in self.members + for m in members ) ): # Make sure that the folder with the extracted files exists From f034b72d9153dc15aede97b1642244f6764699a6 Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Wed, 1 Nov 2023 00:42:27 +1100 Subject: [PATCH 4/9] Use existing file, since it causes same problem Co-authored-by: Leonardo Uieda --- pooch/tests/test_processors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index cbe85a2d..023e38a0 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -272,9 +272,9 @@ def test_unpacking_members_then_no_members(processor_class, extension): pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) # Do a first fetch with incorrect member - processor1 = processor_class(members=["i don't exist"]) + processor1 = processor_class(members=["tiny-data.txt"]) filenames1 = pup.fetch("store" + extension, processor=processor1) - assert len(filenames1) == 0 + assert len(filenames1) == 1 # Do a second fetch with no members processor2 = processor_class() From 89a3056b5e8d9733852fbe3c85a6324785d38acf Mon Sep 17 00:00:00 2001 From: Juan Nunez-Iglesias Date: Wed, 1 Nov 2023 00:46:49 +1100 Subject: [PATCH 5/9] Format file with black --- pooch/processors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index 0f82e513..483a4a0f 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -91,8 +91,7 @@ def __call__(self, fname, action, pooch): (action in ("update", "download")) or (not os.path.exists(self.extract_dir)) or not all( - os.path.exists(os.path.join(self.extract_dir, m)) - for m in members + os.path.exists(os.path.join(self.extract_dir, m)) for m in members ) ): # Make sure that the folder with the extracted files exists From 7d3db888e205d682440b36f75f9569bb2284f801 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 12 Feb 2024 08:46:48 -0300 Subject: [PATCH 6/9] Break line on base all_members docstring --- pooch/processors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pooch/processors.py b/pooch/processors.py index 483a4a0f..d51618f9 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -45,7 +45,8 @@ def __init__(self, members=None, extract_dir=None): self.extract_dir = extract_dir def _all_members(self, fname): - """Return all the members in the archive. + """ + Return all the members in the archive. In the base class, this returns None. """ From 4d576d0d1b84ce2bd9a634f1a047b33c2cdcb349 Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 19 Feb 2024 10:28:12 -0300 Subject: [PATCH 7/9] Use ABC and abstractmethod in ExtractorProcessor This way we don't have to test against using it since it can't be instantiated directly. Make _all_members and _extract_file abstract methods. --- pooch/processors.py | 21 +++++++++++---------- pooch/tests/test_processors.py | 18 +----------------- 2 files changed, 12 insertions(+), 27 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index d51618f9..e3a2754f 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -8,6 +8,7 @@ """ Post-processing hooks """ +import abc import os import bz2 import gzip @@ -19,7 +20,7 @@ from .utils import get_logger -class ExtractorProcessor: # pylint: disable=too-few-public-methods +class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods """ Base class for extractions from compressed archives. @@ -44,13 +45,20 @@ def __init__(self, members=None, extract_dir=None): self.members = members self.extract_dir = extract_dir + @abc.abstractmethod def _all_members(self, fname): """ Return all the members in the archive. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ - In the base class, this returns None. + @abc.abstractmethod + def _extract_file(self, fname, extract_dir): + """ + This method receives an argument for the archive to extract and the + destination path. + MUST BE IMPLEMENTED BY CHILD CLASSES. """ - return None def __call__(self, fname, action, pooch): """ @@ -114,13 +122,6 @@ def __call__(self, fname, action, pooch): return fnames - def _extract_file(self, fname, extract_dir): - """ - This method receives an argument for the archive to extract and the - destination path. MUST BE IMPLEMENTED BY CHILD CLASSES. - """ - raise NotImplementedError - class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods """ diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 023e38a0..42f885b2 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -14,7 +14,7 @@ import pytest from .. import Pooch -from ..processors import Unzip, Untar, ExtractorProcessor, Decompress +from ..processors import Unzip, Untar, Decompress from .utils import pooch_test_url, pooch_test_registry, check_tiny_data, capture_log @@ -97,22 +97,6 @@ def test_decompress_fails(): assert "pooch.Unzip/Untar" in exception.value.args[0] -@pytest.mark.network -def test_extractprocessor_fails(): - "The base class should be used and should fail when passed to fecth" - with TemporaryDirectory() as local_store: - # Setup a pooch in a temp dir - pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) - processor = ExtractorProcessor() - with pytest.raises(NotImplementedError) as exception: - pup.fetch("tiny-data.tar.gz", processor=processor) - assert "'suffix'" in exception.value.args[0] - processor.suffix = "tar.gz" - with pytest.raises(NotImplementedError) as exception: - pup.fetch("tiny-data.tar.gz", processor=processor) - assert not exception.value.args - - @pytest.mark.network @pytest.mark.parametrize( "target_path", [None, "some_custom_path"], ids=["default_path", "custom_path"] From fab577fe147e74365a765f4a646f3591ee706f5a Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 19 Feb 2024 11:09:06 -0300 Subject: [PATCH 8/9] Add a check for calling with an existing member --- pooch/processors.py | 8 +++++++- pooch/tests/test_processors.py | 31 +++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index e3a2754f..ea9aa2aa 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -95,7 +95,13 @@ def __call__(self, fname, action, pooch): else: archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0] self.extract_dir = os.path.join(archive_dir, self.extract_dir) - members = self.members or self._all_members(fname) + # Get a list of everyone who is supposed to be in the unpacked folder + # so we can check if they are all there or if we need to extract new + # files. + if self.members is None or not self.members: + members = self._all_members(fname) + else: + members = self.members if ( (action in ("update", "download")) or (not os.path.exists(self.extract_dir)) diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 42f885b2..1a2a1e2a 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -247,18 +247,41 @@ def _unpacking_expected_paths_and_logs(archive, members, path, name): [(Unzip, ".zip"), (Untar, ".tar.gz")], ) def test_unpacking_members_then_no_members(processor_class, extension): - """Test that calling with members then without them works. - + """ + Test that calling with valid members then without them works. https://github.com/fatiando/pooch/issues/364 """ + with TemporaryDirectory() as local_store: + pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) + + # Do a first fetch with an existing member + processor1 = processor_class(members=["store/tiny-data.txt"]) + filenames1 = pup.fetch("store" + extension, processor=processor1) + assert len(filenames1) == 1 + + # Do a second fetch with no members + processor2 = processor_class() + filenames2 = pup.fetch("store" + extension, processor=processor2) + assert len(filenames2) > 1 + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_wrong_members_then_no_members(processor_class, extension): + """ + Test that calling with invalid members then without them works. + https://github.com/fatiando/pooch/issues/364 + """ with TemporaryDirectory() as local_store: pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) # Do a first fetch with incorrect member - processor1 = processor_class(members=["tiny-data.txt"]) + processor1 = processor_class(members=["not-a-valid-file.csv"]) filenames1 = pup.fetch("store" + extension, processor=processor1) - assert len(filenames1) == 1 + assert len(filenames1) == 0 # Do a second fetch with no members processor2 = processor_class() From e59157b1ea4f7d7c6304a0cbc43c859da791da3a Mon Sep 17 00:00:00 2001 From: Leonardo Uieda Date: Mon, 19 Feb 2024 11:37:58 -0300 Subject: [PATCH 9/9] Make the suffix an abstract property Make it required to be implemented by child classes so we don't have to check if it's None. --- pooch/processors.py | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/pooch/processors.py b/pooch/processors.py index ea9aa2aa..16670f9c 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -22,7 +22,7 @@ class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods """ - Base class for extractions from compressed archives. + Abstract base class for extractions from compressed archives. Subclasses can be used with :meth:`pooch.Pooch.fetch` and :func:`pooch.retrieve` to unzip a downloaded data file into a folder in the @@ -35,16 +35,28 @@ class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods If None, will unpack all files in the archive. Otherwise, *members* must be a list of file names to unpack from the archive. Only these files will be unpacked. + extract_dir : str or None + If None, files will be unpacked to the default location (a folder in + the same location as the downloaded zip file, with a suffix added). + Otherwise, files will be unpacked to ``extract_dir``, which is + interpreted as a *relative path* (relative to the cache location + provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`). """ - # String appended to unpacked archive. To be implemented in subclass - suffix = None - def __init__(self, members=None, extract_dir=None): self.members = members self.extract_dir = extract_dir + @property + @abc.abstractmethod + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + @abc.abstractmethod def _all_members(self, fname): """ @@ -85,11 +97,6 @@ def __call__(self, fname, action, pooch): A list of the full path to all files in the extracted archive. """ - if self.suffix is None and self.extract_dir is None: - raise NotImplementedError( - "Derived classes must define either a 'suffix' attribute or " - "an 'extract_dir' attribute." - ) if self.extract_dir is None: self.extract_dir = fname + self.suffix else: @@ -156,7 +163,13 @@ class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods """ - suffix = ".unzip" + @property + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + """ + return ".unzip" def _all_members(self, fname): """Return all members from a given archive.""" @@ -222,7 +235,13 @@ class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods :meth:`pooch.Pooch.fetch`). """ - suffix = ".untar" + @property + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + """ + return ".untar" def _all_members(self, fname): """Return all members from a given archive."""