diff --git a/pooch/processors.py b/pooch/processors.py index dbe1a910..16670f9c 100644 --- a/pooch/processors.py +++ b/pooch/processors.py @@ -8,6 +8,7 @@ """ Post-processing hooks """ +import abc import os import bz2 import gzip @@ -19,9 +20,9 @@ from .utils import get_logger -class ExtractorProcessor: # pylint: disable=too-few-public-methods +class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods """ - Base class for extractions from compressed archives. + Abstract base class for extractions from compressed archives. Subclasses can be used with :meth:`pooch.Pooch.fetch` and :func:`pooch.retrieve` to unzip a downloaded data file into a folder in the @@ -34,16 +35,43 @@ class ExtractorProcessor: # pylint: disable=too-few-public-methods If None, will unpack all files in the archive. Otherwise, *members* must be a list of file names to unpack from the archive. Only these files will be unpacked. + extract_dir : str or None + If None, files will be unpacked to the default location (a folder in + the same location as the downloaded zip file, with a suffix added). + Otherwise, files will be unpacked to ``extract_dir``, which is + interpreted as a *relative path* (relative to the cache location + provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`). """ - # String appended to unpacked archive. To be implemented in subclass - suffix = None - def __init__(self, members=None, extract_dir=None): self.members = members self.extract_dir = extract_dir + @property + @abc.abstractmethod + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + + @abc.abstractmethod + def _all_members(self, fname): + """ + Return all the members in the archive. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + + @abc.abstractmethod + def _extract_file(self, fname, extract_dir): + """ + This method receives an argument for the archive to extract and the + destination path. + MUST BE IMPLEMENTED BY CHILD CLASSES. + """ + def __call__(self, fname, action, pooch): """ Extract all files from the given archive. @@ -69,27 +97,23 @@ def __call__(self, fname, action, pooch): A list of the full path to all files in the extracted archive. """ - if self.suffix is None and self.extract_dir is None: - raise NotImplementedError( - "Derived classes must define either a 'suffix' attribute or " - "an 'extract_dir' attribute." - ) if self.extract_dir is None: self.extract_dir = fname + self.suffix else: archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0] self.extract_dir = os.path.join(archive_dir, self.extract_dir) + # Get a list of everyone who is supposed to be in the unpacked folder + # so we can check if they are all there or if we need to extract new + # files. + if self.members is None or not self.members: + members = self._all_members(fname) + else: + members = self.members if ( (action in ("update", "download")) or (not os.path.exists(self.extract_dir)) - or ( - (self.members is not None) - and ( - not all( - os.path.exists(os.path.join(self.extract_dir, m)) - for m in self.members - ) - ) + or not all( + os.path.exists(os.path.join(self.extract_dir, m)) for m in members ) ): # Make sure that the folder with the extracted files exists @@ -111,13 +135,6 @@ def __call__(self, fname, action, pooch): return fnames - def _extract_file(self, fname, extract_dir): - """ - This method receives an argument for the archive to extract and the - destination path. MUST BE IMPLEMENTED BY CHILD CLASSES. - """ - raise NotImplementedError - class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods """ @@ -146,7 +163,18 @@ class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods """ - suffix = ".unzip" + @property + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + """ + return ".unzip" + + def _all_members(self, fname): + """Return all members from a given archive.""" + with ZipFile(fname, "r") as zip_file: + return zip_file.namelist() def _extract_file(self, fname, extract_dir): """ @@ -207,7 +235,18 @@ class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods :meth:`pooch.Pooch.fetch`). """ - suffix = ".untar" + @property + def suffix(self): + """ + String appended to unpacked archive folder name. + Only used if extract_dir is None. + """ + return ".untar" + + def _all_members(self, fname): + """Return all members from a given archive.""" + with TarFile.open(fname, "r") as tar_file: + return [info.name for info in tar_file.getmembers()] def _extract_file(self, fname, extract_dir): """ diff --git a/pooch/tests/test_processors.py b/pooch/tests/test_processors.py index 578b478d..1a2a1e2a 100644 --- a/pooch/tests/test_processors.py +++ b/pooch/tests/test_processors.py @@ -14,7 +14,7 @@ import pytest from .. import Pooch -from ..processors import Unzip, Untar, ExtractorProcessor, Decompress +from ..processors import Unzip, Untar, Decompress from .utils import pooch_test_url, pooch_test_registry, check_tiny_data, capture_log @@ -97,22 +97,6 @@ def test_decompress_fails(): assert "pooch.Unzip/Untar" in exception.value.args[0] -@pytest.mark.network -def test_extractprocessor_fails(): - "The base class should be used and should fail when passed to fecth" - with TemporaryDirectory() as local_store: - # Setup a pooch in a temp dir - pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) - processor = ExtractorProcessor() - with pytest.raises(NotImplementedError) as exception: - pup.fetch("tiny-data.tar.gz", processor=processor) - assert "'suffix'" in exception.value.args[0] - processor.suffix = "tar.gz" - with pytest.raises(NotImplementedError) as exception: - pup.fetch("tiny-data.tar.gz", processor=processor) - assert not exception.value.args - - @pytest.mark.network @pytest.mark.parametrize( "target_path", [None, "some_custom_path"], ids=["default_path", "custom_path"] @@ -255,3 +239,51 @@ def _unpacking_expected_paths_and_logs(archive, members, path, name): log_lines.append(f"Extracting '{member}'") true_paths = set(true_paths) return true_paths, log_lines + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_members_then_no_members(processor_class, extension): + """ + Test that calling with valid members then without them works. + https://github.com/fatiando/pooch/issues/364 + """ + with TemporaryDirectory() as local_store: + pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) + + # Do a first fetch with an existing member + processor1 = processor_class(members=["store/tiny-data.txt"]) + filenames1 = pup.fetch("store" + extension, processor=processor1) + assert len(filenames1) == 1 + + # Do a second fetch with no members + processor2 = processor_class() + filenames2 = pup.fetch("store" + extension, processor=processor2) + assert len(filenames2) > 1 + + +@pytest.mark.network +@pytest.mark.parametrize( + "processor_class,extension", + [(Unzip, ".zip"), (Untar, ".tar.gz")], +) +def test_unpacking_wrong_members_then_no_members(processor_class, extension): + """ + Test that calling with invalid members then without them works. + https://github.com/fatiando/pooch/issues/364 + """ + with TemporaryDirectory() as local_store: + pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY) + + # Do a first fetch with incorrect member + processor1 = processor_class(members=["not-a-valid-file.csv"]) + filenames1 = pup.fetch("store" + extension, processor=processor1) + assert len(filenames1) == 0 + + # Do a second fetch with no members + processor2 = processor_class() + filenames2 = pup.fetch("store" + extension, processor=processor2) + assert len(filenames2) > 0