def _find_validity_file(self) -> None:
    """Locate the validity file, parse it, and cache both path and catalog.

    The database directory ``self.__path__`` is searched for a file named
    ``validity<ext>``, trying every extension in ``self.__extensions__``
    followed by ``.jsonl``. The first existing candidate is used; if a
    second one exists as well, a warning is logged and the search stops.

    The selected file is parsed with ``Catalog.read_from`` *before* anything
    is stored on the instance, so ``__validity_file__`` and ``__catalog__``
    are always set together. If parsing raises, neither attribute is
    modified and the next call starts from scratch.

    Raises
    ------
    RuntimeError
        If no validity file is found in the database directory.
    """
    _extensions = [*self.__extensions__, ".jsonl"]
    validity_file = None
    for ext in _extensions:
        candidate = self.__path__ / f"validity{ext}"
        if not candidate.is_file():
            continue

        if validity_file is not None:
            # a second match: keep the first one and stop searching
            msg = (
                "multiple supported validity files found, "
                f"will use the first one of {_extensions}"
            )
            log.warning(msg)
            break
        validity_file = candidate

    if validity_file is None:
        msg = f"no validity.* file found in {self.__path__!s}"
        raise RuntimeError(msg)

    # Parse first, cache afterwards: .on() decides whether to call this
    # method by looking at __validity_file__ only, so a path cached without
    # its catalog would make every later query dereference a None catalog.
    catalog = Catalog.read_from(validity_file)
    self.__validity_file__ = validity_file
    self.__catalog__ = catalog
self.__validity_file__, + "__catalog__": self.__catalog__, } def __setstate__(self, state: dict) -> None: @@ -401,6 +429,8 @@ def __setstate__(self, state: dict) -> None: else state["__ftypes__"] ) self.__store__ = state["__store__"] + self.__validity_file__ = state.get("__validity_file__") + self.__catalog__ = state.get("__catalog__") def __contains__(self, value: str) -> bool: return self.__store__.__contains__(value) diff --git a/tests/test_textdb.py b/tests/test_textdb.py index 7e7d4f5..6fefcd5 100644 --- a/tests/test_textdb.py +++ b/tests/test_textdb.py @@ -132,11 +132,13 @@ def test_scan(): jdb.scan(recursive=True) assert sorted(jdb.__dict__.keys()) == [ + "__catalog__", "__ftypes__", "__hidden__", "__lazy__", "__path__", "__store__", + "__validity_file__", "arrays", "dir1", "dir2", @@ -149,11 +151,13 @@ def test_scan(): jdb.scan(recursive=False) assert sorted(jdb.__dict__.keys()) == [ + "__catalog__", "__ftypes__", "__hidden__", "__lazy__", "__path__", "__store__", + "__validity_file__", "arrays", "file1", "file2", @@ -164,11 +168,13 @@ def test_scan(): jdb.scan(recursive=False, subdir="dir1") assert sorted(jdb.__dict__.keys()) == [ + "__catalog__", "__ftypes__", "__hidden__", "__lazy__", "__path__", "__store__", + "__validity_file__", "dir1", ] @@ -292,31 +298,37 @@ def test_lazyness(): jdb = TextDB(testdb, lazy="auto") assert jdb.__lazy__ is True assert sorted(jdb.__dict__.keys()) == [ + "__catalog__", "__ftypes__", "__hidden__", "__lazy__", "__path__", "__store__", + "__validity_file__", ] jdb = TextDB(testdb, lazy=True) assert jdb.__lazy__ is True assert sorted(jdb.__dict__.keys()) == [ + "__catalog__", "__ftypes__", "__hidden__", "__lazy__", "__path__", "__store__", + "__validity_file__", ] jdb = TextDB(testdb, lazy=False) assert jdb.__lazy__ is False assert sorted(jdb.__dict__.keys()) == [ + "__catalog__", "__ftypes__", "__hidden__", "__lazy__", "__path__", "__store__", + "__validity_file__", "arrays", "dir1", "dir2", @@ -338,3 +350,33 @@ def 
def test_validity_file_caching():
    """Check that ``.on()`` caches the validity file path and parsed catalog."""
    jdb = TextDB(testdb, lazy=False)

    # nothing has been queried on the root database: both caches are empty
    assert jdb.__catalog__ is None
    assert jdb.__validity_file__ is None

    subdb = jdb.dir1

    # the first query on the dir1 sub-database fills both caches
    subdb.on("20230101T000001Z")
    first_file = subdb.__validity_file__
    first_catalog = subdb.__catalog__
    assert first_file is not None
    assert first_catalog is not None

    # the cached path is dir1's own validity.yaml
    assert first_file.parent == subdb.__path__
    assert first_file.name == "validity.yaml"

    # a second query must reuse the very same cached objects
    subdb.on("20230102T000000Z")
    assert subdb.__validity_file__ is first_file
    assert subdb.__catalog__ is first_catalog

    # reset() empties both caches again
    subdb.reset()
    assert subdb.__validity_file__ is None
    assert subdb.__catalog__ is None