Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 49 additions & 19 deletions src/dbetto/textdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def __init__(

self.__store__ = AttrsDict()
self.__ftypes__ = {"json", "yaml"}
self.__validity_file__ = None
self.__catalog__ = None

if not self.__lazy__:
self.scan()
Expand All @@ -123,6 +125,8 @@ def reset(self, rescan: bool = True) -> None:
changes at runtime.
"""
self.__store__ = AttrsDict()
self.__validity_file__ = None
self.__catalog__ = None

if rescan and not self.__lazy__:
self.scan()
Expand Down Expand Up @@ -157,6 +161,42 @@ def keys(self) -> list[str]:
def items(self) -> Iterator[(str, TextDB | AttrsDict | list)]:
return self.__store__.items()

def _find_validity_file(self) -> None:
    """Locate the validity file and cache both its path and parsed catalog.

    Probes ``self.__path__`` for a ``validity.<ext>`` file, trying every
    supported extension plus ``.jsonl``. The first hit wins: its path is
    stored in ``__validity_file__`` and the parsed catalog in
    ``__catalog__``, so later ``.on()`` calls can skip the filesystem
    probing and re-parsing entirely.

    Raises
    ------
    RuntimeError
        If the directory contains no supported validity file.
    """
    supported = [*list(self.__extensions__), ".jsonl"]
    found = None
    for suffix in supported:
        path = self.__path__ / f"validity{suffix}"
        if not path.is_file():
            continue
        if found is None:
            # First match: remember it and keep scanning so we can
            # detect (and warn about) ambiguous setups.
            found = path
        else:
            msg = (
                "multiple supported validity files found, "
                f"will use the first one of {supported}"
            )
            log.warning(msg)
            break

    if found is None:
        msg = f"no validity.* file found in {self.__path__!s}"
        raise RuntimeError(msg)

    self.__validity_file__ = found
    # cache the parsed catalog so .on() never re-reads the file
    self.__catalog__ = Catalog.read_from(found)

def on(
self, timestamp: str | datetime, pattern: str | None = None, system: str = "all"
) -> AttrsDict | list:
Expand All @@ -183,26 +223,12 @@ def on(
system
query only a data taking "system" (e.g. 'all', 'phy', 'cal', 'lar', ...)
"""
_extensions = [*list(self.__extensions__), ".jsonl"]
validity_file = None
for ext in _extensions:
candidate = self.__path__ / f"validity{ext}"
if candidate.is_file():
if validity_file is not None:
msg = (
"multiple supported validity files found, "
"will use the first on of {_extensions}"
)
log.warning(msg)
break
validity_file = candidate

if validity_file is None:
msg = f"no validity.* file found in {self.__path__!s}"
raise RuntimeError(msg)
# Use cached validity file and catalog if available, otherwise find/parse it
if self.__validity_file__ is None:
self._find_validity_file()

# parse validity file and return requested files
file_list = Catalog.get_files(str(validity_file), timestamp, system)
# Use the cached catalog instead of re-parsing the validity file
file_list = self.__catalog__.valid_for(timestamp, system)

# select only files matching pattern, if specified
if pattern is not None:
Expand Down Expand Up @@ -384,6 +410,8 @@ def __getstate__(self) -> dict:
"__hidden__": self.__hidden__,
"__ftypes__": self.__ftypes__,
"__store__": self.__store__,
"__validity_file__": self.__validity_file__,
"__catalog__": self.__catalog__,
}

def __setstate__(self, state: dict) -> None:
Expand All @@ -401,6 +429,8 @@ def __setstate__(self, state: dict) -> None:
else state["__ftypes__"]
)
self.__store__ = state["__store__"]
self.__validity_file__ = state.get("__validity_file__")
self.__catalog__ = state.get("__catalog__")

def __contains__(self, value: str) -> bool:
return self.__store__.__contains__(value)
Expand Down
42 changes: 42 additions & 0 deletions tests/test_textdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,13 @@ def test_scan():
jdb.scan(recursive=True)

assert sorted(jdb.__dict__.keys()) == [
"__catalog__",
"__ftypes__",
"__hidden__",
"__lazy__",
"__path__",
"__store__",
"__validity_file__",
"arrays",
"dir1",
"dir2",
Expand All @@ -149,11 +151,13 @@ def test_scan():
jdb.scan(recursive=False)

assert sorted(jdb.__dict__.keys()) == [
"__catalog__",
"__ftypes__",
"__hidden__",
"__lazy__",
"__path__",
"__store__",
"__validity_file__",
"arrays",
"file1",
"file2",
Expand All @@ -164,11 +168,13 @@ def test_scan():
jdb.scan(recursive=False, subdir="dir1")

assert sorted(jdb.__dict__.keys()) == [
"__catalog__",
"__ftypes__",
"__hidden__",
"__lazy__",
"__path__",
"__store__",
"__validity_file__",
"dir1",
]

Expand Down Expand Up @@ -292,31 +298,37 @@ def test_lazyness():
jdb = TextDB(testdb, lazy="auto")
assert jdb.__lazy__ is True
assert sorted(jdb.__dict__.keys()) == [
"__catalog__",
"__ftypes__",
"__hidden__",
"__lazy__",
"__path__",
"__store__",
"__validity_file__",
]

jdb = TextDB(testdb, lazy=True)
assert jdb.__lazy__ is True
assert sorted(jdb.__dict__.keys()) == [
"__catalog__",
"__ftypes__",
"__hidden__",
"__lazy__",
"__path__",
"__store__",
"__validity_file__",
]

jdb = TextDB(testdb, lazy=False)
assert jdb.__lazy__ is False
assert sorted(jdb.__dict__.keys()) == [
"__catalog__",
"__ftypes__",
"__hidden__",
"__lazy__",
"__path__",
"__store__",
"__validity_file__",
"arrays",
"dir1",
"dir2",
Expand All @@ -338,3 +350,33 @@ def test_hidden():

assert isinstance(jdb.dir2, TextDB)
assert getattr(jdb.dir2, "__hidden__", False) is True


def test_validity_file_caching():
    """Validity file path and catalog are cached by .on() and cleared by reset()."""
    jdb = TextDB(testdb, lazy=False)

    # The root db has not resolved any validity file yet.
    assert jdb.__validity_file__ is None
    assert jdb.__catalog__ is None

    # A first .on() query populates both caches on the queried sub-db.
    jdb.dir1.on("20230101T000001Z")
    assert jdb.dir1.__validity_file__ is not None
    assert jdb.dir1.__catalog__ is not None

    # The cached path points at the expected file in the sub-db directory.
    assert jdb.dir1.__validity_file__.name == "validity.yaml"
    assert jdb.dir1.__validity_file__.parent == jdb.dir1.__path__

    # A second query must reuse the very same cached objects (no re-read).
    file_before = jdb.dir1.__validity_file__
    catalog_before = jdb.dir1.__catalog__
    jdb.dir1.on("20230102T000000Z")
    assert jdb.dir1.__validity_file__ is file_before
    assert jdb.dir1.__catalog__ is catalog_before

    # reset() drops both caches again.
    jdb.dir1.reset()
    assert jdb.dir1.__validity_file__ is None
    assert jdb.dir1.__catalog__ is None