diff --git a/MAINTAINER.md b/MAINTAINER.md index 1da6a43c..3b37ae52 100644 --- a/MAINTAINER.md +++ b/MAINTAINER.md @@ -1,6 +1,6 @@ # Maintainer notes -This document is intended for users contributing/manitaining this repository. +This document is intended for users contributing/maintaining this repository. It is not comprehensive, but aims to capture items relevant to architecture that are not covered in another document. @@ -10,7 +10,7 @@ Since these terms are used somewhat haphazardly in different database implementa we'll quickly define them for the purposes of this document: - database - a single instance of a database product, local or in the cloud. it can -contain serveral schemas. +contain several schemas. - schema - a namespaced collection of tables inside a database Cumulus, as a holistic system, is designed to allow querying against the entire history @@ -36,4 +36,58 @@ via cross-database joining. Additional tables should not be created by users in schemas. A user could elect to use these vocabulary builders and skip the entire rest of the -Cumulus ecosystem, if they wanted to. \ No newline at end of file +Cumulus ecosystem, if they wanted to. + +## Advanced study features + +These features are for very narrow and advanced use cases, +designed for internal project studies (like `core`, `discovery`, or `data_metrics`). + +### Dynamic prefixes + +The `data_metrics` study wants to be able to generate an analysis of a single study cohort's data. +In order to do this, it needs to privately namespace that analysis. + +The solution we use for this is to allow a study to dynamically generate the prefix it will use. +Thus, the `data_metrics` study can use a prefix like `data_metrics_hypertension__` for a +`hypertension` study and `data_metrics_covid__` for a `covid` study. + +#### Config +Add a field called `dynamic_study_prefix` pointing at a local Python file. +If this field is present, any `study_prefix` field is ignored. +```toml +dynamic_study_prefix = "gen_prefix.py" +``` + +#### Generator file + +Your generator file will be called as a Python script, +with the `--option` arguments that Cumulus Library gets +(but without the `--option` bit). +You should ignore unknown options, for forward compatibility. + +You should print your desired prefix to stdout. + +Example: +```python +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--study") +args, _rest = parser.parse_known_args() + +if args.study: + print(f"data_metrics_{args.study}") +else: + print("data_metrics") +``` + +#### Usage + +Your study still has to be selected using its original name (`--target=data_metrics`), +but the resulting tables will be prefixed using the generated study name. + +This command line would build a `data_metrics_hypertension` study: +```sh +cumulus-library build -t data_metrics --option study:hypertension +``` diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 0e4b410b..dc720c99 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -48,7 +48,7 @@ def _load_and_execute_builder( :keyword filename: filename of a module implementing a TableBuilder :keyword db_parser: an object implementing DatabaseParser for the target database :keyword write_reference_sql: if true, writes sql to disk inside a study's directory - :keyword doc_string: A string to insert between queries written to disk + :keyword doc_str: A string to insert between queries written to disk """ # Since we have to support arbitrary user-defined python files here, we diff --git a/cumulus_library/builders/counts.py b/cumulus_library/builders/counts.py index ef649cfb..fb222e96 100644 --- a/cumulus_library/builders/counts.py +++ b/cumulus_library/builders/counts.py @@ -20,6 +20,7 @@ def __init__(self, study_prefix: str | None = None): study_path = Path(sys.modules[self.__module__].__file__).parent try: + # FIXME: MIKE can we pass down options? parser = study_manifest.StudyManifest(study_path) self.study_prefix = parser.get_study_prefix() except Exception as e: diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index f39f9473..22b1471c 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -37,11 +37,11 @@ class StudyRunner: verbose = False schema_name = None - def __init__(self, config: base_utils.StudyConfig, data_path: str): + def __init__(self, config: base_utils.StudyConfig, data_path: str | None): self.config = config - self.data_path = data_path + self.data_path = data_path and pathlib.Path(data_path) - def get_config(self, manifest=study_manifest.StudyManifest): + def get_config(self, manifest: study_manifest.StudyManifest): schema = base_utils.get_schema(self.config, manifest) if schema == self.config.schema: return self.config @@ -57,6 +57,7 @@ def clean_study( targets: list[str], study_dict: dict, *, + options: dict[str, str], prefix: bool = False, ) -> None: """Removes study table/views from Athena. @@ -65,8 +66,9 @@ def clean_study( this can be useful for cleaning up tables if a study prefix is changed for some reason. - :param target: The study prefix to use for IDing tables to remove + :param targets: The study prefixes to use for IDing tables to remove :param study_dict: The dictionary of available study targets + :param options: The dictionary of study-specific options :keyword prefix: If True, does a search by string prefix in place of study name """ if targets is None: @@ -77,26 +79,28 @@ def clean_study( for target in targets: if prefix: - manifest = study_manifest.StudyManifest() + manifest = study_manifest.StudyManifest(options=options) cleaner.clean_study( config=self.get_config(manifest), manifest=manifest, prefix=target ) else: - manifest = study_manifest.StudyManifest(study_dict[target]) + manifest = study_manifest.StudyManifest(study_dict[target], options=options) cleaner.clean_study(config=self.get_config(manifest), manifest=manifest) def clean_and_build_study( self, target: pathlib.Path, *, + options: dict[str, str], continue_from: str | None = None, ) -> None: """Recreates study views/tables :param target: A path to the study directory + :param options: The dictionary of study-specific options :keyword continue_from: Restart a run from a specific sql file (for dev only) """ - manifest = study_manifest.StudyManifest(target, self.data_path) + manifest = study_manifest.StudyManifest(target, self.data_path, options=options) try: builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest) if not continue_from: @@ -150,13 +154,16 @@ def run_matching_table_builder( self, target: pathlib.Path, table_builder_name: str, + *, + options: dict[str, str], ) -> None: """Runs a single table builder :param target: A path to the study directory :param table_builder_name: a builder file referenced in the study's manifest + :param options: The dictionary of study-specific options """ - manifest = study_manifest.StudyManifest(target) + manifest = study_manifest.StudyManifest(target, options=options) builder.run_matching_table_builder( config=self.get_config(manifest), manifest=manifest, @@ -164,14 +171,22 @@ def run_matching_table_builder( ) ### Data exporters - def export_study(self, target: pathlib.Path, data_path: pathlib.Path, archive: bool) -> None: + def export_study( + self, + target: pathlib.Path, + data_path: pathlib.Path, + archive: bool, + *, + options: dict[str, str], + ) -> None: """Exports aggregates defined in a manifest :param target: A path to the study directory - :param datapath: The location to export data to + :param data_path: The location to export data to :param archive: If true, will export all tables, otherwise uses manifest list + :param options: The dictionary of study-specific options """ - manifest = study_manifest.StudyManifest(target, data_path) + manifest = study_manifest.StudyManifest(target, data_path, options=options) exporter.export_study( config=self.get_config(manifest), manifest=manifest, @@ -183,14 +198,14 @@ def generate_study_sql( self, target: pathlib.Path, *, - builder: str | None = None, + options: dict[str, str], ) -> None: """Materializes study sql from templates :param target: A path to the study directory - :keyword builder: Specify a single builder to generate sql from + :param options: The dictionary of study-specific options """ - manifest = study_manifest.StudyManifest(target) + manifest = study_manifest.StudyManifest(target, options=options) file_generator.run_generate_sql(config=self.get_config(manifest), manifest=manifest) def generate_study_markdown( @@ -252,7 +267,8 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]: manifest_paths.update(get_studies_by_manifest_path(child_path)) elif child_path.name == "manifest.toml": try: - manifest = study_manifest.StudyManifest(path) + # Pass empty options, since we are not in a study-specific context + manifest = study_manifest.StudyManifest(path, options={}) manifest_paths[manifest.get_study_prefix()] = path except errors.StudyManifestParsingError as exc: rich.print(f"[bold red] Ignoring study in '{path}': {exc}") @@ -309,15 +325,19 @@ def run_cli(args: dict): targets=args["target"], study_dict=study_dict, prefix=args["prefix"], + options=args["options"], ) elif args["action"] == "build": for target in args["target"]: if args["builder"]: - runner.run_matching_table_builder(study_dict[target], args["builder"]) + runner.run_matching_table_builder( + study_dict[target], args["builder"], options=args["options"] + ) else: runner.clean_and_build_study( study_dict[target], continue_from=args["continue_from"], + options=args["options"], ) elif args["action"] == "export": @@ -337,7 +357,12 @@ def run_cli(args: dict): if response.lower() != "y": sys.exit() for target in args["target"]: - runner.export_study(study_dict[target], args["data_path"], args["archive"]) + runner.export_study( + study_dict[target], + args["data_path"], + args["archive"], + options=args["options"], + ) elif args["action"] == "import": for archive in args["archive_path"]: @@ -346,7 +371,7 @@ def run_cli(args: dict): elif args["action"] == "generate-sql": for target in args["target"]: - runner.generate_study_sql(study_dict[target], builder=args["builder"]) + runner.generate_study_sql(study_dict[target], options=args["options"]) elif args["action"] == "generate-md": for target in args["target"]: @@ -429,7 +454,7 @@ def main(cli_args=None): options = {} cli_options = args.get("options") or [] for c_arg in cli_options: - c_arg = c_arg.split(":", 2) + c_arg = c_arg.split(":", 1) if len(c_arg) == 1: sys.exit( f"Custom argument '{c_arg}' is not validly formatted.\n" diff --git a/cumulus_library/study_manifest.py b/cumulus_library/study_manifest.py index b2a89017..3ce88e49 100644 --- a/cumulus_library/study_manifest.py +++ b/cumulus_library/study_manifest.py @@ -1,6 +1,9 @@ """Class for loading study configuration data from manifest.toml files""" import pathlib +import re +import subprocess +import sys import tomllib from cumulus_library import errors @@ -9,42 +12,43 @@ class StudyManifest: """Handles interactions with study directories and manifest files""" + PREFIX_REGEX = re.compile(r"[a-zA-Z][a-zA-Z0-9_]*") + def __init__( self, study_path: pathlib.Path | None = None, data_path: pathlib.Path | None = None, + *, + options: dict[str, str] | None = None, ): """Instantiates a StudyManifest. :param study_path: A Path object pointing to the dir of the manifest, optional :param data_path: A Path object pointing to the dir to save/load ancillary files from, optional + :param options: Command line study-specific options for dynamic manifest values, optional """ + self._study_prefix = None self._study_path = None self._study_config = {} if study_path is not None: - self.load_study_manifest(study_path) + self._load_study_manifest(study_path, options or {}) self.data_path = data_path def __repr__(self): return str(self._study_config) ### toml parsing helper functions - def load_study_manifest(self, study_path: pathlib.Path) -> None: + def _load_study_manifest(self, study_path: pathlib.Path, options: dict[str, str]) -> None: """Reads in a config object from a directory containing a manifest.toml :param study_path: A pathlib.Path object pointing to a study directory + :param options: Command line study-specific options (--option=A:B) :raises StudyManifestParsingError: the manifest.toml is malformed or missing. """ try: with open(f"{study_path}/manifest.toml", "rb") as file: config = tomllib.load(file) - if not config.get("study_prefix") or not isinstance(config["study_prefix"], str): - raise errors.StudyManifestParsingError( - f"Invalid prefix in manifest at {study_path}" - ) - self._study_config = config - self._study_path = study_path except FileNotFoundError as e: raise errors.StudyManifestFilesystemError( f"Missing or invalid manifest found at {study_path}" @@ -53,12 +57,24 @@ def load_study_manifest(self, study_path: pathlib.Path) -> None: # just unify the error classes for convenience of catching them raise errors.StudyManifestParsingError(str(e)) from e + self._study_config = config + self._study_path = study_path + + if dynamic_study_prefix := config.get("dynamic_study_prefix"): + self._study_prefix = self._run_dynamic_script(dynamic_study_prefix, options) + elif config.get("study_prefix") and isinstance(config["study_prefix"], str): + self._study_prefix = config["study_prefix"] + + if not self._study_prefix or not re.fullmatch(self.PREFIX_REGEX, self._study_prefix): + raise errors.StudyManifestParsingError(f"Invalid prefix in manifest at {study_path}") + self._study_prefix = self._study_prefix.lower() + def get_study_prefix(self) -> str | None: """Reads the name of a study prefix from the in-memory study config :returns: A string of the prefix in the manifest, or None if not found """ - return self._study_config.get("study_prefix") + return self._study_prefix def get_dedicated_schema(self) -> str | None: """Reads the contents of the dedicated schema in the options dict @@ -117,13 +133,20 @@ def get_export_table_list(self) -> list[str] | None: :returns: An array of tables to export from the manifest, or None if not found. """ export_config = self._study_config.get("export_config", {}) - export_table_list = export_config.get("export_list", []) or [] - for table in export_table_list: - if not table.startswith(f"{self.get_study_prefix()}__"): + export_table_list = [] + for table in export_config.get("export_list") or []: + if table.startswith(f"{self.get_study_prefix()}__"): + export_table_list.append(table) + elif "__" in table: # has a prefix, just the wrong one raise errors.StudyManifestParsingError( f"{table} in export list does not start with prefix " f"{self.get_study_prefix()}__ - check your manifest file." ) + else: + # Add the prefix for them (helpful in dynamic prefix cases where the prefix + # is not known ahead of time) + export_table_list.append(f"{self.get_study_prefix()}__{table}") + return export_table_list def get_all_generators(self) -> list[str]: @@ -139,3 +162,19 @@ def get_prefix_with_seperator(self) -> str: if dedicated := self.get_dedicated_schema(): return f"{dedicated}." return f"{self.get_study_prefix()}__" + + ### Dynamic Python code support + + def _run_dynamic_script(self, filename: str, options: dict[str, str]) -> str: + if not sys.executable: + raise RuntimeError("Unknown Python executable, dynamic manifest values not supported") + + full_path = f"{self._study_path}/{filename}" + option_args = [f"--{key}={value}" for key, value in options.items()] + result = subprocess.run( # noqa: S603 + [sys.executable, full_path, *option_args], + check=True, + capture_output=True, + ) + + return result.stdout.decode("utf8").strip() diff --git a/tests/conftest.py b/tests/conftest.py index 6e0cda28..f602a247 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -227,6 +227,7 @@ def mock_db_core(tmp_path, mock_db): # pylint: disable=redefined-outer-name builder = cli.StudyRunner(config, data_path=f"{tmp_path}/data_path") builder.clean_and_build_study( pathlib.Path(__file__).parent.parent / "cumulus_library/studies/core", + options={}, ) yield mock_db @@ -256,6 +257,7 @@ def mock_db_stats(tmp_path): builder = cli.StudyRunner(config, data_path=f"{tmp_path}/data_path") builder.clean_and_build_study( pathlib.Path(__file__).parent.parent / "cumulus_library/studies/core", + options={}, ) yield db diff --git a/tests/test_athena.py b/tests/test_athena.py index fb140564..940b7708 100644 --- a/tests/test_athena.py +++ b/tests/test_athena.py @@ -99,12 +99,12 @@ def test_create_schema(mock_client): assert mock_clientobj.create_database.called -def test_dedicated_schema_namespacing(): - manifest = study_manifest.StudyManifest() - manifest._study_config = manifest._study_config = { - "study_prefix": "foo", - "advanced_options": {"dedicated_schema": "foo"}, - } +def test_dedicated_schema_namespacing(tmp_path): + with open(f"{tmp_path}/manifest.toml", "w", encoding="utf8") as f: + f.write('study_prefix="foo"\n') + f.write("[advanced_options]\n") + f.write('dedicated_schema="foo"\n') + manifest = study_manifest.StudyManifest(tmp_path) query = "CREATE TABLE foo__bar" result = base_utils.update_query_if_schema_specified(query, manifest) assert result == "CREATE TABLE foo.bar" diff --git a/tests/test_data/study_dynamic_prefix/gen_prefix.py b/tests/test_data/study_dynamic_prefix/gen_prefix.py new file mode 100644 index 00000000..5cbb8470 --- /dev/null +++ b/tests/test_data/study_dynamic_prefix/gen_prefix.py @@ -0,0 +1,7 @@ +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--prefix", default="dynamic") +args, _rest = parser.parse_known_args() + +print(args.prefix) diff --git a/tests/test_data/study_dynamic_prefix/manifest.toml b/tests/test_data/study_dynamic_prefix/manifest.toml new file mode 100644 index 00000000..551f8cef --- /dev/null +++ b/tests/test_data/study_dynamic_prefix/manifest.toml @@ -0,0 +1,11 @@ +dynamic_study_prefix = "gen_prefix.py" + +[table_builder_config] +file_names = [ + "meta.py", +] + +[export_config] +export_list = [ + "meta_version", +] diff --git a/tests/test_data/study_dynamic_prefix/meta.py b/tests/test_data/study_dynamic_prefix/meta.py new file mode 100644 index 00000000..64afba6c --- /dev/null +++ b/tests/test_data/study_dynamic_prefix/meta.py @@ -0,0 +1,11 @@ +"""Sets study metadata""" + +import cumulus_library + + +class DynamicMeta(cumulus_library.BaseTableBuilder): + def prepare_queries(self, *args, manifest: cumulus_library.StudyManifest, **kwargs): + prefix = manifest.get_study_prefix() + self.queries.append( + f"CREATE TABLE {prefix}__meta_version AS SELECT 1 AS data_package_version;" + ) diff --git a/tests/test_dynamic_manifest.py b/tests/test_dynamic_manifest.py new file mode 100644 index 00000000..f4ccff86 --- /dev/null +++ b/tests/test_dynamic_manifest.py @@ -0,0 +1,93 @@ +"""tests for using dynamic values in study manifests""" + +import builtins +import os +import pathlib +import shutil +from contextlib import nullcontext as does_not_raise +from pathlib import Path +from unittest import mock + +import duckdb +import pytest + +from cumulus_library import cli, errors, study_manifest +from tests.conftest import duckdb_args + +STUDY_ARGS = [ + f"--study-dir={Path(__file__).parent}/test_data/study_dynamic_prefix", + "--target=dynamic", +] + + +@pytest.mark.parametrize( + "options,result", + [ + ({"prefix": "my_study"}, "my_study"), # simply happy path + ({"prefix": "STUDY"}, "study"), # lowercased + ({"prefix": ""}, pytest.raises(errors.StudyManifestParsingError)), # empty prefix + ({"prefix": "..."}, pytest.raises(errors.StudyManifestParsingError)), # bad prefix + ({"prefix": "123"}, pytest.raises(errors.StudyManifestParsingError)), # bad prefix + ], +) +def test_manifest_with_dynamic_prefix(options, result): + if isinstance(result, str): + raises = does_not_raise() + else: + raises = result + with raises: + path = pathlib.Path("tests/test_data/study_dynamic_prefix") + manifest = study_manifest.StudyManifest(path, options=options) + assert manifest.get_study_prefix() == result + + +@mock.patch("sys.executable", new=None) +def test_manifest_with_dynamic_prefix_and_no_executable(): + """sys.executable must be valid for us to run a Python script""" + with pytest.raises(RuntimeError): + study_manifest.StudyManifest(pathlib.Path("tests/test_data/study_dynamic_prefix")) + + +def test_cli_clean_with_dynamic_prefix(tmp_path): + cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:dynamic2"], tmp_path)) + cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS], tmp_path)) + + # Confirm that both prefixes got built + db = duckdb.connect(f"{tmp_path}/duck.db") + tables = {row[0] for row in db.cursor().execute("show tables").fetchall()} + assert "dynamic__meta_version" in tables + assert "dynamic2__meta_version" in tables + + # Clean 2nd table + with mock.patch.object(builtins, "input", lambda _: "y"): + cli.main(cli_args=duckdb_args(["clean", *STUDY_ARGS, "--option=prefix:dynamic2"], tmp_path)) + + db = duckdb.connect(f"{tmp_path}/duck.db") + tables = {row[0] for row in db.cursor().execute("show tables").fetchall()} + assert "dynamic__meta_version" in tables + assert "dynamic2__meta_version" not in tables + + +def test_cli_export_with_dynamic_prefix(tmp_path): + cli.main(cli_args=duckdb_args(["build", *STUDY_ARGS, "--option=prefix:abc"], tmp_path)) + cli.main(cli_args=duckdb_args(["export", *STUDY_ARGS, "--option=prefix:abc"], tmp_path)) + assert set(os.listdir(f"{tmp_path}/export")) == {"abc"} + assert set(os.listdir(f"{tmp_path}/export/abc")) == { + "abc__meta_version.csv", + "abc__meta_version.parquet", + } + + +def test_cli_generate_sql_with_dynamic_prefix(tmp_path): + shutil.copytree( + pathlib.Path(__file__).parent / "test_data/study_dynamic_prefix", + tmp_path, + dirs_exist_ok=True, + ) + study_args = [f"--study-dir={tmp_path}", "--target=dynamic"] + cli.main(cli_args=duckdb_args(["generate-sql", *study_args, "--option=prefix:abc"], tmp_path)) + assert set(os.listdir(f"{tmp_path}/reference_sql")) == {"meta.sql"} + + with open(f"{tmp_path}/reference_sql/meta.sql", encoding="utf8") as f: + sql = f.readlines() + assert sql[-1] == "CREATE TABLE abc__meta_version AS SELECT 1 AS data_package_version;\n" diff --git a/tests/testbed_utils.py b/tests/testbed_utils.py index 2aeec65b..eb231d82 100644 --- a/tests/testbed_utils.py +++ b/tests/testbed_utils.py @@ -245,5 +245,6 @@ def build(self, study="core") -> duckdb.DuckDBPyConnection: builder = cli.StudyRunner(config, data_path=str(self.path)) builder.clean_and_build_study( Path(__file__).parent.parent / "cumulus_library/studies" / study, + options={}, ) return duckdb.connect(db_file) diff --git a/tests/valueset/test_umls.py b/tests/valueset/test_umls.py index 7187d263..c4d1485c 100644 --- a/tests/valueset/test_umls.py +++ b/tests/valueset/test_umls.py @@ -12,10 +12,11 @@ os.environ, clear=True, ) -def test_umls_lookup(mock_db_config_rxnorm, prefix): +def test_umls_lookup(mock_db_config_rxnorm, prefix, tmp_path): + with open(f"{tmp_path}/manifest.toml", "w", encoding="utf8") as f: + f.write('study_prefix="test"') cursor = mock_db_config_rxnorm.db.cursor() - manifest = StudyManifest() - manifest._study_config = {"study_prefix": "test"} + manifest = StudyManifest(tmp_path) mock_db_config_rxnorm.options = {"umls_stewards": "medrt"} valueset_config = valueset_utils.ValuesetConfig( umls_stewards={"medrt": {"sab": "MED-RT", "search_terms": ["Opioid"]}}