
Commit 7809ca5

avoid circular import
1 parent: f635528

File tree: 3 files changed, +87 -34 lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import dataclasses
2+
from typing import Any, Mapping
3+
import pandas as pd
4+
5+
@dataclasses.dataclass(frozen=True)
6+
class _ColumnProperties:
7+
"""Expected/required properties of a column in the job manager related dataframes"""
8+
9+
dtype: str = "object"
10+
default: Any = None
11+
12+
# Expected columns in the job DB dataframes.
13+
# TODO: make this part of public API when settled?
14+
# TODO: move non official statuses to seperate column (not_started, queued_for_start)
15+
_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = {
16+
"id": _ColumnProperties(dtype="str"),
17+
"backend_name": _ColumnProperties(dtype="str"),
18+
"status": _ColumnProperties(dtype="str", default="not_started"),
19+
# TODO: use proper date/time dtype instead of legacy str for start times?
20+
"start_time": _ColumnProperties(dtype="str"),
21+
"running_start_time": _ColumnProperties(dtype="str"),
22+
# TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
23+
# but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
24+
# Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
25+
"cpu": _ColumnProperties(dtype="str"),
26+
"memory": _ColumnProperties(dtype="str"),
27+
"duration": _ColumnProperties(dtype="str"),
28+
"costs": _ColumnProperties(dtype="float64"),
29+
}
30+
31+
def _normalize(df: pd.DataFrame) -> pd.DataFrame:
32+
"""
33+
Normalize given pandas dataframe (creating a new one):
34+
ensure we have the required columns.
35+
36+
:param df: The dataframe to normalize.
37+
:return: a new dataframe that is normalized.
38+
"""
39+
new_columns = {col: req.default for (col, req) in _COLUMN_REQUIREMENTS.items() if col not in df.columns}
40+
df = df.assign(**new_columns)
41+
42+
return df
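This new module gives the job dataframe schema a single home. A minimal usage sketch of what _normalize does to a user-supplied dataframe; the input data is made up for illustration, and the import path follows the new module introduced by this commit:

    import pandas as pd
    from openeo.extra.job_management._df_schema import _normalize

    # Hypothetical input: a dataframe that only has an "id" column.
    df = pd.DataFrame({"id": ["job-1", "job-2"]})

    # _normalize adds each missing required column with its default:
    # "status" gets "not_started", the other columns get None.
    normalized = _normalize(df)
    print(list(normalized.columns))
    # ['id', 'backend_name', 'status', 'start_time', 'running_start_time',
    #  'cpu', 'memory', 'duration', 'costs']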

openeo/extra/job_management/_job_db.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pandas as pd
88

99
from openeo.extra.job_management._interface import JobDatabaseInterface
10-
from openeo.extra.job_management._manager import MultiBackendJobManager
10+
from openeo.extra.job_management._df_schema import _normalize, _COLUMN_REQUIREMENTS
1111

1212
_log = logging.getLogger(__name__)
1313

@@ -40,7 +40,7 @@ def initialize_from_df(self, df: pd.DataFrame, *, on_exists: str = "error"):
4040
else:
4141
# TODO handle other on_exists modes: e.g. overwrite, merge, ...
4242
raise ValueError(f"Invalid on_exists={on_exists!r}")
43-
df = MultiBackendJobManager._normalize_df(df)
43+
df = _normalize(df)
4444
self.persist(df)
4545
# Return self to allow chaining with constructor.
4646
return self
@@ -133,7 +133,7 @@ def read(self) -> pd.DataFrame:
133133
df = pd.read_csv(
134134
self.path,
135135
# TODO: possible to avoid hidden coupling with MultiBackendJobManager here?
136-
dtype={c: r.dtype for (c, r) in MultiBackendJobManager._COLUMN_REQUIREMENTS.items()},
136+
dtype={c: r.dtype for (c, r) in _COLUMN_REQUIREMENTS.items()},
137137
)
138138
if (
139139
"geometry" in df.columns
@@ -203,3 +203,42 @@ def persist(self, df: pd.DataFrame):
         self.df.to_parquet(self.path, index=False)


+def get_job_db(path: Union[str, Path]) -> JobDatabaseInterface:
+    """
+    Factory to get a job database at a given path,
+    guessing the database type from filename extension.
+
+    :param path: path to job database file.
+
+    .. versionadded:: 0.33.0
+    """
+    path = Path(path)
+    if path.suffix.lower() in {".csv"}:
+        job_db = CsvJobDatabase(path=path)
+    elif path.suffix.lower() in {".parquet", ".geoparquet"}:
+        job_db = ParquetJobDatabase(path=path)
+    else:
+        raise ValueError(f"Could not guess job database type from {path!r}")
+    return job_db
+
+
+def create_job_db(path: Union[str, Path], df: pd.DataFrame, *, on_exists: str = "error"):
+    """
+    Factory to create a job database at given path,
+    initialized from a given dataframe,
+    and its database type guessed from filename extension.
+
+    :param path: Path to the job database file.
+    :param df: DataFrame to store in the job database.
+    :param on_exists: What to do when the job database already exists:
+        - "error": (default) raise an exception
+        - "skip": work with existing database, ignore given dataframe and skip any initialization
+
+    .. versionadded:: 0.33.0
+    """
+    job_db = get_job_db(path)
+    if isinstance(job_db, FullDataFrameJobDatabase):
+        job_db.initialize_from_df(df=df, on_exists=on_exists)
+    else:
+        raise NotImplementedError(f"Initialization of {type(job_db)} is not supported.")
+    return job_db
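A short sketch of how these two factories might be used together; the file name and dataframe columns are made up for illustration:

    import pandas as pd
    from openeo.extra.job_management._job_db import create_job_db, get_job_db

    # Hypothetical dataframe of jobs to run, one row per job.
    df = pd.DataFrame({"year": [2021, 2022, 2023]})

    # ".csv" dispatches to CsvJobDatabase; ".parquet"/".geoparquet"
    # would dispatch to ParquetJobDatabase.
    job_db = create_job_db("jobs.csv", df=df, on_exists="error")

    # Reopening later: get_job_db only guesses the type from the
    # extension, without (re)initializing the database.
    job_db = get_job_db("jobs.csv")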

openeo/extra/job_management/_manager.py

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
Callable,
1414
Dict,
1515
List,
16-
Mapping,
1716
NamedTuple,
1817
Optional,
1918
Tuple,
@@ -31,7 +30,8 @@
3130
_JobStartTask,
3231
)
3332
from openeo.extra.job_management._interface import JobDatabaseInterface
34-
#from openeo.extra.job_management._job_db import get_job_db #TODO circular import
33+
from openeo.extra.job_management._job_db import get_job_db
34+
from openeo.extra.job_management._df_schema import _normalize
3535

3636
from openeo.rest import OpenEoApiError
3737
from openeo.rest.auth.auth import BearerAuth
@@ -55,13 +55,6 @@ class _Backend(NamedTuple):
5555
# Maximum number of jobs to allow in parallel on a backend
5656
parallel_jobs: int
5757

58-
@dataclasses.dataclass(frozen=True)
59-
class _ColumnProperties:
60-
"""Expected/required properties of a column in the job manager related dataframes"""
61-
62-
dtype: str = "object"
63-
default: Any = None
64-
6558

6659

6760

@@ -132,24 +125,6 @@ def start_job(
132125
Added ``cancel_running_job_after`` parameter.
133126
"""
134127

135-
# Expected columns in the job DB dataframes.
136-
# TODO: make this part of public API when settled?
137-
# TODO: move non official statuses to seperate column (not_started, queued_for_start)
138-
_COLUMN_REQUIREMENTS: Mapping[str, _ColumnProperties] = {
139-
"id": _ColumnProperties(dtype="str"),
140-
"backend_name": _ColumnProperties(dtype="str"),
141-
"status": _ColumnProperties(dtype="str", default="not_started"),
142-
# TODO: use proper date/time dtype instead of legacy str for start times?
143-
"start_time": _ColumnProperties(dtype="str"),
144-
"running_start_time": _ColumnProperties(dtype="str"),
145-
# TODO: these columns "cpu", "memory", "duration" are not referenced explicitly from MultiBackendJobManager,
146-
# but are indirectly coupled through handling of VITO-specific "usage" metadata in `_track_statuses`.
147-
# Since bfd99e34 they are not really required to be present anymore, can we make that more explicit?
148-
"cpu": _ColumnProperties(dtype="str"),
149-
"memory": _ColumnProperties(dtype="str"),
150-
"duration": _ColumnProperties(dtype="str"),
151-
"costs": _ColumnProperties(dtype="float64"),
152-
}
153128

154129
def __init__(
155130
self,
@@ -259,10 +234,7 @@ def _normalize_df(cls, df: pd.DataFrame) -> pd.DataFrame:
259234
:param df: The dataframe to normalize.
260235
:return: a new dataframe that is normalized.
261236
"""
262-
new_columns = {col: req.default for (col, req) in cls._COLUMN_REQUIREMENTS.items() if col not in df.columns}
263-
df = df.assign(**new_columns)
264-
265-
return df
237+
return _normalize(df)
266238

267239
def start_job_thread(self, start_job: Callable[[], BatchJob], job_db: JobDatabaseInterface):
268240
"""
