Allow tasks to depend on other tasks. (#493)

tobiasraabe · web-flow · commit 3c912ef20089 · 2023-11-16T18:02:33.000+01:00
diff --git a/docs/source/changes.md b/docs/source/changes.md
@@ -18,6 +18,7 @@ releases are available on [PyPI](https://pypi.org/project/pytask) and
   when a product annotation is used with the argument name `produces`. And, allow
   `produces` to intake any node.
 - {pull}`490` refactors and better tests parsing of dependencies.
+- {pull}`493` allows tasks to depend on other tasks.
 - {pull}`496` makes pytask even lazier. Now, when a task produces a node whose hash
   remains the same, the consecutive tasks are not executed. It remained from when pytask
   relied on timestamps.
diff --git a/docs/source/tutorials/defining_dependencies_products.md b/docs/source/tutorials/defining_dependencies_products.md
@@ -410,6 +410,38 @@ def task_fit_model(depends_on, produces):
 :::
 ::::
 
+## Depending on a task
+
+In some situations you want to define a task depending on another task without
+specifying the relationship explicitly.
+
+pytask allows you to do that, but you lose features like access to paths, which is why
+defining dependencies explicitly is always preferred.
+
+There are two modes for it and both use {func}`@task(after=...) <pytask.task>`.
+
+First, you can pass the task function or multiple task functions to the decorator.
+Applied to the tasks from before, we could have written `task_plot_data` as
+
+```python
+@task(after=task_create_random_data)
+def task_plot_data(...):
+    ...
+```
+
+You can also pass a list of task functions.
+
+The second mode is to pass an expression, a substring of the name of the tasks that
+should be executed first. Here, we can pass the function name or a significant part of
+the function name.
+
+```python
+@task(after="random_data")
+def task_plot_data(...):
+    ...
+```
+
+You will learn more about expressions in {doc}`selecting_tasks`.
 
 ## References
 
diff --git a/src/_pytask/collect.py b/src/_pytask/collect.py
@@ -254,6 +254,8 @@ def pytask_collect_task(
         )
 
         markers = get_all_marks(obj)
+        collection_id = obj.pytask_meta._id if hasattr(obj, "pytask_meta") else None
+        after = obj.pytask_meta.after if hasattr(obj, "pytask_meta") else []
 
         # Get the underlying function to avoid having different states of the function,
         # e.g. due to pytask_meta, in different layers of the wrapping.
@@ -266,6 +268,7 @@ def pytask_collect_task(
                 depends_on=dependencies,
                 produces=products,
                 markers=markers,
+                attributes={"collection_id": collection_id, "after": after},
             )
         return Task(
             base_name=name,
@@ -274,6 +277,7 @@ def pytask_collect_task(
             depends_on=dependencies,
             produces=products,
             markers=markers,
+            attributes={"collection_id": collection_id, "after": after},
         )
     if isinstance(obj, PTask) and not inspect.isclass(obj):
         return obj
@@ -294,7 +298,7 @@ def pytask_collect_task(
 
 Please, align the names to ensure reproducibility on case-sensitive file systems \
 (often Linux or macOS) or disable this error with 'check_casing_of_paths = false' in \
-your pytask configuration file.
+the pyproject.toml file.
 
 Hint: If parts of the path preceding your project directory are not properly \
 formatted, check whether you need to call `.resolve()` on `SRC`, `BLD` or other paths \
diff --git a/src/_pytask/dag.py b/src/_pytask/dag.py
@@ -15,6 +15,7 @@
 from _pytask.console import render_to_string
 from _pytask.console import TASK_ICON
 from _pytask.exceptions import ResolvingDependenciesError
+from _pytask.mark import select_by_after_keyword
 from _pytask.node_protocols import PNode
 from _pytask.node_protocols import PTask
 from _pytask.nodes import PythonNode
@@ -93,6 +94,30 @@ def _add_product(dag: nx.DiGraph, task: PTask, node: PNode) -> None:
     return dag
 
 
+@hookimpl
+def pytask_dag_modify_dag(session: Session, dag: nx.DiGraph) -> None:
+    """Create dependencies between tasks when using ``@task(after=...)``.
+
+    For every task with an ``after`` attribute, an edge is added from each successor
+    node of every preceding task to the task itself. Preceding tasks are looked up
+    either by their collection ids (list) or by a keyword expression (string).
+
+    """
+    tasks_by_collection_id = {}
+    for candidate in session.tasks:
+        if "collection_id" in candidate.attributes:
+            tasks_by_collection_id[candidate.attributes["collection_id"]] = candidate
+
+    for task in session.tasks:
+        after = task.attributes.get("after")
+        if isinstance(after, list):
+            # Resolved lazily so that an unknown id raises only when it is reached.
+            preceding = (tasks_by_collection_id[id_].signature for id_ in after)
+        elif isinstance(after, str):
+            preceding = select_by_after_keyword(session, after)
+            # The task itself may match the expression; it cannot precede itself.
+            preceding.discard(task.signature)
+        else:
+            continue
+
+        for signature in preceding:
+            for successor in dag.successors(signature):
+                dag.add_edge(successor, task.signature)
+
+
 def _check_if_dag_has_cycles(dag: nx.DiGraph) -> None:
     """Check if DAG has cycles."""
     try:
diff --git a/src/_pytask/mark/__init__.py b/src/_pytask/mark/__init__.py
@@ -39,6 +39,7 @@
     "MarkDecorator",
     "MarkGenerator",
     "ParseError",
+    "select_by_after_keyword",
     "select_by_keyword",
     "select_by_mark",
 ]
@@ -168,6 +169,22 @@ def select_by_keyword(session: Session, dag: nx.DiGraph) -> set[str]:
     return remaining
 
 
+def select_by_after_keyword(session: Session, after: str) -> set[str]:
+    """Collect the signatures of all tasks matched by an ``after`` expression.
+
+    The expression is compiled once and evaluated against a ``KeywordMatcher`` built
+    from every task in the session. An empty expression selects no task.
+
+    Raises
+    ------
+    ValueError
+        If the expression cannot be parsed.
+
+    """
+    try:
+        compiled = Expression.compile_(after)
+    except ParseError as e:
+        msg = f"Wrong expression passed to 'after': {after}: {e}"
+        raise ValueError(msg) from None
+
+    if not after:
+        return set()
+
+    return {
+        task.signature
+        for task in session.tasks
+        if compiled.evaluate(KeywordMatcher.from_task(task))
+    }
+
+
 @define(slots=True)
 class MarkMatcher:
     """A matcher for markers which are present.
diff --git a/src/_pytask/mark/__init__.pyi b/src/_pytask/mark/__init__.pyi
@@ -10,6 +10,7 @@ from _pytask.tree_util import PyTree
 from _pytask.session import Session
 import networkx as nx
 
+def select_by_after_keyword(session: Session, after: str) -> set[str]: ...
 def select_by_keyword(session: Session, dag: nx.DiGraph) -> set[str]: ...
 def select_by_mark(session: Session, dag: nx.DiGraph) -> set[str]: ...
 
@@ -54,4 +55,5 @@ __all__ = [
     "ParseError",
     "select_by_keyword",
     "select_by_mark",
+    "select_by_after_keyword",
 ]
diff --git a/src/_pytask/models.py b/src/_pytask/models.py
@@ -2,8 +2,11 @@
 from __future__ import annotations
 
 from typing import Any
+from typing import Callable
 from typing import NamedTuple
 from typing import TYPE_CHECKING
+from uuid import UUID
+from uuid import uuid4
 
 from attrs import define
 from attrs import field
@@ -16,18 +19,39 @@
 
 @define
 class CollectionMetadata:
-    """A class for carrying metadata from functions to tasks."""
-
+    """A class for carrying metadata from functions to tasks.
+
+    Attributes
+    ----------
+    after
+        An expression or a task function or a list of task functions that need to be
+        executed before this task can. Task functions passed to ``@task(after=...)``
+        are stored as the ``_id`` of their metadata.
+    id_
+        An id for the task if it is part of a parametrization. Otherwise, an automatic
+        id will be generated. See
+        :doc:`this tutorial <../tutorials/repeating_tasks_with_different_inputs>` for
+        more information.
+    kwargs
+        A dictionary containing keyword arguments which are passed to the task when it
+        is executed.
+    markers
+        A list of markers that are attached to the task.
+    name
+        Use it to override the name of the task that is, by default, the name of the
+        callable.
+    produces
+        Definition of products to parse the function returns and store them. See
+        :doc:`this how-to guide <../how_to_guides/using_task_returns>` for more
+        information.
+    _id
+        A UUID created with :func:`uuid.uuid4` for every instance. It identifies the
+        decorated function when other tasks refer to it via ``after``.
+    """
+
+    after: str | list[Callable[..., Any]] = field(factory=list)
     id_: str | None = None
-    """The id for a single parametrization."""
     kwargs: dict[str, Any] = field(factory=dict)
-    """Contains kwargs which are necessary for the task function on execution."""
     markers: list[Mark] = field(factory=list)
-    """Contains the markers of the function."""
     name: str | None = None
-    """The name of the task function."""
     produces: PyTree[Any] | None = None
-    """Definition of products to handle returns."""
+    _id: UUID = field(factory=uuid4)
 
 
 class NodeInfo(NamedTuple):
diff --git a/src/_pytask/task_utils.py b/src/_pytask/task_utils.py
@@ -39,6 +39,7 @@
 def task(
     name: str | None = None,
     *,
+    after: str | Callable[..., Any] | list[Callable[..., Any]] | None = None,
     id: str | None = None,  # noqa: A002
     kwargs: dict[Any, Any] | None = None,
     produces: PyTree[Any] | None = None,
@@ -55,6 +56,9 @@ def task(
     name
         Use it to override the name of the task that is, by default, the name of the
         callable.
+    after
+        An expression or a task function or a list of task functions that need to be
+        executed before this task can.
     id
         An id for the task if it is part of a parametrization. Otherwise, an automatic
         id will be generated. See
@@ -102,20 +106,23 @@ def wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
 
         parsed_kwargs = {} if kwargs is None else kwargs
         parsed_name = name if isinstance(name, str) else func.__name__
+        parsed_after = _parse_after(after)
 
         if hasattr(unwrapped, "pytask_meta"):
             unwrapped.pytask_meta.name = parsed_name
             unwrapped.pytask_meta.kwargs = parsed_kwargs
             unwrapped.pytask_meta.markers.append(Mark("task", (), {}))
             unwrapped.pytask_meta.id_ = id
             unwrapped.pytask_meta.produces = produces
+            unwrapped.pytask_meta.after = parsed_after
         else:
             unwrapped.pytask_meta = CollectionMetadata(
                 name=parsed_name,
                 kwargs=parsed_kwargs,
                 markers=[Mark("task", (), {})],
                 id_=id,
                 produces=produces,
+                after=parsed_after,
             )
 
         # Store it in the global variable ``COLLECTED_TASKS`` to avoid garbage
@@ -131,6 +138,30 @@ def wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
     return wrapper
 
 
+def _parse_after(
+    after: str | Callable[..., Any] | list[Callable[..., Any]] | None
+) -> str | list[Callable[..., Any]]:
+    """Normalize the ``after`` argument of :func:`task`.
+
+    Parameters
+    ----------
+    after
+        An expression string, a single task function, a list of task functions, or a
+        falsy value when no ordering is requested.
+
+    Returns
+    -------
+    str | list
+        The unchanged expression string, or a list with the ``_id`` of the
+        ``pytask_meta`` of every referenced function. The list is empty when ``after``
+        is falsy.
+
+    Raises
+    ------
+    TypeError
+        If ``after`` is neither a string, a callable, nor a list.
+
+    """
+    if not after:
+        return []
+    if isinstance(after, str):
+        return after
+
+    # A single function is handled like a list with one element.
+    funcs = [after] if callable(after) else after
+    if isinstance(funcs, list):
+        new_after = []
+        for func in funcs:
+            # Functions without metadata receive a fresh instance so that they own an
+            # id which can be referenced.
+            if not hasattr(func, "pytask_meta"):
+                func.pytask_meta = CollectionMetadata()  # type: ignore[attr-defined]
+            new_after.append(func.pytask_meta._id)  # type: ignore[attr-defined]
+        # Previously the collected ids were dropped and a TypeError was raised.
+        return new_after
+
+    msg = (
+        "'after' should be an expression string, a task, or a list of tasks. Got "
+        f"{after}, instead."
+    )
+    raise TypeError(msg)
+
+
 def parse_collected_tasks_with_task_marker(
     tasks: list[Callable[..., Any]],
 ) -> dict[str, Callable[..., Any]]:
diff --git a/tests/test_task.py b/tests/test_task.py
@@ -615,3 +615,50 @@ def func(path: Annotated[Path, Product]):
     assert result.exit_code == ExitCode.COLLECTION_FAILED
     assert "Duplicated tasks" in result.output
     assert "id=b.txt" in result.output
+
+
+def test_task_will_be_executed_after_another_one_with_string(runner, tmp_path):
+    """Run a task after another one selected by an expression passed to ``after``."""
+    task_module = """
+    from pytask import task
+    from pathlib import Path
+    from typing_extensions import Annotated
+
+    @task(after="task_first")
+    def task_second():
+        assert Path(__file__).parent.joinpath("out.txt").exists()
+
+    def task_first() -> Annotated[str, Path("out.txt")]:
+        return "Hello, World!"
+    """
+    module_path = tmp_path / "task_example.py"
+    module_path.write_text(textwrap.dedent(task_module))
+
+    first_run = runner.invoke(cli, [tmp_path.as_posix()])
+    assert first_run.exit_code == ExitCode.OK
+    assert "2  Succeeded" in first_run.output
+
+    # Make sure that the dependence does not only apply to the task (and task module),
+    # but also to its products.
+    (tmp_path / "out.txt").write_text("Hello, Moon!")
+    second_run = runner.invoke(cli, [tmp_path.as_posix()])
+    assert second_run.exit_code == ExitCode.OK
+    assert "1  Succeeded" in second_run.output
+    assert "1  Skipped because unchanged" in second_run.output
+
+
+def test_task_will_be_executed_after_another_one_with_function(tmp_path):
+    """Build a project in which ``after`` references the preceding task function."""
+    task_module = """
+    from pytask import task
+    from pathlib import Path
+    from typing_extensions import Annotated
+
+    def task_first() -> Annotated[str, Path("out.txt")]:
+        return "Hello, World!"
+
+    @task(after=task_first)
+    def task_second():
+        assert Path(__file__).parent.joinpath("out.txt").exists()
+    """
+    module_path = tmp_path / "task_example.py"
+    module_path.write_text(textwrap.dedent(task_module))
+
+    session = build(paths=tmp_path)
+    assert session.exit_code == ExitCode.OK