
Commit 319eaf1

mesejo and dlovell authored

chore(docs): update quickstart.qmd (#1299)

Additionally, this switches on the execution of docstrings, since the ibis examples are working again. Closes #1298.

Co-authored-by: dlovell <[email protected]>

Parent: 82d78e1

4 files changed: +121 −14 lines changed

docs/_renderer.py

Lines changed: 9 additions & 1 deletion

```diff
@@ -20,14 +20,15 @@ def render(self, el: qd.ast.ExampleCode) -> str:
         prompt = ">>> "
         continuation = "..."
 
+        skip_doctest = "doctest: +SKIP"
         expect_failure = "quartodoc: +EXPECTED_FAILURE"
         quartodoc_skip_doctest = "quartodoc: +SKIP"
 
         def chunker(line):
             return line.startswith((prompt, continuation))
 
         def should_skip(line):
-            return True
+            return quartodoc_skip_doctest in line or skip_doctest in line
 
         for chunk in toolz.partitionby(chunker, lines):
             first, *rest = chunk
@@ -59,6 +60,13 @@ def should_skip(line):
 
             # if we expect failures, don't fail the notebook execution and
             # render the error message
+            if expect_failure in first or any(
+                expect_failure in line for line in rest
+            ):
+                assert start and end, (
+                    "expected failure should never occur alongside a skipped doctest example"
+                )
+                result.append("#| error: true")
 
             # remove the quartodoc markers from the rendered code
             result.append(
```
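The markers referenced here live inside docstring examples. As an illustration (the function below is hypothetical, not part of this commit), each marker would be used like this:

```python
def add_one(x: int) -> int:
    """Add one to ``x``.

    Examples
    --------
    >>> add_one(1)  # executed and checked by the docs build
    2
    >>> add_one(object())  # doctest: +SKIP
    >>> very_slow_setup()  # quartodoc: +SKIP
    >>> 1 / 0  # quartodoc: +EXPECTED_FAILURE
    Traceback (most recent call last):
    ...
    ZeroDivisionError: division by zero
    """
    return x + 1
```

With the new `should_skip`, only lines carrying `doctest: +SKIP` or `quartodoc: +SKIP` are skipped; everything else now executes, and chunks marked `quartodoc: +EXPECTED_FAILURE` are rendered with `#| error: true` so the expected error appears in the output without failing the notebook execution.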

docs/tutorials/getting_started/quickstart.qmd

Lines changed: 4 additions & 4 deletions

````diff
@@ -67,7 +67,7 @@ xorq run builds/7061dd65ff3c --limit 10
 
 To serve your pipeline as an endpoint, you can use the `xorq serve-unbound` command:
 ```bash
-xorq serve-unbound builds/7061dd65ff3c --host localhost --port 8001 --cache-dir penguins_example b2370a29c19df8e1e639c63252dacd0e
+xorq serve-unbound builds/7061dd65ff3c --host localhost --port 8001 --cache-dir penguins_example 405154f690d20f4adbcc375252628b75
 # This replaces a specific node hash with an exchanger input and serves the unbound expr as do_exchange
 ```
 
@@ -233,19 +233,19 @@ For deployments, you can serve a specific built expression as an endpoint using
 Catalog service:
 
 ```bash
-xorq serve-unbound builds/7061dd65ff3c --host localhost --port 8001 --cache-dir penguins_example b2370a29c19df8e1e639c63252dacd0e
+xorq serve-unbound builds/7061dd65ff3c --host localhost --port 8001 --cache-dir penguins_example 405154f690d20f4adbcc375252628b75
 ```
 
 **Understanding the command:**
 
 - `builds/7061dd65ff3c`: Your built pipeline directory
 - `--host localhost --port 8001`: Server configuration
 - `--cache-dir penguins_example`: Directory for caching results
-- `b2370a29c19df8e1e639c63252dacd0e`: The specific node hash to serve
+- `405154f690d20f4adbcc375252628b75`: The specific node hash to serve
 
 ### Finding the Node Hash
 
-The node hash (like `b2370a29c19df8e1e639c63252dacd0e`) identifies a specific
+The node hash (like `405154f690d20f4adbcc375252628b75`) identifies a specific
 expression node in your pipeline. You can find this hash using:
 
 ```python
````
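Together with the `xorq run` invocation visible in the first hunk header, the quickstart now describes this lifecycle (a sketch assembled from commands shown in this diff; the hashes are specific to the penguins example build):

```bash
# execute the built pipeline once, locally
xorq run builds/7061dd65ff3c --limit 10

# or serve it as a long-running endpoint: the node identified by the
# trailing hash is unbound and exposed via do_exchange
xorq serve-unbound builds/7061dd65ff3c \
  --host localhost --port 8001 \
  --cache-dir penguins_example \
  405154f690d20f4adbcc375252628b75
```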

python/xorq/expr/ml/split_lib.py

Lines changed: 9 additions & 9 deletions

```diff
@@ -63,7 +63,7 @@ def _calculate_bounds(
 
 def calc_split_conditions(
     table: ir.Table,
-    unique_key: str | tuple[str] | list[str],
+    unique_key: str | tuple[str] | list[str] | Selector,
     test_sizes: Iterable[float] | float,
     num_buckets: int = 10000,
     random_seed: int | None = None,
@@ -73,7 +73,7 @@ def calc_split_conditions(
     ----------
     table : ir.Table
         The input Ibis table to be split.
-    unique_key : str | tuple[str] | list[str]
+    unique_key : str | tuple[str] | list[str] | Selector
         The column name(s) that uniquely identify each row in the table. This
         unique_key is used to create a deterministic split of the dataset
         through a hashing process.
@@ -149,7 +149,7 @@ def calc_split_conditions(
 
 def calc_split_column(
     table: ir.Table,
-    unique_key: str | tuple[str] | list[str],
+    unique_key: str | tuple[str] | list[str] | Selector,
     test_sizes: Iterable[float] | float,
     num_buckets: int = 10000,
     random_seed: int | None = None,
@@ -160,7 +160,7 @@ def calc_split_column(
     ----------
     table : ir.Table
         The input Ibis table to be split.
-    unique_key : str | tuple[str] | list[str]
+    unique_key : str | tuple[str] | list[str] | Selector
         The column name(s) that uniquely identify each row in the table. This
         unique_key is used to create a deterministic split of the dataset
         through a hashing process.
@@ -235,15 +235,15 @@ def train_test_splits(
     ----------
     table : ir.Table
         The input Ibis table to be split.
-    unique_key : str | tuple[str] | list[str]
-        The column name(s) that uniquely identify each row in the table. This
-        unique_key is used to create a deterministic split of the dataset
-        through a hashing process.
     test_sizes : Iterable[float] | float
         An iterable of floats representing the desired proportions for data splits.
         Each value should be between 0 and 1, and their sum must equal 1. The
         order of test sizes determines the order of the generated subsets. If a float is passed,
         it is assumed to be the test size, and a traditional train-test split of (1 - test_size, test_size) is returned.
+    unique_key : str | tuple[str] | list[str] | Selector, optional
+        The column name(s) that uniquely identify each row in the table. This
+        unique_key is used to create a deterministic split of the dataset
+        through a hashing process.
     num_buckets : int, optional
         The number of buckets into which the data can be binned after being
         hashed (default is 10000). It controls how finely the data is divided
@@ -273,7 +273,7 @@ def train_test_splits(
     >>> table = xo.memtable({"key": range(100), "value": range(100,200)})
    >>> unique_key = "key"
     >>> test_sizes = [0.2, 0.3, 0.5]
-    >>> splits = xo.train_test_splits(table, unique_key, test_sizes, num_buckets=10, random_seed=42)
+    >>> splits = xo.train_test_splits(table, test_sizes, unique_key=unique_key, num_buckets=10, random_seed=42)
     >>> for i, split_table in enumerate(splits):
     ...     print(f"Split {i+1} size: {split_table.count().execute()}")
     ...     print(split_table.execute())
```
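The "deterministic split ... through a hashing process" that these docstrings describe can be pictured in plain Python. The following is an illustrative toy version of the idea (hash each unique key into one of `num_buckets` buckets, then cut the bucket range at the cumulative `test_sizes` boundaries), not xorq's actual implementation:

```python
import hashlib
from itertools import accumulate


def bucket_of(key: str, num_buckets: int = 10000, random_seed: int | None = None) -> int:
    # hash the unique key (salted with the seed) into a stable bucket id
    digest = hashlib.md5(f"{key}-{random_seed}".encode()).hexdigest()
    return int(digest, 16) % num_buckets


def split_index(key: str, test_sizes: list[float], num_buckets: int = 10000) -> int:
    # cumulative proportions mark the bucket boundaries between splits
    bounds = list(accumulate(round(p * num_buckets) for p in test_sizes))
    b = bucket_of(key, num_buckets)
    return next(i for i, upper in enumerate(bounds) if b < upper)


# rows with the same key always land in the same split, run after run
sizes = [0.2, 0.3, 0.5]
counts = [0, 0, 0]
for key in map(str, range(1000)):
    counts[split_index(key, sizes)] += 1
print(counts)  # roughly proportional to sizes
```

Note also the updated docstring example: because `unique_key` is now optional (and accepts a `Selector`), `train_test_splits` is called with `test_sizes` positionally and `unique_key` passed by keyword.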

python/xorq/tests/test_cli.py

Lines changed: 99 additions & 0 deletions

```diff
@@ -4,6 +4,7 @@
 from itertools import chain
 from pathlib import Path
 
+import pandas as pd
 import pytest
 
 import xorq.api as xo
@@ -666,3 +667,101 @@ def test_serve_unbound_tag_get_exchange_udf(fixture_dir, tmp_path):
     assert not actual.empty
 
     serve_popened.popen.terminate()
+
+
+@pytest.mark.slow(level=1)
+def test_serve_penguins_template(tmpdir, tmp_path):
+    tmpdir = Path(tmpdir)
+    path = tmpdir.joinpath("xorq-template-penguins")
+    init_args = (
+        "xorq",
+        "init",
+        "--path",
+        str(path),
+        "--template",
+        "penguins",
+    )
+
+    (returncode, stdout, stderr) = subprocess_run(init_args)
+
+    assert returncode == 0, stderr
+    assert path.exists()
+    assert path.joinpath("pyproject.toml").exists()
+    assert path.joinpath("requirements.txt").exists()
+
+    target_dir = tmp_path / "build"
+    build_args = [
+        "xorq",
+        "build",
+        str(path / "expr.py"),
+        "--builds-dir",
+        str(target_dir),
+    ]
+    (returncode, stdout, stderr) = subprocess_run(build_args)
+
+    assert "Building expr" in stderr.decode("ascii")
+    assert returncode == 0, stderr
+
+    if match := re.search(f"{target_dir}/([0-9a-f]+)", stdout.decode("ascii")):
+        serve_hash = "405154f690d20f4adbcc375252628b75"
+
+        serve_args = (
+            "xorq",
+            "serve-unbound",
+            str(target_dir / match.group()),
+            "--to_unbind_hash",
+            serve_hash,
+        )
+        serve_popened = Popened(serve_args, deferred=False)
+        port = peek_port(serve_popened)
+
+        # Create sample penguin data using memtable instead of reading from URL
+        sample_data = pd.DataFrame(
+            {
+                "bill_length_mm": [
+                    39.1,
+                    39.5,
+                    40.3,
+                    36.7,
+                    39.3,
+                    38.9,
+                    39.2,
+                    34.1,
+                    42.0,
+                    37.8,
+                ],
+                "bill_depth_mm": [
+                    18.7,
+                    17.4,
+                    18.0,
+                    19.3,
+                    20.6,
+                    17.8,
+                    19.6,
+                    18.1,
+                    20.2,
+                    17.1,
+                ],
+                "species": [
+                    "Adelie",
+                    "Adelie",
+                    "Adelie",
+                    "Adelie",
+                    "Adelie",
+                    "Chinstrap",
+                    "Chinstrap",
+                    "Chinstrap",
+                    "Gentoo",
+                    "Gentoo",
+                ],
+            }
+        )
+
+        expr = xo.memtable(sample_data, name="penguins")
+
+        actual = hit_server(port=port, expr=expr)
+        assert not actual.empty
+        assert actual["predicted"].isin(("Adelie", "Chinstrap", "Gentoo")).all()
+        assert len(actual) == len(sample_data)
+    else:
+        raise AssertionError("No expression hash")
```
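The test exercises the same flow a user runs by hand. A sketch of the equivalent shell session, assembled from the arguments above (`<build-hash>` stands in for the directory name that `xorq build` prints):

```bash
# scaffold the penguins template project
xorq init --path xorq-template-penguins --template penguins

# build the expression; prints the build directory, e.g. builds/<build-hash>
xorq build xorq-template-penguins/expr.py --builds-dir builds

# serve it, unbinding the node with the known hash
xorq serve-unbound builds/<build-hash> --to_unbind_hash 405154f690d20f4adbcc375252628b75
```

The server then accepts an unbound input (here, a ten-row `penguins` memtable) and returns one `predicted` species per input row, which is exactly what the test's final assertions check.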
