Skip to content

Commit d055c12

Browse files
committed
Convert an Xarray Dataset partition to a pyarrow record batch directly (without going through pandas).
1 parent 43a8aac commit d055c12

File tree

3 files changed

+125
-14
lines changed

3 files changed

+125
-14
lines changed

xarray_sql/df.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import itertools
2-
import warnings
32
from collections.abc import Callable, Hashable, Iterator, Mapping
3+
from typing import Any
44

55
import numpy as np
66
import pandas as pd
77
import pyarrow as pa
88
import xarray as xr
9-
from datafusion.context import ArrowStreamExportable
109

1110
Block = dict[Hashable, slice]
1211
Chunks = dict[str, int] | None
@@ -155,6 +154,61 @@ def pivot(ds: xr.Dataset) -> pd.DataFrame:
155154
return ds.to_dataframe().reset_index() # type: ignore[no-any-return]
156155

157156

157+
def dataset_to_record_batch(
    ds: xr.Dataset, schema: pa.Schema
) -> pa.RecordBatch:
  """Build an Arrow RecordBatch for one xarray Dataset partition.

  Skips the pandas round-trip (to_dataframe → reset_index → from_pandas)
  that pivot() performs and constructs the batch straight from numpy
  arrays, cutting peak memory for a large partition from roughly 5x to
  roughly 2x the partition size.

  Each dimension coordinate is broadcast to the full partition shape and
  flattened: np.broadcast_to() itself is zero-copy, but the subsequent
  ravel() must copy once per coordinate because the broadcast view is not
  contiguous. Data variables are flattened in place — a zero-copy view
  whenever the backing array is already C-contiguous, which is the common
  case for numpy-backed xarray datasets.

  Args:
    ds: A partition-sized xarray Dataset (already sliced via isel).
    schema: The Arrow schema for the output, as produced by _parse_schema.
      Output column order follows the schema's field order.

  Returns:
    A RecordBatch holding one column per dimension coordinate and data
    variable, ordered per the schema.
  """
  # Take the dimension layout from a data variable so that coordinate
  # broadcasting and data-variable flattening agree on axis order. All
  # data variables are validated elsewhere to share one dims tuple.
  if ds.data_vars:
    template = next(iter(ds.data_vars.values()))
    dims = list(template.dims)
    full_shape = template.shape
  else:
    dims = list(ds.sizes.keys())
    full_shape = tuple(ds.sizes[d] for d in dims)

  def _column(field: pa.Field) -> pa.Array:
    """Materialize one schema field as a flat Arrow array."""
    key = field.name
    if key in ds.coords and key in ds.dims:
      # 1-D dim coordinate: reshape to rank-N with singleton axes, then
      # broadcast across the whole partition and flatten.
      values = ds.coords[key].values
      target = [1] * len(full_shape)
      target[dims.index(key)] = values.shape[0]
      flat = np.broadcast_to(values.reshape(target), full_shape).ravel()
      return pa.array(flat, type=field.type)
    # Data variable: flatten to 1-D (no copy when C-contiguous).
    return pa.array(ds[key].values.ravel(), type=field.type)

  return pa.RecordBatch.from_arrays([_column(f) for f in schema], schema=schema)
210+
211+
158212
def _parse_schema(ds) -> pa.Schema:
159213
"""Extracts a `pa.Schema` from the Dataset, treating dims and data_vars as columns."""
160214
columns = []
@@ -173,12 +227,12 @@ def _parse_schema(ds) -> pa.Schema:
173227

174228

175229
# Type alias for partition metadata: maps dimension name to (min, max, dtype_str) values
176-
PartitionBounds = t.Dict[str, t.Tuple[t.Any, t.Any, str]]
230+
PartitionBounds = dict[str, tuple[Any, Any, str]]
177231

178232

179233
def partition_metadata(
180-
ds: xr.Dataset, blocks: t.List[Block]
181-
) -> t.List[PartitionBounds]:
234+
ds: xr.Dataset, blocks: list[Block]
235+
) -> list[PartitionBounds]:
182236
"""Compute min/max coordinate values for each partition.
183237
184238
This metadata enables filter pushdown: SQL queries with WHERE clauses

xarray_sql/df_test.py

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import xarray as xr
99

1010
from .reader import read_xarray
11-
from .df import explode, block_slices, from_map, pivot, from_map_batched
11+
from .df import explode, block_slices, dataset_to_record_batch, from_map, pivot, from_map_batched, _parse_schema
1212

1313

1414
def rand_wx(start: str, end: str) -> xr.Dataset:
@@ -177,6 +177,54 @@ def make_arrow_table(x):
177177
assert len(result) == 3
178178

179179

180+
def test_dataset_to_record_batch_matches_pivot(air_small):
  """dataset_to_record_batch should contain the same rows as pivot.

  Row ordering may differ (pivot uses ds.dims key order; dataset_to_record_batch
  uses the data variable's own dim order). Both orderings are valid for SQL, so
  we sort by the coordinate columns before comparing.
  """
  schema = _parse_schema(air_small)
  dim_cols = [f.name for f in schema if f.name in air_small.dims]
  chunks = {"time": 4, "lat": 3, "lon": 4}

  def normalize(batch: pa.RecordBatch) -> pd.DataFrame:
    # Canonical row order so the two construction paths are comparable.
    return batch.to_pandas().sort_values(dim_cols).reset_index(drop=True)

  for block in block_slices(air_small, chunks=chunks):
    ds_block = air_small.isel(block)
    actual_df = normalize(dataset_to_record_batch(ds_block, schema))
    expected_df = normalize(
        pa.RecordBatch.from_pandas(pivot(ds_block), schema=schema)
    )
    pd.testing.assert_frame_equal(actual_df, expected_df, check_like=False)
207+
208+
209+
def test_dataset_to_record_batch_column_order(air_small):
  """Output column order must match schema (dims first, then data vars)."""
  schema = _parse_schema(air_small)
  first_block = next(
      block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4})
  )
  result = dataset_to_record_batch(air_small.isel(first_block), schema)
  assert result.schema.names == schema.names
215+
216+
217+
def test_dataset_to_record_batch_row_count(air_small):
  """Row count must equal the product of the block dimension sizes."""
  schema = _parse_schema(air_small)
  for block in block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}):
    ds_block = air_small.isel(block)
    expected_rows = math.prod(ds_block.sizes[d] for d in ds_block.sizes)
    assert dataset_to_record_batch(ds_block, schema).num_rows == expected_rows
226+
227+
180228
def test_from_map_batched_basic_functionality(air_small):
181229
blocks = list(block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}))
182230

@@ -334,7 +382,9 @@ def test_read_xarray_loads_one_chunk_at_a_time(large_ds):
334382
for peak in peaks:
335383
assert mean_peak * 1.1 > peak
336384
assert chunk_size * 7 > peak
337-
assert chunk_size * 4 < peak
385+
# Lower bound: at least chunk + Arrow output must be allocated.
386+
# The numpy-direct path peaks at ~2.5x (vs ~5x for the old pandas path).
387+
assert chunk_size * 1.5 < peak
338388

339389
assert max(peaks) < large_ds.nbytes
340390

xarray_sql/reader.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,14 @@
1616
import pyarrow as pa
1717
import xarray as xr
1818

19-
from .df import Block, Chunks, block_slices, partition_metadata, pivot, _parse_schema
19+
from .df import (
20+
Block,
21+
Chunks,
22+
_parse_schema,
23+
block_slices,
24+
dataset_to_record_batch,
25+
partition_metadata,
26+
)
2027

2128
if TYPE_CHECKING:
2229
from ._native import LazyArrowStreamTable
@@ -95,9 +102,9 @@ def _generate_batches(self) -> Iterator[pa.RecordBatch]:
95102
if self._iteration_callback is not None:
96103
self._iteration_callback(block)
97104

98-
# Convert this block to a RecordBatch
99-
df = pivot(self._ds.isel(block))
100-
yield pa.RecordBatch.from_pandas(df, schema=self._schema)
105+
# Convert this block to a RecordBatch directly from numpy arrays,
106+
# bypassing the pandas round-trip for lower peak memory usage.
107+
yield dataset_to_record_batch(self._ds.isel(block), self._schema)
101108

102109
def __arrow_c_stream__(
103110
self, requested_schema: object | None = None
@@ -246,9 +253,9 @@ def make_stream() -> pa.RecordBatchReader:
246253
if _iteration_callback is not None:
247254
_iteration_callback(block)
248255

249-
# Extract just this block from the dataset and convert to Arrow
250-
df = pivot(ds.isel(block))
251-
batch = pa.RecordBatch.from_pandas(df, schema=schema)
256+
# Convert this block to Arrow directly from numpy arrays,
257+
# bypassing the pandas round-trip for lower peak memory usage.
258+
batch = dataset_to_record_batch(ds.isel(block), schema)
252259
return pa.RecordBatchReader.from_batches(schema, [batch])
253260

254261
return make_stream

0 commit comments

Comments
 (0)