Skip to content

Commit b5242e3

Browse files
authored
Iterate through each block as multiple record batches per partition. (#135)
This reduces peak memory and makes processing more parallelizable across partitions. Fixes #128.
1 parent 77d29f1 commit b5242e3

File tree

3 files changed

+165
-21
lines changed

3 files changed

+165
-21
lines changed

xarray_sql/df.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,82 @@ def dataset_to_record_batch(
209209
return pa.RecordBatch.from_arrays(arrays, schema=schema)
210210

211211

212+
#: Default number of rows per emitted Arrow RecordBatch.
#: 64 K rows balances DataFusion pipeline depth against per-batch overhead.
DEFAULT_BATCH_SIZE: int = 65_536


def iter_record_batches(
    ds: xr.Dataset,
    schema: pa.Schema,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> Iterator[pa.RecordBatch]:
  """Yield RecordBatches of at most *batch_size* rows from a partition Dataset.

  Unlike :func:`dataset_to_record_batch`, which materialises the entire
  partition as one batch, this generator emits smaller batches so that
  DataFusion can begin filtering and aggregating before the full partition
  is loaded. Peak memory per batch is O(batch_size) for coordinate columns
  and O(partition_size) for data-variable columns (which must be loaded in
  full from storage).

  Coordinate values are computed per batch via strided index arithmetic —
  no broadcast array spanning the whole partition is ever allocated. Data
  variable flat arrays are loaded once (triggering any remote I/O) and then
  sliced as zero-copy views for each batch.

  Args:
    ds: A partition-sized xarray Dataset (already sliced via isel).
    schema: The Arrow schema for the output, as produced by _parse_schema.
    batch_size: Maximum number of rows per yielded RecordBatch. Must be >= 1.

  Yields:
    RecordBatches in schema column order, covering all rows of the
    partition exactly once.

  Raises:
    ValueError: If *batch_size* is smaller than 1.
  """
  # A non-positive batch_size would either raise an opaque error from
  # range() (0) or silently yield nothing (negative) — fail loudly instead.
  if batch_size < 1:
    raise ValueError(f'batch_size must be >= 1, got {batch_size!r}')

  if ds.data_vars:
    first_var = next(iter(ds.data_vars.values()))
    dim_names = list(first_var.dims)
    shape = first_var.shape
  else:
    dim_names = list(ds.sizes.keys())
    shape = tuple(ds.sizes[d] for d in dim_names)

  total_rows = int(np.prod(shape))

  def _is_dim_coord(name: str) -> bool:
    # Columns served by strided coordinate arithmetic. Everything else —
    # data variables, non-dimension coordinates, and dimensions *without*
    # a coordinate variable — goes through the flattened-array path.
    return name in ds.coords and name in ds.dims

  # Preload small 1-D coordinate arrays (negligible memory). Guarded so a
  # coordinate-less dimension doesn't raise a KeyError here even when the
  # schema never references it.
  coord_values = {
      name: ds.coords[name].values for name in dim_names if name in ds.coords
  }

  # C-order stride for each dimension: stride[k] = prod(shape[k+1:]).
  # Flat row index i → coordinate index for dim k: (i // stride[k]) % shape[k].
  strides = [int(np.prod(shape[k + 1 :])) for k in range(len(shape))]

  # Load non-coordinate arrays fully (triggers Dask/Zarr compute once).
  # ravel() is a zero-copy view for C-contiguous arrays. The predicate must
  # mirror the per-batch branch below — the original `not in ds.dims` check
  # skipped dimensions lacking a coordinate variable, which then failed with
  # a confusing KeyError inside the batch loop.
  data_arrays = {}
  for field in schema:
    if not _is_dim_coord(field.name):
      data_arrays[field.name] = ds[field.name].values.ravel()

  for row_start in range(0, total_rows, batch_size):
    row_end = min(row_start + batch_size, total_rows)
    row_idx = np.arange(row_start, row_end)

    arrays = []
    for field in schema:
      name = field.name
      if _is_dim_coord(name):
        k = dim_names.index(name)
        coord_idx = (row_idx // strides[k]) % shape[k]
        arrays.append(pa.array(coord_values[name][coord_idx], type=field.type))
      else:
        arrays.append(
            pa.array(data_arrays[name][row_start:row_end], type=field.type)
        )

    yield pa.RecordBatch.from_arrays(arrays, schema=schema)
212288
def _parse_schema(ds) -> pa.Schema:
213289
"""Extracts a `pa.Schema` from the Dataset, treating dims and data_vars as columns."""
214290
columns = []

xarray_sql/df_test.py

Lines changed: 70 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,17 @@
88
import xarray as xr
99

1010
from .reader import read_xarray
11-
from .df import explode, block_slices, dataset_to_record_batch, from_map, pivot, from_map_batched, _parse_schema
11+
from .df import (
12+
DEFAULT_BATCH_SIZE,
13+
_parse_schema,
14+
block_slices,
15+
dataset_to_record_batch,
16+
explode,
17+
from_map,
18+
from_map_batched,
19+
iter_record_batches,
20+
pivot,
21+
)
1222

1323

1424
def rand_wx(start: str, end: str) -> xr.Dataset:
@@ -177,6 +187,55 @@ def make_arrow_table(x):
177187
assert len(result) == 3
178188

179189

190+
def test_iter_record_batches_splits_into_multiple_batches(air_small):
  """iter_record_batches should emit >1 batch when partition exceeds batch_size."""
  schema = _parse_schema(air_small)
  first_block = next(
      block_slices(air_small, chunks={'time': 4, 'lat': 3, 'lon': 4})
  )
  partition = air_small.isel(first_block)

  rows_in_partition = 1
  for dim in partition.sizes:
    rows_in_partition *= int(partition.sizes[dim])

  limit = 16  # force many small batches
  emitted = list(iter_record_batches(partition, schema, batch_size=limit))

  # Ceiling division without math.ceil: expected number of batches.
  assert len(emitted) == (rows_in_partition + limit - 1) // limit
  for batch in emitted:
    assert batch.num_rows <= limit
  assert sum(batch.num_rows for batch in emitted) == rows_in_partition
205+
def test_iter_record_batches_matches_dataset_to_record_batch(air_small):
  """Concatenating all iter_record_batches output must equal dataset_to_record_batch."""
  schema = _parse_schema(air_small)
  dim_cols = [field.name for field in schema if field.name in air_small.dims]
  first_block = next(
      block_slices(air_small, chunks={'time': 4, 'lat': 3, 'lon': 4})
  )
  partition = air_small.isel(first_block)

  def _as_sorted_df(table_like):
    # Normalise row order so batch boundaries can't affect the comparison.
    return table_like.to_pandas().sort_values(dim_cols).reset_index(drop=True)

  chunked = pa.Table.from_batches(
      list(iter_record_batches(partition, schema, batch_size=16))
  )
  single = dataset_to_record_batch(partition, schema)

  pd.testing.assert_frame_equal(_as_sorted_df(chunked), _as_sorted_df(single))
228+
def test_iter_record_batches_default_batch_size():
  """A single-batch partition (rows <= DEFAULT_BATCH_SIZE) yields exactly one batch."""
  # Build a small synthetic dataset instead of xr.tutorial.open_dataset,
  # which downloads the tutorial file over the network at test time —
  # slow, flaky offline, and dependent on an on-disk cache.
  times = pd.date_range('2000-01-01', periods=2)
  lats = np.arange(25.0, 50.0, 1.0)
  lons = np.arange(200.0, 230.0, 1.0)
  ds = xr.Dataset(
      {
          'air': (
              ('time', 'lat', 'lon'),
              np.zeros((len(times), len(lats), len(lons))),
          )
      },
      coords={'time': times, 'lat': lats, 'lon': lons},
  )
  schema = _parse_schema(ds)
  total_rows = int(np.prod([ds.sizes[d] for d in ds.sizes]))
  assert total_rows <= DEFAULT_BATCH_SIZE, 'fixture too large — adjust sizes'
  batches = list(iter_record_batches(ds, schema))
  assert len(batches) == 1
  assert batches[0].num_rows == total_rows
180239
def test_dataset_to_record_batch_matches_pivot(air_small):
181240
"""dataset_to_record_batch should contain the same rows as pivot.
182241
@@ -371,20 +430,19 @@ def test_read_xarray_loads_one_chunk_at_a_time(large_ds):
371430
sizes.append(cur_size)
372431
peaks.append(cur_peak)
373432

374-
mean_size = np.mean(sizes)
375-
mean_peak = np.mean(peaks)
376-
377433
for size in sizes:
378-
assert mean_size * 1.1 > size
379-
assert chunk_size * 3 > size
380-
assert chunk_size * 2 < size
434+
# Observed range: 1.59–1.83× chunk_size.
435+
# iter_record_batches holds data-variable arrays (≈1× chunk) while
436+
# yielding sub-batches, plus the current Arrow batch (≈0.65× chunk).
437+
assert chunk_size * 1.3 < size, f"size {size} unexpectedly low"
438+
assert chunk_size * 2.2 > size, f"size {size} unexpectedly high"
381439

382440
for peak in peaks:
383-
assert mean_peak * 1.1 > peak
384-
assert chunk_size * 7 > peak
385-
# Lower bound: at least chunk + Arrow output must be allocated.
386-
# The numpy-direct path peaks at ~2.5x (vs ~5x for the old pandas path).
387-
assert chunk_size * 1.5 < peak
441+
# Observed range: 1.84–3.28× chunk_size.
442+
# Peak includes data arrays + Arrow batch + temporary coordinate index
443+
# arrays; the first batch of each chunk is highest (Dask compute overhead).
444+
assert chunk_size * 1.5 < peak, f"peak {peak} unexpectedly low"
445+
assert chunk_size * 4.0 > peak, f"peak {peak} unexpectedly high"
388446

389447
assert max(peaks) < large_ds.nbytes
390448

xarray_sql/reader.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
from .df import (
2020
Block,
2121
Chunks,
22+
DEFAULT_BATCH_SIZE,
2223
_parse_schema,
2324
block_slices,
24-
dataset_to_record_batch,
25+
iter_record_batches,
2526
partition_metadata,
2627
)
2728

@@ -61,6 +62,7 @@ def __init__(
6162
ds: xr.Dataset,
6263
chunks: Chunks = None,
6364
*,
65+
batch_size: int = DEFAULT_BATCH_SIZE,
6466
_iteration_callback: Callable[[Block], None] | None = None,
6567
):
6668
"""Initialize the lazy reader.
@@ -69,12 +71,16 @@ def __init__(
6971
ds: An xarray Dataset. All data_vars must share the same dimensions.
7072
chunks: Xarray-like chunks specification. If not provided, uses
7173
the Dataset's existing chunks.
74+
batch_size: Maximum rows per emitted Arrow RecordBatch. Smaller
75+
values let DataFusion start processing earlier at the cost of
76+
more Python→Arrow conversion calls.
7277
_iteration_callback: Internal callback for testing. Called with
7378
each block dict just before it's converted to Arrow. This
7479
allows tests to track when iteration actually occurs.
7580
"""
7681
self._ds = ds
7782
self._chunks = chunks
83+
self._batch_size = batch_size
7884
self._schema = _parse_schema(ds)
7985
self._iteration_callback = _iteration_callback
8086
self._consumed = False
@@ -95,16 +101,17 @@ def _generate_batches(self) -> Iterator[pa.RecordBatch]:
95101
"""Generate RecordBatches lazily from xarray blocks.
96102
97103
This generator is only consumed when the Arrow stream's get_next
98-
is called, ensuring true lazy evaluation.
104+
is called, ensuring true lazy evaluation. Each xarray block is
105+
emitted as one or more RecordBatches of at most self._batch_size rows.
99106
"""
100107
for block in block_slices(self._ds, self._chunks):
101108
# Call the iteration callback if provided (for testing)
102109
if self._iteration_callback is not None:
103110
self._iteration_callback(block)
104111

105-
# Convert this block to a RecordBatch directly from numpy arrays,
106-
# bypassing the pandas round-trip for lower peak memory usage.
107-
yield dataset_to_record_batch(self._ds.isel(block), self._schema)
112+
yield from iter_record_batches(
113+
self._ds.isel(block), self._schema, self._batch_size
114+
)
108115

109116
def __arrow_c_stream__(
110117
self, requested_schema: object | None = None
@@ -179,6 +186,7 @@ def read_xarray_table(
179186
ds: xr.Dataset,
180187
chunks: Chunks = None,
181188
*,
189+
batch_size: int = DEFAULT_BATCH_SIZE,
182190
_iteration_callback: Callable[[Block], None] | None = None,
183191
) -> "LazyArrowStreamTable":
184192
"""Create a lazy DataFusion table from an xarray Dataset.
@@ -208,6 +216,9 @@ def read_xarray_table(
208216
ds: An xarray Dataset. All data_vars must share the same dimensions.
209217
chunks: Xarray-like chunks specification. If not provided, uses
210218
the Dataset's existing chunks.
219+
batch_size: Maximum rows per Arrow RecordBatch emitted per partition.
220+
Smaller values let DataFusion start processing earlier; the default
221+
(65 536) works well for most datasets.
211222
_iteration_callback: Internal callback for testing. Called with
212223
each block dict just before it's converted to Arrow.
213224
@@ -253,10 +264,9 @@ def make_stream() -> pa.RecordBatchReader:
253264
if _iteration_callback is not None:
254265
_iteration_callback(block)
255266

256-
# Convert this block to Arrow directly from numpy arrays,
257-
# bypassing the pandas round-trip for lower peak memory usage.
258-
batch = dataset_to_record_batch(ds.isel(block), schema)
259-
return pa.RecordBatchReader.from_batches(schema, [batch])
267+
return pa.RecordBatchReader.from_batches(
268+
schema, iter_record_batches(ds.isel(block), schema, batch_size)
269+
)
260270

261271
return make_stream
262272

0 commit comments

Comments
 (0)