Skip to content

Commit 1bd5aa9

Browse files
BryanCutler authored and terrytangyuan committed
Arrow batch_mode fail fast for unsupported options (#255)
* Fast fail for unsupported batch_mode * expanded ArrowDataset batching documentation and added to arrow/README.md
1 parent 4f5b935 commit 1bd5aa9

File tree

3 files changed

+70
-7
lines changed

3 files changed

+70
-7
lines changed

tensorflow_io/arrow/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,23 @@ with tf.Session() as sess:
109109

110110
An alternate constructor can also be used to infer output types and shapes from
111111
a given `pyarrow.Schema`, e.g. `dataset = ArrowStreamDataset.from_schema(host, schema)`
112+
113+
## Creating Batches with Arrow Datasets
114+
115+
Arrow Datasets have optional parameters to specify a `batch_size` and
116+
`batch_mode`. Supported `batch_modes` are: 'keep_remainder', 'drop_remainder'
117+
and 'auto'. If the last elements of the Dataset do not combine to the set
118+
`batch_size`, then 'keep_remainder' will return a partial batch, while
119+
'drop_remainder' will discard the partial batch. Setting `batch_mode` to 'auto'
120+
will automatically set a batch size to the number of records in the incoming
121+
Arrow record batches. This is a good option to use if the incoming Arrow record
122+
batch size can be controlled to ensure the output batch size is not too large
123+
and sequential Arrow record batches are sized equally.
124+
125+
Setting the `batch_size` or using `batch_mode` of 'auto' can be more efficient
126+
than using `tf.data.Dataset.batch()` on an Arrow Dataset. This is because the
127+
output tensor can be sized to the desired batch size on creation, and then data
128+
is transferred directly from Arrow memory. Otherwise, if batching elements with
129+
the output of an Arrow Dataset, e.g. `ArrowDataset(...).batch(batch_size=4)`,
130+
then the tensor data will need to be aggregated and copied to get the final
131+
batched outputs.

tensorflow_io/arrow/python/ops/arrow_dataset_ops.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ class ArrowBaseDataset(data.Dataset):
8888
and corresponding output tensor types, shapes and classes.
8989
"""
9090

91+
batch_modes_supported = ('keep_remainder', 'drop_remainder', 'auto')
92+
9193
def __init__(self,
9294
columns,
9395
output_types,
@@ -104,6 +106,10 @@ def __init__(self,
104106
batch_size or 0,
105107
dtype=dtypes.int64,
106108
name="batch_size")
109+
if batch_mode not in self.batch_modes_supported:
110+
raise ValueError(
111+
"Unsupported batch_mode: '{}', must be one of {}"
112+
.format(batch_mode, self.batch_modes_supported))
107113
self._batch_mode = tensorflow.convert_to_tensor(
108114
batch_mode,
109115
dtypes.string,
@@ -147,7 +153,10 @@ def __init__(self,
147153
output_types: Tensor dtypes of the output tensors
148154
output_shapes: TensorShapes of the output tensors or None to
149155
infer partial
150-
batch_size: Batch size of output tensors
156+
batch_size: Batch size of output tensors, setting a batch size here
157+
will create batched Tensors from Arrow memory and can be more
158+
efficient than using tf.data.Dataset.batch().
159+
NOTE: batch_size does not need to be set if batch_mode='auto'
151160
batch_mode: Mode of batching, supported strings:
152161
"keep_remainder" (default, keeps partial batch data),
153162
"drop_remainder" (discard partial batch data),
@@ -187,7 +196,10 @@ def from_record_batches(cls,
187196
output_types: Tensor dtypes of the output tensors
188197
output_shapes: TensorShapes of the output tensors or None to
189198
infer partial
190-
batch_size: Batch size of output tensors
199+
batch_size: Batch size of output tensors, setting a batch size here
200+
will create batched tensors from Arrow memory and can be more
201+
efficient than using tf.data.Dataset.batch().
202+
NOTE: batch_size does not need to be set if batch_mode='auto'
191203
batch_mode: Mode of batching, supported strings:
192204
"keep_remainder" (default, keeps partial batch data),
193205
"drop_remainder" (discard partial batch data),
@@ -230,7 +242,10 @@ def from_pandas(cls,
230242
df: a Pandas DataFrame
231243
columns: Optional column indices to use, if None all are used
232244
preserve_index: Flag to include the DataFrame index as the last column
233-
batch_size: Batch size of output tensors
245+
batch_size: Batch size of output tensors, setting a batch size here
246+
will create batched tensors from Arrow memory and can be more
247+
efficient than using tf.data.Dataset.batch().
248+
NOTE: batch_size does not need to be set if batch_mode='auto'
234249
batch_mode: Mode of batching, supported strings:
235250
"keep_remainder" (default, keeps partial batch data),
236251
"drop_remainder" (discard partial batch data),
@@ -274,7 +289,10 @@ def __init__(self,
274289
output_types: Tensor dtypes of the output tensors
275290
output_shapes: TensorShapes of the output tensors or None to
276291
infer partial
277-
batch_size: Batch size of output tensors
292+
batch_size: Batch size of output tensors, setting a batch size here
293+
will create batched tensors from Arrow memory and can be more
294+
efficient than using tf.data.Dataset.batch().
295+
NOTE: batch_size does not need to be set if batch_mode='auto'
278296
batch_mode: Mode of batching, supported strings:
279297
"keep_remainder" (default, keeps partial batch data),
280298
"drop_remainder" (discard partial batch data),
@@ -316,7 +334,10 @@ def from_schema(cls,
316334
in Arrow Feather format
317335
schema: Arrow schema defining the record batch data in the stream
318336
columns: A list of column indices to use from the schema, None for all
319-
batch_size: Batch size of output tensors
337+
batch_size: Batch size of output tensors, setting a batch size here
338+
will create batched tensors from Arrow memory and can be more
339+
efficient than using tf.data.Dataset.batch().
340+
NOTE: batch_size does not need to be set if batch_mode='auto'
320341
batch_mode: Mode of batching, supported strings:
321342
"keep_remainder" (default, keeps partial batch data),
322343
"drop_remainder" (discard partial batch data),
@@ -355,7 +376,10 @@ def __init__(self,
355376
output_types: Tensor dtypes of the output tensors
356377
output_shapes: TensorShapes of the output tensors or None to
357378
infer partial
358-
batch_size: Batch size of output tensors
379+
batch_size: Batch size of output tensors, setting a batch size here
380+
will create batched tensors from Arrow memory and can be more
381+
efficient than using tf.data.Dataset.batch().
382+
NOTE: batch_size does not need to be set if batch_mode='auto'
359383
batch_mode: Mode of batching, supported strings:
360384
"keep_remainder" (default, keeps partial batch data),
361385
"drop_remainder" (discard partial batch data),
@@ -397,7 +421,10 @@ def from_schema(cls,
397421
For a socket client, use "<HOST_IP>:<PORT>", for stdin use "STDIN".
398422
schema: Arrow schema defining the record batch data in the stream
399423
columns: A list of column indices to use from the schema, None for all
400-
batch_size: Batch size of output tensors
424+
batch_size: Batch size of output tensors, setting a batch size here
425+
will create batched tensors from Arrow memory and can be more
426+
efficient than using tf.data.Dataset.batch().
427+
NOTE: batch_size does not need to be set if batch_mode='auto'
401428
batch_mode: Mode of batching, supported strings:
402429
"keep_remainder" (default, keeps partial batch data),
403430
"drop_remainder" (discard partial batch data),

tests/test_arrow.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -639,6 +639,22 @@ def test_batch_variable_length_list(self):
639639
with self.assertRaisesRegexp(errors.OpError, 'variable.*unsupported'):
640640
self.run_test_case(dataset, truth_data, batch_size=batch_size)
641641

642+
def test_unsupported_batch_mode(self):
643+
"""Test using an unsupported batch mode
644+
"""
645+
truth_data = TruthData(
646+
self.scalar_data,
647+
self.scalar_dtypes,
648+
self.scalar_shapes)
649+
650+
with self.assertRaisesRegexp(ValueError, 'Unsupported batch_mode.*doh'):
651+
arrow_io.ArrowDataset.from_record_batches(
652+
[self.make_record_batch(truth_data)],
653+
list(range(len(truth_data.output_types))),
654+
truth_data.output_types,
655+
truth_data.output_shapes,
656+
batch_mode='doh')
657+
642658

643659
if __name__ == "__main__":
644660
test.main()

0 commit comments

Comments
 (0)