Commit e8025ab

doc: improve readability

1 parent 683cb5c commit e8025ab

5 files changed, +852 -369 lines changed

datastream/datastream.py (+8 -4)
````diff
@@ -106,10 +106,13 @@ def merge(
     @staticmethod
     def zip(datastreams: List[Datastream]) -> Datastream[Tuple]:
         """
-        Zip multiple datastreams together so that all combinations of examples
-        are possible (i.e. the product) creating tuples like
-        ``(example1, example2, ...)``. The samples are drawn independently
-        from each underlying datastream.
+        Zip multiple datastreams together so that samples are drawn
+        independently from each underlying datastream, creating tuples
+        like ``(example1, example2, ...)``.
+
+        Note: This is different from ``Dataset.combine``, which creates
+        all possible combinations (cartesian product) of examples. If you
+        need all possible combinations, use ``Dataset.combine`` instead.
         """
         return Datastream(
             Dataset.combine([datastream.dataset for datastream in datastreams]),
````
docs/dataset.md (+225 -19)
````diff
@@ -2,6 +2,8 @@
 
 A `Dataset[T]` is a mapping that allows pipelining of functions in a readable syntax returning an example of type `T`.
 
+<!--pytest-codeblocks:importorskip(datastream)-->
+
 ```python
 from datastream import Dataset
 
````
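The intro example is cut off at the import above; the `assert dataset[2] == ('banana', 28)` visible in the next hunk's header suggests a pipeline along these lines (the fruit data is an assumption):

```python
from datastream import Dataset

fruits_and_cost = (('apple', 5), ('pear', 7), ('banana', 14), ('kiwi', 100))

dataset = (
    Dataset.from_subscriptable(fruits_and_cost)
    .starmap(lambda fruit, cost: (fruit, cost * 2))
)

assert dataset[2] == ('banana', 28)
```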
````diff
@@ -25,15 +27,49 @@ assert dataset[2] == ('banana', 28)
 
 ## Class Methods
 
-### from_subscriptable
+### `from_subscriptable`
+
+```python
+from_subscriptable(data: Subscriptable[T]) -> Dataset[T]
+```
 
 Create `Dataset` based on subscriptable i.e. implements `__getitem__` and `__len__`.
 
+#### Parameters
+
+- `data`: Any object that implements `__getitem__` and `__len__`
+
+#### Returns
+
+- A new Dataset instance
+
+#### Notes
+
 Should only be used for simple examples as a `Dataset` created with this method does not support methods that require a source dataframe like `Dataset.split` and `Dataset.subset`.
 
-### from_dataframe
+### `from_dataframe`
+
+```python
+from_dataframe(df: pd.DataFrame) -> Dataset[pd.Series]
+```
+
+Create `Dataset` based on `pandas.DataFrame`.
+
+#### Parameters
 
-Create `Dataset` based on `pandas.DataFrame`. `Dataset.__getitem__` will return a row from the dataframe and `Dataset.map` should be given a function that takes a row from the dataframe as input.
+- `df`: Source pandas DataFrame
+
+#### Returns
+
+- A new Dataset instance where `__getitem__` returns a row from the dataframe
+
+#### Notes
+
+`Dataset.map` should be given a function that takes a row from the dataframe as input.
+
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
 
 ```python
 import pandas as pd
````
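`from_subscriptable` gains parameter docs but no example in this commit; a minimal sketch of the documented behavior, with arbitrary values:

```python
from datastream import Dataset

# any object implementing __getitem__ and __len__ works, e.g. a list
dataset = Dataset.from_subscriptable([4, 5, 6])

assert len(dataset) == 3
assert dataset[0] == 4
```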
````diff
@@ -49,10 +85,30 @@ dataset = (
 assert dataset[-1] == 4
 ```
 
-### from_paths
+### `from_paths`
+
+```python
+from_paths(paths: List[str], pattern: str) -> Dataset[pd.Series]
+```
 
 Create `Dataset` from paths using regex pattern that extracts information from the path itself.
-`Dataset.__getitem__` will return a row from the dataframe and `Dataset.map` should be given a function that takes a row from the dataframe as input.
+
+#### Parameters
+
+- `paths`: List of file paths
+- `pattern`: Regex pattern with named groups to extract information from paths
+
+#### Returns
+
+- A new Dataset instance where `__getitem__` returns a row from the generated dataframe
+
+#### Notes
+
+`Dataset.map` should be given a function that takes a row from the dataframe as input.
+
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
 
 ```python
 from datastream import Dataset
````
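The `from_paths` example is truncated at the import above; a sketch of how the named-group pattern might be used — the paths, the pattern, and the exact matching semantics are assumptions, not the library's documented example:

```python
from datastream import Dataset

# hypothetical paths; the named group `class_name` becomes a column
# in the dataframe that from_paths generates
dataset = (
    Dataset.from_paths(
        ['intact/image1.png', 'damage/image2.png'],
        pattern=r'(?P<class_name>\w+)/image\d+\.png',
    )
    .map(lambda row: row['class_name'])
)

assert dataset[-1] == 'damage'
```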
````diff
@@ -68,10 +124,26 @@ assert dataset[-1] == 'damage'
 
 ## Instance Methods
 
-### map
+### `map`
+
+```python
+map(self, function: Callable[[T], U]) -> Dataset[U]
+```
 
 Creates a new dataset with the function added to the dataset pipeline.
 
+#### Parameters
+
+- `function`: Function to apply to each example
+
+#### Returns
+
+- A new Dataset with the mapping function added to the pipeline
+
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
+
 ```python
 from datastream import Dataset
 
````
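The body of the `map` example falls between this hunk and the next, whose context shows `assert dataset[-1] == 4`; a sketch consistent with that assert:

```python
from datastream import Dataset

dataset = (
    Dataset.from_subscriptable([1, 2, 3])
    .map(lambda number: number + 1)
)

assert dataset[-1] == 4
```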
````diff
@@ -83,11 +155,30 @@ dataset = (
 assert dataset[-1] == 4
 ```
 
-### starmap
+### `starmap`
+
+```python
+starmap(self, function: Callable[..., U]) -> Dataset[U]
+```
 
 Creates a new dataset with the function added to the dataset pipeline.
+
+#### Parameters
+
+- `function`: Function that accepts multiple arguments unpacked from the pipeline output
+
+#### Returns
+
+- A new Dataset with the mapping function added to the pipeline
+
+#### Notes
+
 The dataset's pipeline should return an iterable that will be expanded as arguments to the mapped function.
 
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
+
 ```python
 from datastream import Dataset
 
````
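Likewise for `starmap`: the next hunk's context asserts `dataset[-1] == 7`, which a pipeline like the following would satisfy (the exact lambdas are assumptions):

```python
from datastream import Dataset

dataset = (
    Dataset.from_subscriptable([1, 2, 3])
    .map(lambda number: (number, number + 1))  # pipeline now yields pairs
    .starmap(lambda number, successor: number + successor)
)

assert dataset[-1] == 7  # 3 + 4
```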
````diff
@@ -100,11 +191,29 @@ dataset = (
 assert dataset[-1] == 7
 ```
 
-### subset
+### `subset`
+
+```python
+subset(self, function: Callable[[pd.DataFrame], pd.Series]) -> Dataset[T]
+```
+
+Select a subset of the dataset using a function that receives the source dataframe as input.
 
-Select a subset of the dataset using a function that receives the source dataframe as input and is expected to return a boolean mask.
+#### Parameters
 
-Note that this function can still be called after multiple operations such as mapping functions as it uses the source dataframe.
+- `function`: Function that takes a DataFrame and returns a boolean mask
+
+#### Returns
+
+- A new Dataset containing only the selected examples
+
+#### Notes
+
+This function can still be called after multiple operations such as mapping functions as it uses the source dataframe.
+
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
 
 ```python
 import pandas as pd
````
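The `subset` example is also truncated; a sketch consistent with the `assert dataset[-1] == 2` in the next hunk's context:

```python
import pandas as pd
from datastream import Dataset

df = pd.DataFrame(dict(number=[1, 2, 3]))

dataset = (
    Dataset.from_dataframe(df)
    .map(lambda row: row['number'])
    .subset(lambda df: df['number'] <= 2)  # boolean mask over source rows
)

assert dataset[-1] == 2
```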
````diff
@@ -121,9 +230,36 @@ dataset = (
 assert dataset[-1] == 2
 ```
 
-### split
+### `split`
+
+```python
+split(
+    self,
+    key_column: str,
+    proportions: Dict[str, float],
+    stratify_column: Optional[str] = None,
+    filepath: Optional[str] = None,
+    seed: Optional[int] = None,
+) -> Dict[str, Dataset[T]]
+```
+
+Split dataset into multiple parts.
+
+#### Parameters
+
+- `key_column`: Column to use as unique identifier for examples
+- `proportions`: Dictionary mapping split names to proportions
+- `stratify_column`: Optional column to use for stratification
+- `filepath`: Optional path to save/load split configuration
+- `seed`: Optional random seed for reproducibility
+
+#### Returns
+
+- Dictionary mapping split names to Dataset instances
 
-Split dataset into multiple parts. Optionally you can stratify on a column in the source dataframe or save the split to a json file.
+#### Notes
+
+Optionally you can stratify on a column in the source dataframe or save the split to a json file.
 If you are sure that the split strategy will not change then you can safely use a seed instead of a filepath.
 
 Saved splits can continue from the old split and handle:
````
````diff
@@ -133,6 +269,10 @@ Saved splits can continue from the old split and handle:
 - Adapt after removing examples from dataset
 - Adapt to new stratification
 
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
+
 ```python
 import numpy as np
 import pandas as pd
````
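The `split` example continues past this hunk; judging by the asserts in the next hunk's context, a sketch with assumed dataframe contents and seed:

```python
import numpy as np
import pandas as pd
from datastream import Dataset

df = pd.DataFrame(dict(index=np.arange(100)))

split_datasets = (
    Dataset.from_dataframe(df)
    .map(lambda row: row['index'])
    .split(
        key_column='index',
        proportions=dict(train=0.8, test=0.2),
        seed=700,  # hypothetical seed; pass filepath=... to persist the split
    )
)

assert len(split_datasets['train']) == 80
```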
````diff
@@ -154,9 +294,21 @@ assert len(split_datasets['train']) == 80
 assert split_datasets['test'][0] == 3
 ```
 
-### zip_index
+### `zip_index`
+
+```python
+zip_index(self) -> Dataset[Tuple[T, int]]
+```
+
+Zip the output with its underlying Dataset index.
 
-Zip the output with its underlying Dataset index. The output of the pipeline will be a tuple `(output, index)`.
+#### Returns
+
+- A new Dataset where each example is a tuple of `(output, index)`
+
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
 
 ```python
 from datastream import Dataset
````
````diff
@@ -165,10 +317,26 @@ dataset = Dataset.from_subscriptable([4, 5, 6]).zip_index()
 assert dataset[0] == (4, 0)
 ```
 
-### cache
+### `cache`
+
+```python
+cache(self, key_column: str) -> Dataset[T]
+```
 
 Cache intermediate step in-memory based on key column.
 
+#### Parameters
+
+- `key_column`: Column to use as cache key
+
+#### Returns
+
+- A new Dataset with caching enabled
+
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
+
 ```python
 import pandas as pd
 from datastream import Dataset
````
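The `cache` example's dataframe is not shown; a sketch with an assumed `df` that matches the `assert dataset[0]['value'] == 1` visible in the next hunk's header:

```python
import pandas as pd
from datastream import Dataset

df = pd.DataFrame(dict(key=['a', 'b'], value=[1, 2]))

# repeated reads of the same example are now served from the
# in-memory cache, keyed on the `key` column
dataset = Dataset.from_dataframe(df).cache('key')

assert dataset[0]['value'] == 1
```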
````diff
@@ -178,12 +346,30 @@ dataset = Dataset.from_dataframe(df).cache('key')
 assert dataset[0]['value'] == 1
 ```
 
-### concat
+### `concat`
+
+```python
+concat(datasets: List[Dataset[T]]) -> Dataset[T]
+```
+
+Concatenate multiple datasets together.
+
+#### Parameters
+
+- `datasets`: List of datasets to concatenate
+
+#### Returns
+
+- A new Dataset combining all input datasets
 
-Concatenate multiple datasets together so that they behave like a single dataset.
+#### Notes
 
 Consider using `Datastream.merge` if you have multiple data sources instead as it allows you to control the number of samples from each source in the training batches.
 
+#### Examples
+
+<!--pytest-codeblocks:importorskip(datastream)-->
+
 ```python
 from datastream import Dataset
 
````
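The `concat` example body sits between hunks; the next hunk's context asserts `len(combined) == 4` and `combined[2] == 3`, consistent with this sketch:

```python
from datastream import Dataset

first = Dataset.from_subscriptable([1, 2])
second = Dataset.from_subscriptable([3, 4])

combined = Dataset.concat([first, second])

assert len(combined) == 4
assert combined[2] == 3  # indexing continues into the second dataset
```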
````diff
@@ -194,9 +380,29 @@ assert len(combined) == 4
 assert combined[2] == 3
 ```
 
-### combine
+### `combine`
+
+```python
+combine(datasets: List[Dataset]) -> Dataset[Tuple]
+```
+
+Zip multiple datasets together so that all combinations of examples are possible.
+
+#### Parameters
+
+- `datasets`: List of datasets to combine
+
+#### Returns
+
+- A new Dataset yielding tuples of all possible combinations
+
+#### Notes
+
+Creates tuples like `(example1, example2, ...)` for all possible combinations (i.e. the cartesian product).
+
+#### Examples
 
-Zip multiple datasets together so that all combinations of examples are possible (i.e. the product) creating tuples like `(example1, example2, ...)`.
+<!--pytest-codeblocks:importorskip(datastream)-->
 
 ```python
 from datastream import Dataset
````
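Finally, a minimal sketch of `combine`'s documented cartesian-product behavior; the tuple ordering is not specified in the docs, so only the length is asserted:

```python
from datastream import Dataset

letters = Dataset.from_subscriptable(['a', 'b'])
numbers = Dataset.from_subscriptable([1, 2, 3])

combined = Dataset.combine([letters, numbers])

# every letter paired with every number
assert len(combined) == 2 * 3
```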
