feat(synthesizer): Support for MultiTable #81

Merged
11 commits merged on Jan 16, 2024
7 changes: 5 additions & 2 deletions Makefile
@@ -68,13 +68,16 @@ package: ### Builds the package in wheel format
echo "$(version)" > src/ydata/sdk/VERSION
stubgen src/ydata/sdk -o src --export-less
$(PYTHON) -m build --wheel
twine check dist/*
$(PYTHON) -m twine check dist/*

wheel: ### Compiles the wheel
test -d wheels || mkdir -p wheels
cp dist/ydata_sdk-$(version)-py3-none-any.whl wheels/ydata_sdk-$(version)-py$(PYV)-none-any.whl
$(PYTHON) -m pyc_wheel wheels/ydata_sdk-$(version)-py$(PYV)-none-any.whl
twine check wheels/*
$(PYTHON) -m twine check wheels/*

upload:
$(PYTHON) -m twine upload -r ydata wheels/ydata_sdk-$(version)-py310-none-any.whl

publish-docs: ### Publishes the documentation
mike deploy --push --update-aliases $(version) latest
4 changes: 2 additions & 2 deletions docs/examples/synthesize_timeseries_data.md
@@ -2,9 +2,9 @@

**Use YData's *TimeSeriesSynthesizer* to generate time-series synthetic data**

Tabular data is the most common type of data we encounter in data problems.
Time-series data is one of the most common types of data we encounter in data problems.

When thinking about tabular data, we assume independence between different records, but this does not happen in reality. Suppose we check events from our day-to-day life, such as room temperature changes, bank account transactions, stock price fluctuations, and air quality measurements in our neighborhood. In that case, we might end up with datasets where measures and records evolve and are related through time. This type of data is known to be sequential or time-series data.
Unlike tabular data, where we typically assume independence between different records, time-series data does not satisfy this assumption. Suppose we check events from our day-to-day life, such as room temperature changes, bank account transactions, stock price fluctuations, and air quality measurements in our neighborhood. In that case, we end up with datasets where measures and records evolve and are related through time. This type of data is known as sequential or time-series data.

Thus, sequential or time-series data refers to any data containing elements ordered into sequences in a structured format.
Dissecting any time-series dataset, we see differences in the variables' behavior that need to be understood for effective synthetic data generation. Typically, any time-series dataset is composed of the following:
17 changes: 17 additions & 0 deletions docs/examples/synthesizer_multitable.md
@@ -0,0 +1,17 @@
# Synthesize Multi-Table Data

**Use YData's *MultiTableSynthesizer* to generate multi-table synthetic data from multiple RDBMS tables**

Multi-table synthesis generates data from multiple tables of a database, keeping the relational schema between them in mind.

Quickstart example:

```python
--8<-- "examples/synthesizers/multi_table_quickstart.py"
```

Example of overriding the write connector when sampling:

```python
--8<-- "examples/synthesizers/multi_table_sample_write_override.py"
```
1 change: 1 addition & 0 deletions docs/sdk/reference/api/synthesizers/multitable.md
@@ -0,0 +1 @@
::: ydata.sdk.synthesizers.multitable.MultiTableSynthesizer
25 changes: 25 additions & 0 deletions examples/synthesizers/multi_table_quickstart.py
@@ -0,0 +1,25 @@
import os

from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Do not forget to add your token as an environment variable
os.environ["YDATA_TOKEN"] = '<TOKEN>' # Remove if already defined

# In this example, we demonstrate how to train a synthesizer from an existing multi-table RDBMS datasource.
# After training a Multi Table Synthesizer, we request a sample.
# In this case, the sample is not returned as a Dataset; it is written to the database
# that the connector refers to.

X = DataSource.get('<DATASOURCE_UID>')

# Initialize a multi table synthesizer with the connector to write to.
# As long as `fit` has not been called, the synthesizer exists only locally.
# write_connector can be a UID or a Connector instance
synth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')

# The synthesizer training is requested
synth.fit(X)

# We request a synthetic dataset with a fraction of 1.5 (i.e., 1.5 times the size of the original database)
synth.sample(frac=1.5)
32 changes: 32 additions & 0 deletions examples/synthesizers/multi_table_sample_write_override.py
@@ -0,0 +1,32 @@
import os

from ydata.sdk.connectors import Connector
from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Do not forget to add your token as an environment variable
os.environ["YDATA_TOKEN"] = '<TOKEN>' # Remove if already defined

# In this example, we demonstrate how to train a synthesizer from an existing multi-table RDBMS datasource.
# After training a Multi Table Synthesizer, we request a sample.
# In this case, the sample is not returned as a Dataset; it is written to the database
# that the connector refers to.

X = DataSource.get('<DATASOURCE_UID>')

# For demonstration purposes, we will use a connector instance, but you can simply pass the UID instead

write_connector = Connector.get('<CONNECTOR_UID>')

# Initialize a multi table synthesizer with the connector to write to.
# As long as `fit` has not been called, the synthesizer exists only locally.
# write_connector can be a UID or a Connector instance
synth = MultiTableSynthesizer(write_connector=write_connector)

# The synthesizer training is requested
synth.fit(X)

# We request a synthetic dataset with a fraction of 1.5 (i.e., 1.5 times the size of the original database)
# In this case we use a Connector instance.
# You can pass the <CONNECTOR_UID> directly; you don't need to fetch the connector upfront.
synth.sample(frac=1.5, write_connector=write_connector)
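For reference, and as the comment above notes, the same call can receive the connector UID directly; a minimal sketch reusing the `synth` object from this example, with the placeholder UID standing in for a real one:

```python
# Equivalent: pass the connector UID instead of a Connector instance
synth.sample(frac=1.5, write_connector='<CONNECTOR_UID>')
```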
2 changes: 1 addition & 1 deletion src/ydata/sdk/connectors/connector.py
@@ -47,7 +47,7 @@ def uid(self) -> UID:
return self._model.uid

@property
def type(self) -> str:
def type(self) -> ConnectorType:
return self._model.type

@staticmethod
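A minimal sketch (not part of this PR) of what the tightened annotation means for callers, assuming `ConnectorType` is importable from `ydata.sdk.connectors`; the member name shown in the comment is an assumption:

```python
from ydata.sdk.connectors import Connector

connector = Connector.get('<CONNECTOR_UID>')  # hypothetical UID

# `type` is now annotated as ConnectorType (an enum) rather than a bare str,
# matching what the underlying model field actually holds.
print(connector.type)        # e.g. ConnectorType.MYSQL (member name is an assumption)
print(connector.type.value)  # the underlying string value
```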
31 changes: 7 additions & 24 deletions src/ydata/sdk/datasources/_models/datasource.py
@@ -1,37 +1,20 @@
from dataclasses import dataclass
from typing import Optional

from ydata.sdk.common.types import UID
from ydata.sdk.datasources._models.datatype import DataSourceType
from ydata.sdk.datasources._models.metadata.metadata import Metadata
from ydata.sdk.datasources._models.status import State, Status
from ydata.sdk.datasources._models.status import Status


@dataclass
class DataSource:

uid: Optional[UID] = None
author: Optional[str] = None
name: Optional[str] = None
datatype: Optional[DataSourceType] = None
metadata: Optional[Metadata] = None
status: Optional[Status] = None
state: Optional[State] = None

def __post_init__(self):
if self.metadata is not None:
self.metadata = Metadata(**self.metadata)

if self.state is not None:
data = {
'validation': self.state.get('validation', {}).get('state', 'unknown'),
'metadata': self.state.get('metadata', {}).get('state', 'unknown'),
'profiling': self.state.get('profiling', {}).get('state', 'unknown')
}
self.state = State.parse_obj(data)

if self.status is not None:
self.status = Status(self.status)
uid: UID | None = None
author: str | None = None
name: str | None = None
datatype: DataSourceType | None = None
metadata: Metadata | None = None
status: Status | None = None

def to_payload(self):
return {}
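A minimal sketch (not part of this PR) of constructing the slimmed-down dataclass directly; the values are hypothetical, and the `X | None` annotations assume Python 3.10+ (consistent with the py310 wheel in the Makefile above):

```python
from ydata.sdk.datasources._models.datasource import DataSource

# Hypothetical values for illustration; every field now defaults to None
ds = DataSource(uid="1234", name="berka", author="me@example.com")
print(ds.status)        # None until populated from the API
print(ds.to_payload())  # currently an empty payload: {}
```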
5 changes: 3 additions & 2 deletions src/ydata/sdk/datasources/_models/status.py
@@ -27,7 +27,7 @@ class ProfilingState(StringEnum):
AVAILABLE = 'available'


class Status(StringEnum):
class State(StringEnum):
"""Represent the status of a [`DataSource`][ydata.sdk.datasources.datasource.DataSource]."""

AVAILABLE = 'available'
@@ -59,7 +59,8 @@ class Status(StringEnum):
"""


class State(BaseModel):
class Status(BaseModel):
state: State
validation: ValidationState
metadata: MetadataState
profiling: ProfilingState
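A minimal sketch (not part of this PR) of building the reworked `Status` model; the `UNKNOWN` members are assumptions, mirroring the `'unknown'` defaults used by the parsing code removed from `datasource.py`:

```python
from ydata.sdk.datasources._models.status import (
    MetadataState, ProfilingState, State, Status, ValidationState)

# Illustrative only; the UNKNOWN members are assumed to exist
status = Status(
    state=State.AVAILABLE,
    validation=ValidationState.UNKNOWN,
    metadata=MetadataState.UNKNOWN,
    profiling=ProfilingState.AVAILABLE,
)
print(status.state)  # State.AVAILABLE
```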
15 changes: 2 additions & 13 deletions src/ydata/sdk/datasources/datasource.py
@@ -13,7 +13,7 @@
from ydata.sdk.datasources._models.datasource_list import DataSourceList
from ydata.sdk.datasources._models.datatype import DataSourceType
from ydata.sdk.datasources._models.metadata.metadata import Metadata
from ydata.sdk.datasources._models.status import Status, ValidationState
from ydata.sdk.datasources._models.status import Status
from ydata.sdk.utils.model_mixin import ModelFactoryMixin
from ydata.sdk.utils.model_utils import filter_dict

@@ -174,20 +174,9 @@ def _wait_for_metadata(datasource):
sleep(BACKOFF)
return datasource

@staticmethod
def _resolve_api_status(api_status: Dict) -> Status:
status = Status(api_status.get('state', Status.UNKNOWN.name))
validation = ValidationState(api_status.get('validation', {}).get(
'state', ValidationState.UNKNOWN.name))
if validation == ValidationState.FAILED:
status = Status.FAILED
return status

@staticmethod
def _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:
data['datatype'] = data.pop('dataType')
data['state'] = data['status']
data['status'] = DataSource._resolve_api_status(data['status'])
data['datatype'] = data.pop('dataType', None)
data = filter_dict(datasource_type, data)
model = datasource_type(**data)
return model
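An illustrative reimplementation (not part of this PR) of the filter-then-construct pattern now used by `_model_from_api`, assuming `filter_dict` from `ydata.sdk.utils.model_utils` simply drops keys that the target dataclass does not declare; the toy dataclass and payload are hypothetical:

```python
from dataclasses import dataclass, fields


def filter_dict_sketch(datacls, data: dict) -> dict:
    # Keep only the keys that match the dataclass's declared fields
    allowed = {f.name for f in fields(datacls)}
    return {k: v for k, v in data.items() if k in allowed}


@dataclass
class ToyDataSource:  # stand-in for the real DataSource model
    uid: str = ""
    datatype: str = ""


# Hypothetical API payload: 'dataType' is renamed, unknown keys are dropped
payload = {"uid": "1234", "dataType": "timeseries", "somethingElse": 42}
payload["datatype"] = payload.pop("dataType", None)
model = ToyDataSource(**filter_dict_sketch(ToyDataSource, payload))
print(model)  # ToyDataSource(uid='1234', datatype='timeseries')
```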
3 changes: 2 additions & 1 deletion src/ydata/sdk/synthesizers/__init__.py
@@ -1,8 +1,9 @@
from ydata.datascience.common import PrivacyLevel
from ydata.sdk.synthesizers._models.synthesizers_list import SynthesizersList
from ydata.sdk.synthesizers.multitable import MultiTableSynthesizer
from ydata.sdk.synthesizers.regular import RegularSynthesizer
from ydata.sdk.synthesizers.synthesizer import BaseSynthesizer as Synthesizer
from ydata.sdk.synthesizers.timeseries import TimeSeriesSynthesizer

__all__ = ["RegularSynthesizer", "TimeSeriesSynthesizer",
"Synthesizer", "SynthesizersList", "PrivacyLevel"]
"Synthesizer", "SynthesizersList", "PrivacyLevel", "MultiTableSynthesizer"]
63 changes: 39 additions & 24 deletions src/ydata/sdk/synthesizers/_models/status.py
@@ -1,49 +1,64 @@
from typing import Generic, TypeVar

from pydantic import BaseModel
from pydantic import BaseModel, Field

from ydata.core.enum import StringEnum

T = TypeVar("T")


class GenericStateErrorStatus(BaseModel, Generic[T]):
state: T
state: T | None = Field(None)

class Config:
use_enum_values = True


class PrepareState(StringEnum):
PREPARING = 'preparing'
DISCOVERING = 'discovering'
FINISHED = 'finished'
FAILED = 'failed'
UNKNOWN = 'unknown'
PREPARING = "preparing"
DISCOVERING = "discovering"
FINISHED = "finished"
FAILED = "failed"


class TrainingState(StringEnum):
PREPARING = 'preparing'
RUNNING = 'running'
FINISHED = 'finished'
FAILED = 'failed'
UNKNOWN = 'unknown'
PREPARING = "preparing"
RUNNING = "running"
FINISHED = "finished"
FAILED = "failed"


class ReportState(StringEnum):
UNKNOWN = 'unknown'
DISCOVERING = 'discovering'
FINISHED = 'finished'
FAILED = 'failed'
PREPARING = "preparing"
GENERATING = "generating"
AVAILABLE = "available"
FAILED = "failed"


PrepareStatus = GenericStateErrorStatus[PrepareState]
TrainingStatus = GenericStateErrorStatus[TrainingState]
ReportStatus = GenericStateErrorStatus[ReportState]


class Status(StringEnum):
NOT_INITIALIZED = 'not initialized'
FAILED = 'failed'
PREPARE = 'prepare'
TRAIN = 'train'
REPORT = 'report' # Should not be here for SDK
READY = 'ready'
UNKNOWN = 'unknown'
class Status(BaseModel):
class State(StringEnum):
NOT_INITIALIZED = 'not initialized'
UNKNOWN = 'unknown'

PREPARE = "prepare"
TRAIN = "train"
REPORT = "report"
READY = "ready"

state: State | None = Field(None)
prepare: PrepareStatus | None = Field(None)
training: TrainingStatus | None = Field(None)
report: ReportStatus | None = Field(None)

@staticmethod
def not_initialized() -> "Status":
return Status(state=Status.State.NOT_INITIALIZED)

@staticmethod
def unknown() -> "Status":
return Status(state=Status.State.UNKNOWN)
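A minimal sketch (not part of this PR) of the reshaped `Status` model in use, built only from the classes defined above; the reported states are hypothetical:

```python
from ydata.sdk.synthesizers._models.status import (
    Status, TrainingState, TrainingStatus)

# A synthesizer mid-training, as the backend might report it
status = Status(state=Status.State.TRAIN,
                training=TrainingStatus(state=TrainingState.RUNNING))

assert status.state == Status.State.TRAIN
assert status.training.state == "running"  # raw value, because use_enum_values is set
assert Status.not_initialized().state == Status.State.NOT_INITIALIZED
```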
15 changes: 7 additions & 8 deletions src/ydata/sdk/synthesizers/_models/synthesizer.py
@@ -1,11 +1,10 @@
from dataclasses import dataclass, field
from typing import Dict, Optional
from pydantic import BaseModel, Field

from .status import Status

@dataclass
class Synthesizer:

uid: Optional[str] = None
author: Optional[str] = None
name: Optional[str] = None
status: Optional[Dict] = field(default_factory=dict)
class Synthesizer(BaseModel):
uid: str | None = None
author: str | None = None
name: str | None = None
status: Status | None = Field(None)
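A minimal sketch (not part of this PR) of the now pydantic-backed `Synthesizer` model; the values are hypothetical:

```python
from ydata.sdk.synthesizers._models.status import Status
from ydata.sdk.synthesizers._models.synthesizer import Synthesizer

# Hypothetical values for illustration
model = Synthesizer(uid="abc-123", name="berka_synth",
                    status=Status.not_initialized())
assert model.status.state == Status.State.NOT_INITIALIZED
```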