Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(synthesizer): Support for MultiTable #81

Merged
merged 11 commits into from
Jan 16, 2024
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,16 @@ package: ### Builds the package in wheel format
echo "$(version)" > src/ydata/sdk/VERSION
stubgen src/ydata/sdk -o src --export-less
$(PYTHON) -m build --wheel
twine check dist/*
$(PYTHON) -m twine check dist/*

wheel: ### Compiles the wheel
test -d wheels || mkdir -p wheels
cp dist/ydata_sdk-$(version)-py3-none-any.whl wheels/ydata_sdk-$(version)-py$(PYV)-none-any.whl
$(PYTHON) -m pyc_wheel wheels/ydata_sdk-$(version)-py$(PYV)-none-any.whl
twine check wheels/*
$(PYTHON) -m twine check wheels/*

upload:
$(PYTHON) -m twine upload -r ydata wheels/ydata_sdk-$(version)-py$(PYV)-none-any.whl

publish-docs: ### Publishes the documentation
mike deploy --push --update-aliases $(version) latest
17 changes: 17 additions & 0 deletions docs/examples/synthesizer_multitable.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Synthesize Multi Table

**Use YData's *MultiTableSynthesizer* to generate multi table synthetic data from multiple RDBMS tables**

Multi-table synthesis generates synthetic data from multiple tables of a database while keeping the relations between them in mind.

Quickstart example:

```python
--8<-- "examples/synthesizers/multi_table_quickstart.py"
```

Sample write connector overriding example:

```python
--8<-- "examples/synthesizers/multi_table_sample_write_override.py"
```
1 change: 1 addition & 0 deletions docs/sdk/reference/api/synthesizers/multitable.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: ydata.sdk.synthesizers.multitable.MultiTableSynthesizer
25 changes: 25 additions & 0 deletions examples/synthesizers/multi_table_quickstart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Quickstart: train a MultiTableSynthesizer and request a synthetic sample.

The sample is not returned as a Dataset; it is written to the database that
the write connector refers to.
"""
import os

from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Do not forget to add your token as an environment variable
os.environ["YDATA_TOKEN"] = '<TOKEN>'  # Remove if already defined

# In this example, we demonstrate how to train a synthesizer from an existing multi table RDBMS datasource.
# After training a Multi Table Synthesizer, we request a sample.
# In this case, we don't return the Dataset for the sample, it will be saved in the database
# that the connector refers to.

X = DataSource.get('<DATASOURCE_UID>')

# Initialize a multi table synthesizer with the connector to write to
# As long as the synthesizer does not call `fit`, it exists only locally
# write_connector can be a UID or a Connector instance
synth = MultiTableSynthesizer(write_connector='<CONNECTOR_UID>')

# The synthesizer training is requested
synth.fit(X)

# We request a synthetic dataset with a fraction of 1.5
synth.sample(frac=1.5)
32 changes: 32 additions & 0 deletions examples/synthesizers/multi_table_sample_write_override.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os

from ydata.sdk.connectors import Connector
from ydata.sdk.datasources import DataSource
from ydata.sdk.synthesizers import MultiTableSynthesizer

# Do not forget to add your token as an environment variable
os.environ["YDATA_TOKEN"] = '<TOKEN>' # Remove if already defined

# In this example, we demonstrate how to train a synthesizer from an existing multi table RDBMS datasource.
# After training a Multi Table Synthesizer, we request a sample.
# In this case, we don't return the Dataset for the sample, it will be saved in the database
# that the connector refers to.

X = DataSource.get('<DATASOURCE_UID>')

# For demonstration purposes, we will use a connector instance, but you can just send the UID

write_connector = Connector.get('<CONNECTOR_UID>')

# Initialize a multi table synthesizer with the connector to write to
# As long as the synthesizer does not call `fit`, it exists only locally
# write_connector can be a UID or a Connector instance
synth = MultiTableSynthesizer(write_connector=write_connector)

# The synthesizer training is requested
synth.fit(X)

# We request a synthetic dataset with a fraction of 1.5
# In this case we pass a Connector instance to `sample` as the write connector.
# You can also pass the <CONNECTOR_UID> directly; you don't need to get the connector upfront.
synth.sample(frac=1.5, write_connector=write_connector)
18 changes: 12 additions & 6 deletions src/ydata/sdk/common/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,10 @@ def __init__(self, credentials: Optional[Union[str, Dict]] = None, project: Opti
if set_as_global:
self.__set_global()

def post(self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,
project: Project | None = None, files: Optional[Dict] = None, raise_for_status: bool = True) -> Response:
def post(
self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict] = None,
project: Optional[Project] = None, files: Optional[Dict] = None, raise_for_status: bool = True
) -> Response:
"""POST request to the backend.

Args:
Expand All @@ -83,8 +85,10 @@ def post(self, endpoint: str, data: Optional[Dict] = None, json: Optional[Dict]

return response

def get(self, endpoint: str, params: Optional[Dict] = None,
project: Project | None = None, cookies: Optional[Dict] = None, raise_for_status: bool = True) -> Response:
def get(
self, endpoint: str, params: Optional[Dict] = None, project: Optional[Project] = None,
cookies: Optional[Dict] = None, raise_for_status: bool = True
) -> Response:
"""GET request to the backend.

Args:
Expand All @@ -104,7 +108,9 @@ def get(self, endpoint: str, params: Optional[Dict] = None,

return response

def get_static_file(self, endpoint: str, project: Project | None = None, raise_for_status: bool = True) -> Response:
def get_static_file(
self, endpoint: str, project: Optional[Project] = None, raise_for_status: bool = True
) -> Response:
"""Retrieve a static file from the backend.

Args:
Expand Down Expand Up @@ -141,7 +147,7 @@ def _get_default_project(self, token: str):
return data['myWorkspace']

def __build_url(self, endpoint: str, params: Optional[Dict] = None, data: Optional[Dict] = None,
json: Optional[Dict] = None, project: Project | None = None, files: Optional[Dict] = None,
json: Optional[Dict] = None, project: Optional[Project] = None, files: Optional[Dict] = None,
cookies: Optional[Dict] = None) -> Dict:
"""Build a request for the backend.

Expand Down
2 changes: 1 addition & 1 deletion src/ydata/sdk/connectors/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def uid(self) -> UID:
return self._model.uid

@property
def type(self) -> str:
def type(self) -> ConnectorType:
return self._model.type

@staticmethod
Expand Down
19 changes: 1 addition & 18 deletions src/ydata/sdk/datasources/_models/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,17 @@
from ydata.sdk.common.types import UID
from ydata.sdk.datasources._models.datatype import DataSourceType
from ydata.sdk.datasources._models.metadata.metadata import Metadata
from ydata.sdk.datasources._models.status import State, Status
from ydata.sdk.datasources._models.status import Status


@dataclass
class DataSource:

uid: Optional[UID] = None
author: Optional[str] = None
name: Optional[str] = None
datatype: Optional[DataSourceType] = None
metadata: Optional[Metadata] = None
status: Optional[Status] = None
state: Optional[State] = None

def __post_init__(self):
if self.metadata is not None:
self.metadata = Metadata(**self.metadata)

if self.state is not None:
data = {
'validation': self.state.get('validation', {}).get('state', 'unknown'),
'metadata': self.state.get('metadata', {}).get('state', 'unknown'),
'profiling': self.state.get('profiling', {}).get('state', 'unknown')
}
self.state = State.parse_obj(data)

if self.status is not None:
self.status = Status(self.status)

def to_payload(self):
return {}
5 changes: 3 additions & 2 deletions src/ydata/sdk/datasources/_models/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ProfilingState(StringEnum):
AVAILABLE = 'available'


class Status(StringEnum):
class State(StringEnum):
"""Represent the status of a [`DataSource`][ydata.sdk.datasources.datasource.DataSource]."""

AVAILABLE = 'available'
Expand Down Expand Up @@ -59,7 +59,8 @@ class Status(StringEnum):
"""


class State(BaseModel):
class Status(BaseModel):
state: State
validation: ValidationState
metadata: MetadataState
profiling: ProfilingState
15 changes: 2 additions & 13 deletions src/ydata/sdk/datasources/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from ydata.sdk.datasources._models.datasource_list import DataSourceList
from ydata.sdk.datasources._models.datatype import DataSourceType
from ydata.sdk.datasources._models.metadata.metadata import Metadata
from ydata.sdk.datasources._models.status import Status, ValidationState
from ydata.sdk.datasources._models.status import Status
from ydata.sdk.utils.model_mixin import ModelFactoryMixin
from ydata.sdk.utils.model_utils import filter_dict

Expand Down Expand Up @@ -174,20 +174,9 @@ def _wait_for_metadata(datasource):
sleep(BACKOFF)
return datasource

@staticmethod
def _resolve_api_status(api_status: Dict) -> Status:
status = Status(api_status.get('state', Status.UNKNOWN.name))
validation = ValidationState(api_status.get('validation', {}).get(
'state', ValidationState.UNKNOWN.name))
if validation == ValidationState.FAILED:
status = Status.FAILED
return status

@staticmethod
def _model_from_api(data: Dict, datasource_type: Type[mDataSource]) -> mDataSource:
data['datatype'] = data.pop('dataType')
data['state'] = data['status']
data['status'] = DataSource._resolve_api_status(data['status'])
data['datatype'] = data.pop('dataType', None)
data = filter_dict(datasource_type, data)
model = datasource_type(**data)
return model
Expand Down
3 changes: 2 additions & 1 deletion src/ydata/sdk/synthesizers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from ydata.datascience.common import PrivacyLevel
from ydata.sdk.synthesizers._models.synthesizers_list import SynthesizersList
from ydata.sdk.synthesizers.multitable import MultiTableSynthesizer
from ydata.sdk.synthesizers.regular import RegularSynthesizer
from ydata.sdk.synthesizers.synthesizer import BaseSynthesizer as Synthesizer
from ydata.sdk.synthesizers.timeseries import TimeSeriesSynthesizer

__all__ = ["RegularSynthesizer", "TimeSeriesSynthesizer",
"Synthesizer", "SynthesizersList", "PrivacyLevel"]
"Synthesizer", "SynthesizersList", "PrivacyLevel", "MultiTableSynthesizer"]
65 changes: 40 additions & 25 deletions src/ydata/sdk/synthesizers/_models/status.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,64 @@
from typing import Generic, TypeVar
from typing import Generic, Optional, TypeVar

from pydantic import BaseModel
from pydantic import BaseModel, Field

from ydata.core.enum import StringEnum

T = TypeVar("T")


class GenericStateErrorStatus(BaseModel, Generic[T]):
state: T
state: Optional[T] = Field(None)

class Config:
use_enum_values = True


class PrepareState(StringEnum):
PREPARING = 'preparing'
DISCOVERING = 'discovering'
FINISHED = 'finished'
FAILED = 'failed'
UNKNOWN = 'unknown'
PREPARING = "preparing"
DISCOVERING = "discovering"
FINISHED = "finished"
FAILED = "failed"


class TrainingState(StringEnum):
PREPARING = 'preparing'
RUNNING = 'running'
FINISHED = 'finished'
FAILED = 'failed'
UNKNOWN = 'unknown'
PREPARING = "preparing"
RUNNING = "running"
FINISHED = "finished"
FAILED = "failed"


class ReportState(StringEnum):
UNKNOWN = 'unknown'
DISCOVERING = 'discovering'
FINISHED = 'finished'
FAILED = 'failed'
PREPARING = "preparing"
GENERATING = "generating"
AVAILABLE = "available"
FAILED = "failed"


PrepareStatus = GenericStateErrorStatus[PrepareState]
TrainingStatus = GenericStateErrorStatus[TrainingState]
ReportStatus = GenericStateErrorStatus[ReportState]


class Status(StringEnum):
NOT_INITIALIZED = 'not initialized'
FAILED = 'failed'
PREPARE = 'prepare'
TRAIN = 'train'
REPORT = 'report' # Should not be here for SDK
READY = 'ready'
UNKNOWN = 'unknown'
class Status(BaseModel):
class State(StringEnum):
NOT_INITIALIZED = 'not initialized'
UNKNOWN = 'unknown'

PREPARE = "prepare"
TRAIN = "train"
REPORT = "report"
READY = "ready"

state: Optional[State] = Field(None)
prepare: Optional[PrepareStatus] = Field(None)
training: Optional[TrainingStatus] = Field(None)
report: Optional[ReportStatus] = Field(None)

@staticmethod
def not_initialized() -> "Status":
return Status(state=Status.State.NOT_INITIALIZED)

@staticmethod
def unknown() -> "Status":
return Status(state=Status.State.UNKNOWN)
17 changes: 9 additions & 8 deletions src/ydata/sdk/synthesizers/_models/synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from dataclasses import dataclass, field
from typing import Dict, Optional
from typing import Optional

from pydantic import BaseModel, Field

@dataclass
class Synthesizer:
from .status import Status

uid: Optional[str] = None
author: Optional[str] = None
name: Optional[str] = None
status: Optional[Dict] = field(default_factory=dict)

class Synthesizer(BaseModel):
uid: Optional[str] = Field(None)
author: Optional[str] = Field(None)
name: Optional[str] = Field(None)
status: Optional[Status] = Field(None)
Loading