Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
529 changes: 462 additions & 67 deletions sdv/datasets/demo.py

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions sdv/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,20 @@ class RefitWarning(UserWarning):

class SynthesizerProcessingError(Exception):
    """Error to raise when the parameters given to a synthesizer are invalid."""


class DemoResourceNotFoundError(Exception):
    """Error to raise when a demo dataset or one of its resources cannot be found.

    Intended for missing demo assets such as the dataset archive, metadata,
    license, README, or other auxiliary files in the demo bucket.
    """


class DemoResourceNotFoundWarning(UserWarning):
    """Warning to emit when an optional demo resource is not available.

    Signals that a non-critical artifact (e.g., README or SOURCE information)
    is not present for a given demo dataset. The operation can continue, but
    the requested information cannot be provided.
    """
264 changes: 79 additions & 185 deletions tests/integration/datasets/test_demo.py
Original file line number Diff line number Diff line change
@@ -1,220 +1,114 @@
import pandas as pd
from pandas.api.types import is_integer_dtype

from sdv.datasets.demo import get_available_demos
from sdv.datasets.demo import get_available_demos, get_readme, get_source


def test_get_available_demos_single_table():
    """Test single_table demos listing equals the expected filtered list and values."""
    # Run
    tables_info = get_available_demos('single_table')

    # Drop entries whose name starts with 'bad_' or 'dataset' before comparing.
    # NOTE(review): presumably these are test fixtures living in the demo bucket -- confirm.
    mask = ~(
        tables_info['dataset_name'].str.startswith('bad_')
        | tables_info['dataset_name'].str.startswith('dataset')
    )
    tables_info = tables_info[mask].reset_index(drop=True)

    # Assert
    expected = pd.DataFrame({
        'dataset_name': [
            'adult',
            'alarm',
            'census',
            'census_extended',
            'child',
            'covtype',
            'expedia_hotel_logs',
            'fake_companies',
            'fake_hotel_guests',
            'insurance',
            'intrusion',
            'news',
            'student_placements',
            'student_placements_pii',
        ],
        'size_MB': [
            3.91,
            4.52,
            98.17,
            4.95,
            3.20,
            255.65,
            0.20,
            0.00,
            0.03,
            3.34,
            162.04,
            18.71,
            0.03,
            0.03,
        ],
        # Every single-table demo holds exactly one table.
        'num_tables': [1] * 14,
    })
    pd.testing.assert_frame_equal(tables_info[['dataset_name', 'size_MB', 'num_tables']], expected)


def test_get_available_demos_multi_table():
    """Test multi_table demos listing is returned with expected columns and types."""
    # Run
    tables_info = get_available_demos('multi_table')

    # Assert
    expected = pd.DataFrame({
        'dataset_name': [
            'fake_hotels',
            'fake_hotels_extended',
        ],
        'size_MB': [
            0.05,
            0.07,
        ],
        'num_tables': [2, 2],
    })
    pd.testing.assert_frame_equal(tables_info[['dataset_name', 'size_MB', 'num_tables']], expected)


def test_get_readme_and_source_single_table_dataset1(tmp_path):
    """Test it returns the README and SOURCE for a single table dataset.

    Also checks that passing an output path as the third argument returns the
    same text and writes it to that file.
    """
    # Run
    readme = get_readme('single_table', 'dataset1')
    source = get_source('single_table', 'dataset1')

    # Assert
    assert isinstance(readme, str)
    assert 'sample dataset' in readme.lower()
    assert isinstance(source, str)
    assert source.strip() == 'unknown'

    # Run again, this time asking for the contents to be written to disk
    readme_out = tmp_path / 'r.txt'
    source_out = tmp_path / 's.txt'
    readme2 = get_readme('single_table', 'dataset1', str(readme_out))
    source2 = get_source('single_table', 'dataset1', str(source_out))

    # Assert the returned text is unchanged and matches the written files
    assert readme2 == readme
    assert source2 == source
    assert readme_out.read_text(encoding='utf-8').strip() == readme.strip()
    assert source_out.read_text(encoding='utf-8').strip() == source.strip()


def test_get_readme_missing_returns_none():
    """Test it returns None when the README/SOURCE is missing."""
    # Run and Assert
    assert get_readme('single_table', 'dataset2') is None
    assert get_source('single_table', 'dataset2') is None
Loading
Loading