serialization of columns added into the definition of the table #1715

Open · wants to merge 2 commits into base: main
20 changes: 19 additions & 1 deletion CONTRIBUTING.md
@@ -65,7 +65,25 @@ make spell_fix
 We use `pytest` to test our code. You can run the tests by running the following command:
 
 ```bash
-make tests
+make test_all
 ```
 
+If you prefer, you can run only the core tests with the command:
+
+```bash
+make test_core
+```
+
+or only the extension tests with the command:
+
+```bash
+make test_extensions
+```
+
+You can also run the tests with coverage by running the following command:
+
+```bash
+make test-coverage
+```
+
 Make sure that all tests pass before submitting a pull request.
139 changes: 66 additions & 73 deletions pandasai/data_loader/semantic_layer_schema.py
@@ -1,6 +1,6 @@
 import re
 from functools import partial
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List
 
 import yaml
 from pydantic import (
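
For context on this import change: PEP 604's `X | None` syntax used throughout the file below is runtime-equivalent to `typing.Optional[X]`, so dropping `Optional` and `Union` is purely cosmetic, though it does require Python 3.10+ for the annotations to evaluate. A minimal check of that equivalence:

```python
from typing import Optional, Union

# PEP 604 guarantees these compare equal at runtime (Python 3.10+),
# so the annotation swap in this file is a no-op for Pydantic validation.
assert (str | None) == Optional[str] == Union[str, None]
```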
@@ -45,114 +45,107 @@ def __eq__(self, other):

 class Column(BaseModel):
     name: str = Field(..., description="Name of the column.")
-    type: Optional[str] = Field(None, description="Data type of the column.")
-    description: Optional[str] = Field(None, description="Description of the column")
-    expression: Optional[str] = Field(
-        None, description="Aggregation expression (avg, min, max, sum)"
-    )
-    alias: Optional[str] = Field(None, description="Alias for the column")
+    type: str | None = Field(None, description="Data type of the column.")
+    description: str | None = Field(None, description="Description of the column")
+    expression: str | None = Field(None, description="Aggregation expression (avg, min, max, sum)")
+    alias: str | None = Field(None, description="Alias for the column")
 
     @field_validator("type")
     @classmethod
-    def is_column_type_supported(cls, type: str) -> str:
-        if type and type not in VALID_COLUMN_TYPES:
+    def is_column_type_supported(cls, v: str) -> str:
+        if v and v not in VALID_COLUMN_TYPES:
             raise ValueError(
-                f"Unsupported column type: {type}. Supported types are: {VALID_COLUMN_TYPES}"
+                f"Unsupported column type: {v}. Supported types are: {VALID_COLUMN_TYPES}"
             )
-        return type
+        return v
 
     @field_validator("expression")
     @classmethod
-    def is_expression_valid(cls, expr: str) -> str:
-        try:
-            parse_one(expr)
-            return expr
-        except ParseError as e:
-            raise ValueError(f"Invalid SQL expression: {expr}. Error: {str(e)}")
+    def is_expression_valid(cls, v: str) -> str | None:
+        if v is not None:
+            try:
+                parse_one(v)
+                return v
+            except ParseError as e:
+                raise ValueError(f"Invalid SQL expression: {v}. Error: {str(e)}")
 
 
 class Relation(BaseModel):
-    name: Optional[str] = Field(None, description="Name of the relationship.")
-    description: Optional[str] = Field(
-        None, description="Description of the relationship."
-    )
-    from_: str = Field(
-        ..., alias="from", description="Source column for the relationship."
-    )
+    name: str | None = Field(None, description="Name of the relationship.")
+    description: str | None = Field(None, description="Description of the relationship.")
+    from_: str = Field(..., alias="from", description="Source column for the relationship.")
     to: str = Field(..., description="Target column for the relationship.")
 
 
 class TransformationParams(BaseModel):
-    column: Optional[str] = Field(None, description="Column to transform")
-    value: Optional[Union[str, int, float, bool]] = Field(
+    column: str | None = Field(None, description="Column to transform")
+    value: str | int | float | bool | None = Field(
         None, description="Value for fill_na and other transformations"
     )
-    mapping: Optional[Dict[str, str]] = Field(
+    mapping: Dict[str, str] | None = Field(
         None, description="Mapping dictionary for map_values transformation"
     )
-    format: Optional[str] = Field(None, description="Format string for date formatting")
-    decimals: Optional[int] = Field(
+    format: str | None = Field(None, description="Format string for date formatting")
+    decimals: int | None = Field(
         None, description="Number of decimal places for rounding"
     )
-    factor: Optional[Union[int, float]] = Field(None, description="Scaling factor")
-    to_tz: Optional[str] = Field(None, description="Target timezone or format")
-    from_tz: Optional[str] = Field(None, description="From timezone or format")
-    errors: Optional[str] = Field(
+    factor: int | float | None = Field(None, description="Scaling factor")
+    to_tz: str | None = Field(None, description="Target timezone or format")
+    from_tz: str | None = Field(None, description="From timezone or format")
+    errors: str | None = Field(
         None, description="Error handling mode for numeric/datetime conversion"
     )
-    old_value: Optional[Any] = Field(
+    old_value: Any | None = Field(
         None, description="Old value for replace transformation"
     )
-    new_value: Optional[Any] = Field(
+    new_value: Any | None = Field(
         None, description="New value for replace transformation"
    )
-    new_name: Optional[str] = Field(
+    new_name: str | None = Field(
         None, description="New name for column in rename transformation"
     )
-    pattern: Optional[str] = Field(
+    pattern: str | None = Field(
         None, description="Pattern for extract transformation"
     )
-    length: Optional[int] = Field(
+    length: int | None = Field(
         None, description="Length for truncate transformation"
     )
-    add_ellipsis: Optional[bool] = Field(
+    add_ellipsis: bool | None = Field(
         True, description="Whether to add ellipsis in truncate"
     )
-    width: Optional[int] = Field(None, description="Width for pad transformation")
-    side: Optional[str] = Field("left", description="Side for pad transformation")
-    pad_char: Optional[str] = Field(" ", description="Character for pad transformation")
-    lower: Optional[Union[int, float]] = Field(None, description="Lower bound for clip")
-    upper: Optional[Union[int, float]] = Field(None, description="Upper bound for clip")
-    bins: Optional[Union[int, List[Union[int, float]]]] = Field(
+    width: int | None = Field(None, description="Width for pad transformation")
+    side: str | None = Field("left", description="Side for pad transformation")
+    pad_char: str | None = Field(" ", description="Character for pad transformation")
+    lower: int | float | None = Field(None, description="Lower bound for clip")
+    upper: int | float | None = Field(None, description="Upper bound for clip")
+    bins: int | List[int | float] | None = Field(
         None, description="Bins for binning"
     )
-    labels: Optional[List[str]] = Field(None, description="Labels for bins")
-    drop_first: Optional[bool] = Field(
+    labels: List[str] | None = Field(None, description="Labels for bins")
+    drop_first: bool | None = Field(
         True, description="Whether to drop first category in encoding"
     )
-    drop_invalid: Optional[bool] = Field(
+    drop_invalid: bool | None = Field(
         False, description="Whether to drop invalid values"
     )
-    start_date: Optional[str] = Field(
+    start_date: str | None = Field(
         None, description="Start date for date range validation"
     )
-    end_date: Optional[str] = Field(
+    end_date: str | None = Field(
         None, description="End date for date range validation"
     )
-    country_code: Optional[str] = Field(
+    country_code: str | None = Field(
         "+1", description="Country code for phone normalization"
     )
-    columns: Optional[List[str]] = Field(
+    columns: List[str] | None = Field(
         None, description="List of columns for multi-column operations"
     )
-    keep: Optional[str] = Field("first", description="Which duplicates to keep")
-    ref_table: Optional[Any] = Field(
+    keep: str | None = Field("first", description="Which duplicates to keep")
+    ref_table: Any | None = Field(
         None, description="Reference DataFrame for foreign key validation"
     )
-    ref_column: Optional[str] = Field(
+    ref_column: str | None = Field(
         None, description="Reference column for foreign key validation"
     )
-    drop_negative: Optional[bool] = Field(
+    drop_negative: bool | None = Field(
         False, description="Whether to drop negative values"
     )
 
@@ -172,7 +165,7 @@ def validate_required_params(cls, values: dict) -> dict:

 class Transformation(BaseModel):
     type: str = Field(..., description="Type of transformation to be applied.")
-    params: Optional[TransformationParams] = Field(
+    params: TransformationParams | None = Field(
         None, description="Parameters for the transformation."
     )
 
@@ -195,11 +188,11 @@ def set_transform_type(cls, values: dict) -> dict:

 class Source(BaseModel):
     type: str = Field(..., description="Type of the data source.")
-    path: Optional[str] = Field(None, description="Path of the local data source.")
-    connection: Optional[SQLConnectionConfig] = Field(
+    path: str | None = Field(None, description="Path of the local data source.")
+    connection: SQLConnectionConfig | None = Field(
         None, description="Connection object of the data source."
     )
-    table: Optional[str] = Field(None, description="Table of the data source.")
+    table: str | None = Field(None, description="Table of the data source.")
 
     def is_compatible_source(self, source2: "Source"):
         """
@@ -267,33 +260,33 @@ def is_format_supported(cls, format: str) -> str:

 class SemanticLayerSchema(BaseModel):
     name: str = Field(..., description="Dataset name.")
-    source: Optional[Source] = Field(None, description="Data source for your dataset.")
-    view: Optional[bool] = Field(None, description="Whether table is a view")
-    description: Optional[str] = Field(
+    source: Source | None = Field(None, description="Data source for your dataset.")
+    view: bool | None = Field(None, description="Whether table is a view")
+    description: str | None = Field(
         None, description="Dataset’s contents and purpose description."
     )
-    columns: Optional[List[Column]] = Field(
+    columns: List[Column] | None = Field(
         None, description="Structure and metadata of your dataset’s columns"
     )
-    relations: Optional[List[Relation]] = Field(
+    relations: List[Relation] | None = Field(
         None, description="Relationships between columns and tables."
     )
-    order_by: Optional[List[str]] = Field(
+    order_by: List[str] | None = Field(
         None, description="Ordering criteria for the dataset."
     )
-    limit: Optional[int] = Field(
+    limit: int | None = Field(
         None, description="Maximum number of records to retrieve."
     )
-    transformations: Optional[List[Transformation]] = Field(
+    transformations: List[Transformation] | None = Field(
         None, description="List of transformations to apply to the data."
     )
-    destination: Optional[Destination] = Field(
+    destination: Destination | None = Field(
         None, description="Destination for saving the dataset."
     )
-    update_frequency: Optional[str] = Field(
+    update_frequency: str | None = Field(
         None, description="Frequency of dataset updates."
     )
-    group_by: Optional[List[str]] = Field(
+    group_by: List[str] | None = Field(
         None,
         description="List of columns to group by. Every non-aggregated column must be included in group_by.",
     )
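
A quick illustration of the validator pattern used throughout this file may help. In Pydantic v2 a `field_validator` still runs when a caller passes `None` explicitly, which is presumably why `is_expression_valid` gained its `if v is not None` guard before calling `parse_one`. Below is a minimal sketch under that assumption; the field names mirror the diff, but the valid-type list is an illustrative stand-in for `VALID_COLUMN_TYPES`:

```python
from pydantic import BaseModel, Field, ValidationError, field_validator

VALID_COLUMN_TYPES = ["string", "integer", "float", "boolean", "datetime"]  # illustrative stand-in

class Column(BaseModel):
    name: str = Field(..., description="Name of the column.")
    type: str | None = Field(None, description="Data type of the column.")

    @field_validator("type")
    @classmethod
    def is_column_type_supported(cls, v: str | None) -> str | None:
        # `if v` lets None through; only a non-None unknown type is rejected.
        if v and v not in VALID_COLUMN_TYPES:
            raise ValueError(f"Unsupported column type: {v}")
        return v

Column(name="price", type="float")   # ok
Column(name="price", type=None)      # ok: validator runs, None passes
try:
    Column(name="price", type="uuid")
except ValidationError as exc:
    print(exc)  # 1 validation error ... Unsupported column type: uuid
```

Note that `is_expression_valid` now implicitly returns `None` when `v` is `None`, which matches the field's `str | None` annotation.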
4 changes: 4 additions & 0 deletions pandasai/helpers/dataframe_serializer.py
@@ -28,6 +28,10 @@ def serialize(cls, df: "DataFrame", dialect: str = "postgres") -> str:
         if df.schema.description is not None:
             dataframe_info += f' description="{df.schema.description}"'
 
+        if df.schema.columns:
+            columns = [column.model_dump() for column in df.schema.columns]
+            dataframe_info += f' columns="{json.dumps(columns, ensure_ascii=False)}"'
+
         dataframe_info += f' dimensions="{df.rows_count}x{df.columns_count}">'
 
         # Truncate long values
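
To make the serializer change concrete, here is a rough sketch of the attribute the new branch appends. The `<table` prefix and the column dict are illustrative (the real prefix is built earlier in `serialize()`), with the dict shaped like `Column.model_dump()` output:

```python
import json

# Illustrative result of column.model_dump() for a single schema column.
columns = [
    {"name": "price", "type": "float", "description": None,
     "expression": None, "alias": None},
]

dataframe_info = "<table"  # hypothetical opening built earlier in serialize()
dataframe_info += f' columns="{json.dumps(columns, ensure_ascii=False)}"'
dataframe_info += ' dimensions="100x5">'
print(dataframe_info)
# <table columns="[{"name": "price", "type": "float", ...}]" dimensions="100x5">
# Note: json.dumps emits double quotes, which land unescaped inside the
# double-quoted columns="..." attribute.
```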