Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update repr_llm and add DataFrameSummarizer with customizable summarizing function #323

Merged
merged 5 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ mkdocstrings = { version = ">=0.19,<0.22", optional = true }
mkdocstrings-python = { version = ">=0.7.1,<0.10.0", optional = true }
duckdb-engine = "^0.9.2"
exceptiongroup = "^1.0.4"
repr-llm = "^0.2.1"
repr-llm = "^0.3.0"
structlog = "^23.2.0"

[tool.poetry.group.dev.dependencies]
Expand Down
4 changes: 2 additions & 2 deletions src/dx/formatters/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display as ipydisplay
from pandas.io.json import build_table_schema
from repr_llm.pandas import summarize_dataframe

from dx.formatters.summarizing import make_df_summary
from dx.sampling import get_column_string_lengths, get_df_dimensions, sample_if_too_big
from dx.settings import get_settings
from dx.types.main import DXDisplayMode
Expand Down Expand Up @@ -216,7 +216,7 @@ def format_output(
# add additional payload for LLM consumption; if any parsing/summarizing errors occur, we
# shouldn't block displaying the bundle
try:
payload["text/llm+plain"] = summarize_dataframe(df)
payload["text/llm+plain"] = make_df_summary(df)
except Exception as e:
logger.debug(f"Error in summarize_dataframe: {e}")

Expand Down
54 changes: 54 additions & 0 deletions src/dx/formatters/summarizing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import Callable, Optional

import pandas as pd


class DataFrameSummarizer:
_instance: "DataFrameSummarizer" = None
summarizing_func: Optional[Callable] = None

def __init__(self, summarizing_func: Optional[Callable] = None):
Copy link
Collaborator

@rgbkrk rgbkrk Nov 2, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If they set it here, shouldn't we skip loading repr_llm's version?

The summarizing_func passed in is likely overridden by the call within _load_repr_llm

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point - updating

if summarizing_func is None:
self._try_to_load_repr_llm()
else:
self.summarizing_func = summarizing_func

def _try_to_load_repr_llm(self) -> None:
"""Load repr_llm's summarize_dataframe into the summarizing_func if it's available."""
try:
from repr_llm.pandas import summarize_dataframe

self.summarizing_func = summarize_dataframe
except ImportError:
return

@classmethod
def instance(cls) -> "DataFrameSummarizer":
if cls._instance is None:
cls._instance = cls()
return cls._instance

def summarize(self, df: pd.DataFrame) -> str:
    """Generate a summary of a dataframe using the configured summarizing_func.

    Args:
        df: The dataframe to summarize.

    Returns:
        The result of ``self.summarizing_func(df)`` when a function is configured;
        otherwise a plain-text rendering of ``df.describe()``. For a dataframe
        with no columns, the plain-text rendering of the dataframe itself.

    Raises:
        ValueError: If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("`df` must be a pandas DataFrame")

    if self.summarizing_func is not None:
        return self.summarizing_func(df)

    # `DataFrame.describe()` raises ValueError on a frame with no columns, so
    # fall back to the frame's own text rendering instead of failing.
    if df.columns.empty:
        return df.to_string()

    return df.describe().to_string()


def get_summarizing_function() -> Optional[Callable]:
    """Return the summarizing function held by the shared DataFrameSummarizer.

    The result is ``None`` when no function is configured on that instance.
    """
    summarizer = DataFrameSummarizer.instance()
    return summarizer.summarizing_func


def set_summarizing_function(func: Callable) -> None:
    """Install ``func`` as the summarizing function on the shared DataFrameSummarizer."""
    summarizer = DataFrameSummarizer.instance()
    summarizer.summarizing_func = func


def make_df_summary(df: pd.DataFrame) -> str:
    """Summarize ``df`` by delegating to the shared DataFrameSummarizer's ``summarize``."""
    summarizer = DataFrameSummarizer.instance()
    summary = summarizer.summarize(df)
    return summary
Loading