diff --git a/TODO.md b/TODO.md index 960c23b..0834fb0 100644 --- a/TODO.md +++ b/TODO.md @@ -32,16 +32,16 @@ After completing a milestone, create a pull request with your changes for review ## PR3: Exploratory Data Analysis -- [ ] Create summary statistics generator -- [ ] Implement data quality assessment -- [ ] Create correlation analysis functionality -- [ ] Add distribution analysis for numeric variables -- [ ] Implement categorical variable analysis -- [ ] Add missing value visualization -- [ ] Create data profile report generator -- [ ] Implement data insights summary -- [ ] Write tests for all EDA functions -- [ ] Create test cases with different data types and edge cases +- [x] Create summary statistics generator +- [x] Implement data quality assessment +- [x] Create correlation analysis functionality +- [x] Add distribution analysis for numeric variables +- [x] Implement categorical variable analysis +- [x] Add missing value visualization +- [x] Create data profile report generator +- [x] Implement data insights summary +- [x] Write tests for all EDA functions +- [x] Create test cases with different data types and edge cases ## PR4: Data Visualization Module diff --git a/app.py b/app.py index 8249047..d5f168c 100644 --- a/app.py +++ b/app.py @@ -3,6 +3,7 @@ import streamlit as st from utils import config from utils import data as data_utils +from utils import eda st.set_page_config(page_title="PredictStream", layout="wide") @@ -53,10 +54,22 @@ def main() -> None: end = start + page_size st.dataframe(data.iloc[start:end]) - st.subheader("Data Summary") - summary = data_utils.data_summary(data) + st.subheader("Summary Statistics") + summary = eda.summary_statistics(data) st.dataframe(summary) + st.subheader("Data Quality") + quality = eda.data_quality_assessment(data) + st.dataframe(quality) + + st.subheader("Correlation Matrix") + corr = eda.correlation_matrix(data) + st.dataframe(corr) + + st.subheader("Insights") + for insight in 
"""Tests for the exploratory data analysis helpers in ``utils.eda``."""

import pandas as pd
from utils import eda


def sample_df():
    """Build the shared fixture.

    Two identical numeric columns (so their correlation is perfect) and one
    string column containing a single missing value.
    """
    return pd.DataFrame({
        'num1': [1, 2, 3, 4, 5],
        'num2': [1, 2, 3, 4, 5],
        'cat': ['a', 'b', 'a', None, 'b'],
    })


def test_summary_statistics():
    """Numeric and non-numeric columns both appear in the summary."""
    df = sample_df()
    summary = eda.summary_statistics(df)
    assert 'num1' in summary.columns
    assert 'cat' in summary.columns


def test_data_quality_assessment():
    """Missing counts, percentages and unique counts are reported per column."""
    df = sample_df()
    quality = eda.data_quality_assessment(df)
    assert quality.loc['cat', 'missing'] == 1
    assert quality.loc['num1', 'missing'] == 0
    # 1 missing value out of 5 rows.
    assert abs(quality.loc['cat', 'missing_percent'] - 20.0) < 1e-9
    # 'a', 'b' and the missing value: nunique is called with dropna=False.
    assert quality.loc['cat', 'unique'] == 3


def test_correlation_matrix():
    """Identical numeric columns correlate perfectly; text columns are excluded."""
    df = sample_df()
    corr = eda.correlation_matrix(df)
    # The coefficient is a computed float, so compare with a tolerance
    # instead of exact equality.
    assert abs(corr.loc['num1', 'num2'] - 1.0) < 1e-9
    assert 'cat' not in corr.columns


def test_numeric_distributions():
    """Every row of a fully populated numeric column lands in some bin."""
    df = sample_df()
    hists = eda.numeric_distributions(df, bins=2)
    assert 'num1' in hists
    assert 'cat' not in hists
    assert hists['num1'].sum() == len(df)
    assert len(hists['num1']) == 2


def test_categorical_analysis():
    """Value counts cover non-numeric columns and include the missing value."""
    df = sample_df()
    counts = eda.categorical_analysis(df)
    assert counts['cat']['a'] == 2
    assert counts['cat']['b'] == 2
    # Missing values are counted too (value_counts is called with dropna=False).
    assert counts['cat'].sum() == len(df)
    assert 'num1' not in counts


def test_missing_value_matrix():
    """The boolean mask has the frame's shape and flags only the missing cell."""
    df = sample_df()
    matrix = eda.missing_value_matrix(df)
    assert matrix.shape == df.shape
    assert matrix['cat'].sum() == 1
    assert matrix['num1'].sum() == 0


def test_profile_report():
    """The report bundles the summary, quality and correlation tables."""
    df = sample_df()
    report = eda.profile_report(df)
    assert 'summary' in report and 'quality' in report and 'correlation' in report


def test_data_insights_summary():
    """Insights mention both the missing-value column and the correlated pair."""
    df = sample_df()
    insights = eda.data_insights_summary(df)
    assert any('missing values' in text for text in insights)
    assert any('num1 & num2' in text for text in insights)


def test_data_insights_summary_clean_data():
    """A frame with nothing to report yields the single fallback message."""
    df = pd.DataFrame({'x': [1, 2, 3]})
    assert eda.data_insights_summary(df) == ["No notable data issues detected."]
import eda -__all__ = ["config", "data"] +__all__ = ["config", "data", "eda"] diff --git a/utils/eda.py b/utils/eda.py new file mode 100644 index 0000000..2d2a110 --- /dev/null +++ b/utils/eda.py @@ -0,0 +1,83 @@ +"""Exploratory data analysis utilities.""" + +from __future__ import annotations + +from typing import Any, Dict, List + +import numpy as np +import pandas as pd + + +def summary_statistics(df: pd.DataFrame) -> pd.DataFrame: + """Return summary statistics for all columns.""" + return df.describe(include="all") + + +def data_quality_assessment(df: pd.DataFrame) -> pd.DataFrame: + """Return data quality metrics for each column.""" + total = len(df) + return pd.DataFrame({ + "dtype": df.dtypes, + "missing": df.isna().sum(), + "missing_percent": df.isna().mean() * 100, + "unique": df.nunique(dropna=False), + }) + + +def correlation_matrix(df: pd.DataFrame, method: str = "pearson") -> pd.DataFrame: + """Return the correlation matrix for numeric columns.""" + numeric_df = df.select_dtypes(include="number") + return numeric_df.corr(method=method) + + +def numeric_distributions(df: pd.DataFrame, bins: int = 10) -> Dict[str, pd.Series]: + """Return histogram counts for numeric columns.""" + histograms: Dict[str, pd.Series] = {} + numeric_df = df.select_dtypes(include="number") + for column in numeric_df.columns: + histograms[column] = pd.cut(numeric_df[column], bins=bins).value_counts().sort_index() + return histograms + + +def categorical_analysis(df: pd.DataFrame, top_n: int = 10) -> Dict[str, pd.Series]: + """Return value counts for categorical columns.""" + counts: Dict[str, pd.Series] = {} + categorical_df = df.select_dtypes(exclude="number") + for column in categorical_df.columns: + counts[column] = categorical_df[column].value_counts(dropna=False).head(top_n) + return counts + + +def missing_value_matrix(df: pd.DataFrame) -> pd.DataFrame: + """Return a boolean matrix indicating missing values.""" + return df.isna() + + +def profile_report(df: pd.DataFrame) 
-> Dict[str, Any]: + """Generate a simple data profile report.""" + return { + "summary": summary_statistics(df), + "quality": data_quality_assessment(df), + "correlation": correlation_matrix(df), + } + + +def data_insights_summary(df: pd.DataFrame) -> List[str]: + """Generate simple insights from the data.""" + insights: List[str] = [] + quality = data_quality_assessment(df) + missing_cols = quality[quality["missing"] > 0].index.tolist() + if missing_cols: + insights.append("Columns with missing values: " + ", ".join(missing_cols)) + + corr = correlation_matrix(df).abs() + if not corr.empty: + upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool)) + strong = upper.stack().loc[lambda s: s > 0.8] + if not strong.empty: + pairs = [f"{i} & {j}" for i, j in strong.index] + insights.append("Strong correlations detected: " + ", ".join(pairs)) + + if not insights: + insights.append("No notable data issues detected.") + return insights