diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index 813ecba8c..5530423a4 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -5,7 +5,13 @@ defaults: working-directory: vizro-ai on: + schedule: + - cron: "30 10 * * 1" # run every Monday at 10:30 UTC workflow_dispatch: + #temporary for development + pull_request: + branches: + - main env: PYTHONUNBUFFERED: 1 @@ -20,17 +26,8 @@ jobs: fail-fast: false matrix: config: - - python-version: "3.9" - hatch-env: all.py3.9 - - python-version: "3.10" - hatch-env: all.py3.10 - - python-version: "3.11" - hatch-env: all.py3.11 - python-version: "3.12" hatch-env: all.py3.12 - - python-version: "3.9" - hatch-env: lower-bounds - label: lower bounds steps: - uses: actions/checkout@v4 @@ -46,17 +43,8 @@ jobs: fail-fast: false matrix: config: - - python-version: "3.9" - hatch-env: all.py3.9 - - python-version: "3.10" - hatch-env: all.py3.10 - - python-version: "3.11" - hatch-env: all.py3.11 - python-version: "3.12" hatch-env: all.py3.12 - - python-version: "3.9" - hatch-env: lower-bounds - label: lower bounds steps: - uses: actions/checkout@v4 @@ -77,6 +65,8 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} VIZRO_TYPE: pypi BRANCH: ${{ github.head_ref }} PYTHON_VERSION: ${{ matrix.config.python-version }} @@ -88,6 +78,8 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} VIZRO_TYPE: local BRANCH: ${{ github.head_ref }} PYTHON_VERSION: ${{ matrix.config.python-version }} @@ -131,7 +123,8 @@ jobs: - name: Create one csv report run: | cd /home/runner/work/vizro/vizro/ - head -n 1 Report-3.11-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv + ls */*.csv | head -n1 | xargs head -n1 > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv + # replace all timestamps in aggregated report to current date gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv - name: Report artifacts diff --git a/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md b/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md new file mode 100644 index 000000000..7c0d58d4f --- /dev/null +++ b/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md @@ -0,0 +1,48 @@ + + + + + + + + + diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml index 51465325c..6f4c30482 100644 --- a/vizro-ai/hatch.toml +++ b/vizro-ai/hatch.toml @@ -51,7 +51,7 @@ prep-release = [ pypath = "hatch run python -c 'import sys; print(sys.executable)'" test = "pytest tests {args}" test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}" -test-score = "pytest -vs --reruns 1 tests/score --headless {args}" +test-score = "pytest -vs tests/score --headless {args}" test-unit = "pytest tests/unit {args}" test-unit-coverage = [ "coverage run -m pytest tests/unit {args}", diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py new file mode 100644 index 000000000..383e3bc8d --- /dev/null +++ b/vizro-ai/tests/score/prompts.py @@ -0,0 +1,85 @@ +easy_prompt = """ +I need a page with 1 table, 1 card and 1 chart. +The table shows the tech companies stock data. +The card says 'The Gapminder dataset provides historical data on countries' development indicators.' +The chart is the scatter plot which uses gapminder dataframe and showing life expectancy vs. GDP per capita by country. +Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. + +The layout uses a grid of 2 columns and 3 rows. +The first row contains card +The second row contains chart +The third row contains table + +Add a filter to filter the scatter plot by continent. +Add a second filter to filter the table by companies. +""" + +medium_prompt = """ + +I need a page with 1 table and 1 line chart. +The chart shows the stock price trends of GOOG and AAPL. +The table shows the stock prices data details. + + +I need a second page showing 3 cards and 4 charts. +The cards says 'The Gapminder dataset provides historical data on countries' development indicators.' +The charts are the scatter plots showing GDP per capita vs. life expectancy. +GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. +Layout the cards on the left and the chart on the right. +Add a filter to filter the scatter plots by continent. +Add a second filter to filter the charts by year. + + +This page displays the tips dataset. use four different charts to show data +distributions. one chart should be a bar chart. the other should be a scatter plot. +next chart should be a line chart. last one should be an area plot. +first and second charts are on the left and the third and fourth charts are on the right. +Add a filter to filter data in every plot by smoker. + + +Create 3 cards on this page: +1. The first card on top says "This page combines data from various sources + including tips, stock prices, and global indicators." +2. The second card says "Insights from Gapminder dataset." +3. The third card says "Stock price trends over time." + +Layout these 3 cards in this way: +create a grid with 3 columns and 2 rows. +Row 1: The first row has three columns: +- The first column is empty. +- The second and third columns span the area for card 1. + +Row 2: The second row also has three columns: +- The first column is empty. +- The second column is occupied by the area for card 2. +- The third column is occupied by the area for card 3. +""" + + +complex_prompt = """ + +Show me 1 table on the first page that shows tips and sorted by day +Using export button I want to export data to csv +Add filters by bill and by tip amount using range slider + + +Second page should contain kpi cards with population trends and +two popular charts that display population per capita vs. continent. +Filter charts by GDP using dropdown. +Align kpi cards in one row and charts in different. +Both charts should be in tabs. + + +Third page should contain 6 charts showing stocks. +Each should have separate filter by date. +Filter types should include dropdown, datepicker, slider, checklist and radio items. +Add parameter for any chart. + + +Fourth page contains chart with wind data. +Table with population data. +Two more charts with stocks and tips representations. +Align table beautifully relative to the charts. +Every chart should have 2 filters. +Table should have 1 filter. +""" diff --git a/vizro-ai/tests/score/pytest.ini b/vizro-ai/tests/score/pytest.ini index 8b3381827..7f2efb67c 100644 --- a/vizro-ai/tests/score/pytest.ini +++ b/vizro-ai/tests/score/pytest.ini @@ -2,3 +2,9 @@ markers = easy_dashboard: mark test with easy prompt for dashboard creation. medium_dashboard: mark test with medium prompt for dashboard creation. + complex_dashboard: mark test with complex prompt for dashboard creation. + +filterwarnings = + ignore::UserWarning + # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590: + ignore:HTTPResponse.getheader():DeprecationWarning diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 53d2e9033..0815a48aa 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -2,32 +2,34 @@ import csv import os +import statistics from collections import Counter from dataclasses import dataclass from datetime import datetime from typing import Literal import chromedriver_autoinstaller +import numpy as np import pytest import vizro.plotly.express as px +from prompts import complex_prompt, easy_prompt, medium_prompt from vizro import Vizro from vizro_ai import VizroAI -vizro_ai = VizroAI() - df1 = px.data.gapminder() df2 = px.data.stocks() df3 = px.data.tips() +df4 = px.data.wind() @dataclass -class Components: +class Component: type: Literal["ag_grid", "card", "graph"] @dataclass -class Controls: +class Control: type: Literal["filter", "parameter"] @@ -38,11 +40,18 @@ def setup_test_environment(): chromedriver_autoinstaller.install() +# If len() is 0, it means that nothing was entered for this score in config, +# in this case it should be 1. +def score_calculator(metrics_score: list[int]): + return statistics.mean(metrics_score) if len(metrics_score) != 0 else 1 + + def logic( # noqa: PLR0912, PLR0915 dashboard, model_name, dash_duo, prompt_tier, + prompt_text, config: dict, ): """Calculates all separate scores. Creates csv report. @@ -52,9 +61,11 @@ def logic( # noqa: PLR0912, PLR0915 model_name: GenAI model name dash_duo: dash_duo fixture prompt_tier: complexity of the prompt + prompt_text: prompt text config: json config of the expected dashboard """ + # TODO: Add layout score report_dir = "tests/score/reports" os.makedirs(report_dir, exist_ok=True) @@ -62,7 +73,7 @@ def logic( # noqa: PLR0912, PLR0915 try: dash_duo.start_server(app) - app_started = 1.0 + app_started = 1 app_started_report = "App started!" except Exception as e: app_started = 0 @@ -71,7 +82,7 @@ def logic( # noqa: PLR0912, PLR0915 try: assert dash_duo.get_logs() == [] - no_browser_console_errors = 1.0 + no_browser_console_errors = 1 no_browser_console_errors_report = "No error logs in browser console!" except AssertionError as e: no_browser_console_errors = 0 @@ -89,8 +100,8 @@ def logic( # noqa: PLR0912, PLR0915 branch = "local" python_version = "local" - pages_exist = [1 if dashboard.pages else 0] - pages_exist_report = bool(pages_exist[0]) + pages_exist = [1 if dashboard.pages else 0][0] + pages_exist_report = bool(pages_exist) pages_num = [1 if len(dashboard.pages) == len(config["pages"]) else 0] pages_num_report = [f'{len(config["pages"])} page(s) for dashboard is {bool(pages_num[0])}'] @@ -158,36 +169,31 @@ def logic( # noqa: PLR0912, PLR0915 controls_types_names.append(controls_types) controls_types_names_report.append("page or control does not exists") - pages_exist.extend(pages_num) - # Every separate score has its own weight. - app_started_score = {"weight": 0.4, "score": app_started} - no_browser_console_errors_score = {"weight": 0.1, "score": no_browser_console_errors} - pages_score = {"weight": 0.2, "score": sum(pages_exist) / len(pages_exist)} - components_score = {"weight": 0.1, "score": sum(components_num) / len(components_num)} - component_types_score = {"weight": 0.1, "score": sum(components_types_names) / len(components_types_names)} - controls_score = {"weight": 0.1, "score": sum(controls_num) / len(controls_num)} - controls_types_score = {"weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)} - scores = [ - app_started_score, - no_browser_console_errors_score, - pages_score, - components_score, - component_types_score, - controls_score, - controls_types_score, + {"score_name": "app_started_score", "weight": 0.4, "score": app_started}, + {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors}, + {"score_name": "pages_score", "weight": 0.3, "score": pages_exist}, + {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(metrics_score=pages_num)}, + {"score_name": "components_score", "weight": 0.2, "score": score_calculator(metrics_score=components_num)}, + { + "score_name": "component_types_score", + "weight": 0.2, + "score": score_calculator(metrics_score=components_types_names), + }, + {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(metrics_score=controls_num)}, + { + "score_name": "controls_types_score", + "weight": 0.2, + "score": score_calculator(metrics_score=controls_types_names), + }, ] - # total_weight should be equal to 1 - total_weight = sum(score["weight"] for score in scores) - # If total_weight is not equal to 1, we're recalculating weights for every separate score - # and calculating final weighted_score for the created dashboard - if total_weight != 1: - scores = [{"weight": score["weight"] / total_weight, "score": score["score"]} for score in scores] - weighted_score = round(sum(score["weight"] * score["score"] for score in scores), 1) - # csv report creation + scores_values = np.array([score["score"] for score in scores]) + weights = np.array([score["weight"] for score in scores]) + weighted_score = np.average(scores_values, weights=weights) + # csv report creation data_rows = [ datetime.now(), vizro_type, @@ -195,42 +201,29 @@ def logic( # noqa: PLR0912, PLR0915 python_version, model_name, prompt_tier, + prompt_text, weighted_score, - app_started_score["score"], - no_browser_console_errors_score["score"], - pages_score["score"], - components_score["score"], - component_types_score["score"], - controls_score["score"], - controls_types_score["score"], ] + data_rows.extend(score["score"] for score in scores) with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""): with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "r+", newline="") as csvfile: writer = csv.writer(csvfile, delimiter=",") first_line = csvfile.readline() if not first_line: - writer.writerow( - [ - "timestamp", - "vizro_type", - "branch", - "python_version", - "model", - "prompt_tier", - "weighted_score", - "app_started_score", - "no_browser_console_errors_score", - "pages_score", - "components_score", - "component_types_score", - "controls_score", - "controls_types_score", - ] - ) - writer.writerow(data_rows) - else: - writer.writerow(data_rows) + header_rows = [ + "timestamp", + "vizro_type", + "branch", + "python_version", + "model", + "prompt_tier", + "prompt_text", + "weighted_score", + ] + header_rows.extend(score["score_name"] for score in scores) + writer.writerow(header_rows) + writer.writerow(data_rows) # Readable report for the console output print(f"App started: {app_started_report}") # noqa: T201 @@ -248,159 +241,155 @@ def logic( # noqa: PLR0912, PLR0915 @pytest.mark.easy_dashboard @pytest.mark.parametrize( "model_name", - ["gpt-4o-mini"], - ids=["gpt-4o-mini"], + [ + "gpt-4o-mini", + "claude-3-5-sonnet-latest", + ], + ids=[ + "gpt-4o-mini", + "claude-3-5-sonnet-latest", + ], ) -@pytest.mark.filterwarnings("ignore::langchain_core._api.beta_decorator.LangChainBetaWarning") -@pytest.mark.filterwarnings("ignore::UserWarning") -@pytest.mark.filterwarnings("ignore:HTTPResponse.getheader()") def test_easy_dashboard(dash_duo, model_name): - input_text = """ - I need a page with 1 table. - The table shows the tech companies stock data. - - I need a second page showing 2 cards and one chart. - The first card says 'The Gapminder dataset provides historical data on countries' development indicators.' - The chart is an scatter plot showing life expectancy vs. GDP per capita by country. - Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. - The second card says 'Data spans from 1952 to 2007 across various countries.' - The layout uses a grid of 3 columns and 2 rows. - - Row 1: The first row has three columns: - The first column is occupied by the first card. - The second and third columns are spanned by the chart. - - Row 2: The second row mirrors the layout of the first row with respect to chart, - but the first column is occupied by the second card. - - Add a filter to filter the scatter plot by continent. - Add a second filter to filter the chart by year. - """ - - dashboard = vizro_ai.dashboard([df1, df2], input_text) + dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt) logic( dashboard=dashboard, model_name=model_name, dash_duo=dash_duo, prompt_tier="easy", + prompt_text=easy_prompt.replace("\n", " "), config={ "pages": [ { "components": [ - Components(type="ag_grid"), + Component(type="ag_grid"), + Component(type="card"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + ], + }, + ], + }, + ) + + +@pytest.mark.medium_dashboard +@pytest.mark.parametrize("model_name", ["gpt-4o-mini"], ids=["gpt-4o-mini"]) +def test_medium_dashboard(dash_duo, model_name): + dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt) + + logic( + dashboard=dashboard, + model_name=model_name, + dash_duo=dash_duo, + prompt_tier="medium", + prompt_text=medium_prompt.replace("\n", " "), + config={ + "pages": [ + { + "components": [ + Component(type="ag_grid"), + Component(type="graph"), ], "controls": [], }, { "components": [ - Components(type="card"), - Components(type="card"), - Components(type="graph"), + Component(type="card"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="graph"), + Component(type="graph"), ], "controls": [ - Controls(type="filter"), - Controls(type="filter"), + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="card"), + Component(type="card"), + Component(type="card"), ], + "controls": [], }, ], }, ) -@pytest.mark.medium_dashboard +@pytest.mark.complex_dashboard @pytest.mark.parametrize( "model_name", ["gpt-4o-mini"], ids=["gpt-4o-mini"], ) -@pytest.mark.filterwarnings("ignore::langchain_core._api.beta_decorator.LangChainBetaWarning") -@pytest.mark.filterwarnings("ignore::UserWarning") -@pytest.mark.filterwarnings("ignore:HTTPResponse.getheader()") -def test_medium_dashboard(dash_duo, model_name): - input_text = """ - - I need a page with 1 table and 1 line chart. - The chart shows the stock price trends of GOOG and AAPL. - The table shows the stock prices data details. - - - I need a second page showing 1 card and 1 chart. - The card says 'The Gapminder dataset provides historical data on countries' development indicators.' - The chart is a scatter plot showing GDP per capita vs. life expectancy. - GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. - Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left. - The chart takes 2/3 of the whole space and is on the right. - Add a filter to filter the scatter plot by continent. - Add a second filter to filter the chart by year. - - - This page displays the tips dataset. use two different charts to show data - distributions. one chart should be a bar chart and the other should be a scatter plot. - first chart is on the left and the second chart is on the right. - Add a filter to filter data in the scatter plot by smoker. - - - Create 3 cards on this page: - 1. The first card on top says "This page combines data from various sources - including tips, stock prices, and global indicators." - 2. The second card says "Insights from Gapminder dataset." - 3. The third card says "Stock price trends over time." - - Layout these 3 cards in this way: - create a grid with 3 columns and 2 rows. - Row 1: The first row has three columns: - - The first column is empty. - - The second and third columns span the area for card 1. - - Row 2: The second row also has three columns: - - The first column is empty. - - The second column is occupied by the area for card 2. - - The third column is occupied by the area for card 3. - """ - - dashboard = vizro_ai.dashboard([df1, df2, df3], input_text) +def test_complex_dashboard(dash_duo, model_name): + dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3, df4], complex_prompt) logic( dashboard=dashboard, model_name=model_name, dash_duo=dash_duo, - prompt_tier="medium", + prompt_tier="complex", + prompt_text=complex_prompt.replace("\n", " "), config={ "pages": [ { "components": [ - Components(type="ag_grid"), - Components(type="graph"), + Component(type="ag_grid"), ], - "controls": [], + "controls": [Control(type="filter"), Control(type="filter")], }, { - "components": [ - Components(type="card"), - Components(type="graph"), - ], - "controls": [ - Controls(type="filter"), - Controls(type="filter"), - ], + "components": [Component(type="graph"), Component(type="graph")], + "controls": [Control(type="filter")], }, { "components": [ - Components(type="graph"), - Components(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), ], "controls": [ - Controls(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), ], }, { "components": [ - Components(type="card"), - Components(type="card"), - Components(type="card"), + Component(type="ag_grid"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), ], - "controls": [], }, ], },