From 98f6f4ddf4805fdef0fec624edca474d8ffd7f6b Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Tue, 24 Dec 2024 13:55:41 +0100 Subject: [PATCH 01/14] complex prompt, update report, improvements --- .github/workflows/test-score-vizro-ai.yml | 25 +- vizro-ai/hatch.toml | 2 +- vizro-ai/tests/score/prompts.py | 105 +++++++++ vizro-ai/tests/score/pytest.ini | 6 + vizro-ai/tests/score/test_dashboard.py | 273 ++++++++++------------ 5 files changed, 243 insertions(+), 168 deletions(-) create mode 100644 vizro-ai/tests/score/prompts.py diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index 813ecba8c..033b8fa0e 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -5,7 +5,13 @@ defaults: working-directory: vizro-ai on: + schedule: + - cron: "30 10 * * 1" # run every Monday at 10:30 UTC workflow_dispatch: + #temporary for development + pull_request: + branches: + - main env: PYTHONUNBUFFERED: 1 @@ -20,17 +26,8 @@ jobs: fail-fast: false matrix: config: - - python-version: "3.9" - hatch-env: all.py3.9 - - python-version: "3.10" - hatch-env: all.py3.10 - - python-version: "3.11" - hatch-env: all.py3.11 - python-version: "3.12" hatch-env: all.py3.12 - - python-version: "3.9" - hatch-env: lower-bounds - label: lower bounds steps: - uses: actions/checkout@v4 @@ -46,17 +43,9 @@ jobs: fail-fast: false matrix: config: - - python-version: "3.9" - hatch-env: all.py3.9 - - python-version: "3.10" - hatch-env: all.py3.10 - - python-version: "3.11" - hatch-env: all.py3.11 - python-version: "3.12" hatch-env: all.py3.12 - - python-version: "3.9" - hatch-env: lower-bounds - label: lower bounds + steps: - uses: actions/checkout@v4 diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml index 384d15d57..371590661 100644 --- a/vizro-ai/hatch.toml +++ b/vizro-ai/hatch.toml @@ -49,7 +49,7 @@ prep-release = [ pypath = "hatch run python -c 'import sys; print(sys.executable)'" test = "pytest tests {args}" test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}" -test-score = "pytest -vs --reruns 1 tests/score --headless {args}" +test-score = "pytest -vs tests/score --headless {args}" test-unit = "pytest tests/unit {args}" test-unit-coverage = [ "coverage run -m pytest tests/unit {args}", diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py new file mode 100644 index 000000000..47858f694 --- /dev/null +++ b/vizro-ai/tests/score/prompts.py @@ -0,0 +1,105 @@ +easy_prompt = """ +I need a page with 1 table. +The table shows the tech companies stock data. + +I need a second page showing 2 cards and one chart. +The first card says 'The Gapminder dataset provides historical data on countries' development indicators.' +The chart is an scatter plot showing life expectancy vs. GDP per capita by country. +Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. +The second card says 'Data spans from 1952 to 2007 across various countries.' +The layout uses a grid of 3 columns and 2 rows. + +Row 1: The first row has three columns: +The first column is occupied by the first card. +The second and third columns are spanned by the chart. + +Row 2: The second row mirrors the layout of the first row with respect to chart, +but the first column is occupied by the second card. + +Add a filter to filter the scatter plot by continent. +Add a second filter to filter the chart by year. +""" + +medium_prompt = """ + +I need a page with 1 table and 1 line chart. +The chart shows the stock price trends of GOOG and AAPL. +The table shows the stock prices data details. + + +I need a second page showing 3 cards and 4 charts. +The cards says 'The Gapminder dataset provides historical data on countries' development indicators.' +The charts are the scatter plots showing GDP per capita vs. life expectancy. +GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. +Layout the cards on the left and the chart on the right. +Add a filter to filter the scatter plots by continent. +Add a second filter to filter the charts by year. + + +This page displays the tips dataset. use four different charts to show data +distributions. one chart should be a bar chart. the other should be a scatter plot. +next chart should be a line chart. last one should be an area plot. +first and second charts are on the left and the third and fourth charts are on the right. +Add a filter to filter data in every plot by smoker. + + +Create 3 cards on this page: +1. The first card on top says "This page combines data from various sources + including tips, stock prices, and global indicators." +2. The second card says "Insights from Gapminder dataset." +3. The third card says "Stock price trends over time." + +Layout these 3 cards in this way: +create a grid with 3 columns and 2 rows. +Row 1: The first row has three columns: +- The first column is empty. +- The second and third columns span the area for card 1. + +Row 2: The second row also has three columns: +- The first column is empty. +- The second column is occupied by the area for card 2. +- The third column is occupied by the area for card 3. + """ + + +complex_prompt = """ + +I need a page with 1 table and 3 line charts. +The chart shows the stock price trends of GOOG and AAPL. +The table shows the stock prices data details. +Add 3 filters to filter the line chart by companies. + + +I need a second page showing 1 card and 1 chart. +The card says 'The Gapminder dataset provides historical data on countries' development indicators.' +The chart is a scatter plot showing GDP per capita vs. life expectancy. +GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. +Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left. +The chart takes 2/3 of the whole space and is on the right. +Add a filter to filter the scatter plot by continent. +Add a second filter to filter the chart by year. + + +This page displays the tips dataset. use two different charts to show data +distributions. one chart should be a bar chart and the other should be a scatter plot. +first chart is on the left and the second chart is on the right. +Add a filter to filter data in the scatter plot by smoker. + + +Create 3 cards on this page: +1. The first card on top says "This page combines data from various sources + including tips, stock prices, and global indicators." +2. The second card says "Insights from Gapminder dataset." +3. The third card says "Stock price trends over time." + +Layout these 3 cards in this way: +create a grid with 3 columns and 2 rows. +Row 1: The first row has three columns: +- The first column is empty. +- The second and third columns span the area for card 1. + +Row 2: The second row also has three columns: +- The first column is empty. +- The second column is occupied by the area for card 2. +- The third column is occupied by the area for card 3. + """ \ No newline at end of file diff --git a/vizro-ai/tests/score/pytest.ini b/vizro-ai/tests/score/pytest.ini index 8b3381827..7f2efb67c 100644 --- a/vizro-ai/tests/score/pytest.ini +++ b/vizro-ai/tests/score/pytest.ini @@ -2,3 +2,9 @@ markers = easy_dashboard: mark test with easy prompt for dashboard creation. medium_dashboard: mark test with medium prompt for dashboard creation. + complex_dashboard: mark test with complex prompt for dashboard creation. + +filterwarnings = + ignore::UserWarning + # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590: + ignore:HTTPResponse.getheader():DeprecationWarning diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 53d2e9033..5c2bac14a 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -10,11 +10,11 @@ import chromedriver_autoinstaller import pytest import vizro.plotly.express as px +import numpy as np from vizro import Vizro from vizro_ai import VizroAI - -vizro_ai = VizroAI() +from prompts import easy_prompt, medium_prompt, complex_prompt df1 = px.data.gapminder() df2 = px.data.stocks() @@ -22,12 +22,12 @@ @dataclass -class Components: +class Component: type: Literal["ag_grid", "card", "graph"] @dataclass -class Controls: +class Control: type: Literal["filter", "parameter"] @@ -43,6 +43,7 @@ def logic( # noqa: PLR0912, PLR0915 model_name, dash_duo, prompt_tier, + prompt_text, config: dict, ): """Calculates all separate scores. Creates csv report. @@ -52,6 +53,7 @@ def logic( # noqa: PLR0912, PLR0915 model_name: GenAI model name dash_duo: dash_duo fixture prompt_tier: complexity of the prompt + prompt_text: prompt text config: json config of the expected dashboard """ @@ -161,73 +163,40 @@ def logic( # noqa: PLR0912, PLR0915 pages_exist.extend(pages_num) # Every separate score has its own weight. - app_started_score = {"weight": 0.4, "score": app_started} - no_browser_console_errors_score = {"weight": 0.1, "score": no_browser_console_errors} - pages_score = {"weight": 0.2, "score": sum(pages_exist) / len(pages_exist)} - components_score = {"weight": 0.1, "score": sum(components_num) / len(components_num)} - component_types_score = {"weight": 0.1, "score": sum(components_types_names) / len(components_types_names)} - controls_score = {"weight": 0.1, "score": sum(controls_num) / len(controls_num)} - controls_types_score = {"weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)} - scores = [ - app_started_score, - no_browser_console_errors_score, - pages_score, - components_score, - component_types_score, - controls_score, - controls_types_score, + {"score_name": "app_started_score", "weight": 0.4, "score": app_started}, + {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors}, + {"score_name": "pages_score", "weight": 0.2, "score": sum(pages_exist) / len(pages_exist)}, + {"score_name": "components_score", "weight": 0.1, "score": sum(components_num) / len(components_num)}, + {"score_name": "component_types_score", "weight": 0.1, "score": sum(components_types_names) / len(components_types_names)}, + {"score_name": "controls_score", "weight": 0.1, "score": sum(controls_num) / len(controls_num)}, + {"score_name": "controls_types_score", "weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)}, ] - # total_weight should be equal to 1 - total_weight = sum(score["weight"] for score in scores) - # If total_weight is not equal to 1, we're recalculating weights for every separate score - # and calculating final weighted_score for the created dashboard - if total_weight != 1: - scores = [{"weight": score["weight"] / total_weight, "score": score["score"]} for score in scores] - weighted_score = round(sum(score["weight"] * score["score"] for score in scores), 1) - # csv report creation + scores_values = np.array([score["score"] for score in scores]) + weights = np.array([score["weight"] for score in scores]) + weighted_score = np.average(scores_values, weights=weights) - data_rows = [ - datetime.now(), - vizro_type, - branch, - python_version, - model_name, - prompt_tier, - weighted_score, - app_started_score["score"], - no_browser_console_errors_score["score"], - pages_score["score"], - components_score["score"], - component_types_score["score"], - controls_score["score"], - controls_types_score["score"], - ] + # csv report creation + data_rows = [datetime.now(), vizro_type, branch, python_version, model_name, prompt_tier, prompt_text, weighted_score] + data_rows.extend(score["score"] for score in scores) with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""): with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "r+", newline="") as csvfile: writer = csv.writer(csvfile, delimiter=",") first_line = csvfile.readline() if not first_line: - writer.writerow( - [ + header_rows = [ "timestamp", "vizro_type", "branch", "python_version", "model", "prompt_tier", - "weighted_score", - "app_started_score", - "no_browser_console_errors_score", - "pages_score", - "components_score", - "component_types_score", - "controls_score", - "controls_types_score", - ] - ) + "prompt_text", + "weighted_score"] + header_rows.extend(score["score_name"] for score in scores) + writer.writerow(header_rows) writer.writerow(data_rows) else: writer.writerow(data_rows) @@ -248,59 +217,40 @@ def logic( # noqa: PLR0912, PLR0915 @pytest.mark.easy_dashboard @pytest.mark.parametrize( "model_name", - ["gpt-4o-mini"], - ids=["gpt-4o-mini"], -) -@pytest.mark.filterwarnings("ignore::langchain_core._api.beta_decorator.LangChainBetaWarning") -@pytest.mark.filterwarnings("ignore::UserWarning") -@pytest.mark.filterwarnings("ignore:HTTPResponse.getheader()") + [ + "gpt-4o-mini", + "claude-3-5-sonnet-latest", + ], + ids=[ + "gpt-4o-mini", + "claude-3-5-sonnet-latest", + ]) def test_easy_dashboard(dash_duo, model_name): - input_text = """ - I need a page with 1 table. - The table shows the tech companies stock data. - - I need a second page showing 2 cards and one chart. - The first card says 'The Gapminder dataset provides historical data on countries' development indicators.' - The chart is an scatter plot showing life expectancy vs. GDP per capita by country. - Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. - The second card says 'Data spans from 1952 to 2007 across various countries.' - The layout uses a grid of 3 columns and 2 rows. - - Row 1: The first row has three columns: - The first column is occupied by the first card. - The second and third columns are spanned by the chart. - - Row 2: The second row mirrors the layout of the first row with respect to chart, - but the first column is occupied by the second card. - - Add a filter to filter the scatter plot by continent. - Add a second filter to filter the chart by year. - """ - - dashboard = vizro_ai.dashboard([df1, df2], input_text) + dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt) logic( dashboard=dashboard, model_name=model_name, dash_duo=dash_duo, prompt_tier="easy", + prompt_text=easy_prompt.replace("\n", " "), config={ "pages": [ { "components": [ - Components(type="ag_grid"), + Component(type="ag_grid"), ], "controls": [], }, { "components": [ - Components(type="card"), - Components(type="card"), - Components(type="graph"), + Component(type="card"), + Component(type="card"), + Component(type="graph"), ], "controls": [ - Controls(type="filter"), - Controls(type="filter"), + Control(type="filter"), + Control(type="filter"), ], }, ], @@ -312,93 +262,118 @@ def test_easy_dashboard(dash_duo, model_name): @pytest.mark.parametrize( "model_name", ["gpt-4o-mini"], - ids=["gpt-4o-mini"], -) -@pytest.mark.filterwarnings("ignore::langchain_core._api.beta_decorator.LangChainBetaWarning") -@pytest.mark.filterwarnings("ignore::UserWarning") -@pytest.mark.filterwarnings("ignore:HTTPResponse.getheader()") + ids=["gpt-4o-mini"]) def test_medium_dashboard(dash_duo, model_name): - input_text = """ - - I need a page with 1 table and 1 line chart. - The chart shows the stock price trends of GOOG and AAPL. - The table shows the stock prices data details. - - - I need a second page showing 1 card and 1 chart. - The card says 'The Gapminder dataset provides historical data on countries' development indicators.' - The chart is a scatter plot showing GDP per capita vs. life expectancy. - GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. - Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left. - The chart takes 2/3 of the whole space and is on the right. - Add a filter to filter the scatter plot by continent. - Add a second filter to filter the chart by year. - - - This page displays the tips dataset. use two different charts to show data - distributions. one chart should be a bar chart and the other should be a scatter plot. - first chart is on the left and the second chart is on the right. - Add a filter to filter data in the scatter plot by smoker. - - - Create 3 cards on this page: - 1. The first card on top says "This page combines data from various sources - including tips, stock prices, and global indicators." - 2. The second card says "Insights from Gapminder dataset." - 3. The third card says "Stock price trends over time." - - Layout these 3 cards in this way: - create a grid with 3 columns and 2 rows. - Row 1: The first row has three columns: - - The first column is empty. - - The second and third columns span the area for card 1. - - Row 2: The second row also has three columns: - - The first column is empty. - - The second column is occupied by the area for card 2. - - The third column is occupied by the area for card 3. - """ - - dashboard = vizro_ai.dashboard([df1, df2, df3], input_text) + dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt) logic( dashboard=dashboard, model_name=model_name, dash_duo=dash_duo, prompt_tier="medium", + prompt_text=medium_prompt.replace("\n", " "), config={ "pages": [ { "components": [ - Components(type="ag_grid"), - Components(type="graph"), + Component(type="ag_grid"), + Component(type="graph"), ], "controls": [], }, { "components": [ - Components(type="card"), - Components(type="graph"), + Component(type="card"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="graph"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="card"), + Component(type="card"), + Component(type="card"), + ], + "controls": [], + }, + ], + }, + ) + + +@pytest.mark.complex_dashboard +@pytest.mark.parametrize( + "model_name", + ["gpt-4o-mini"], + ids=["gpt-4o-mini"], +) +def test_complex_dashboard(dash_duo, model_name): + dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], complex_prompt) + + logic( + dashboard=dashboard, + model_name=model_name, + dash_duo=dash_duo, + prompt_tier="complex", + prompt_text=complex_prompt.replace("\n", " "), + config={ + "pages": [ + { + "components": [ + Component(type="ag_grid"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + Control(type="filter") + ], + }, + { + "components": [ + Component(type="card"), + Component(type="card"), + Component(type="card"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), ], "controls": [ - Controls(type="filter"), - Controls(type="filter"), + Control(type="filter"), + Control(type="filter"), ], }, { "components": [ - Components(type="graph"), - Components(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), ], "controls": [ - Controls(type="filter"), + Control(type="filter"), ], }, { "components": [ - Components(type="card"), - Components(type="card"), - Components(type="card"), + Component(type="card"), + Component(type="card"), + Component(type="card"), ], "controls": [], }, From 736254e8aa8d0b350e7d3236aea41f12ed958f73 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Tue, 24 Dec 2024 13:56:20 +0100 Subject: [PATCH 02/14] changelog --- ..._alexey_snigir_score_tests_improvements.md | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md diff --git a/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md b/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md new file mode 100644 index 000000000..7c0d58d4f --- /dev/null +++ b/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md @@ -0,0 +1,48 @@ + + + + + + + + + From 99d0fd7987ba8417fdd29ccc9348952d933fcf0c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Dec 2024 12:57:28 +0000 Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/test-score-vizro-ai.yml | 1 - vizro-ai/tests/score/prompts.py | 2 +- vizro-ai/tests/score/test_dashboard.py | 58 ++++++++++++++--------- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index 033b8fa0e..a124cbc6c 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -46,7 +46,6 @@ jobs: - python-version: "3.12" hatch-env: all.py3.12 - steps: - uses: actions/checkout@v4 diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py index 47858f694..b507f051c 100644 --- a/vizro-ai/tests/score/prompts.py +++ b/vizro-ai/tests/score/prompts.py @@ -102,4 +102,4 @@ - The first column is empty. - The second column is occupied by the area for card 2. - The third column is occupied by the area for card 3. - """ \ No newline at end of file + """ diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 5c2bac14a..5681ee89d 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -8,13 +8,13 @@ from typing import Literal import chromedriver_autoinstaller +import numpy as np import pytest import vizro.plotly.express as px -import numpy as np +from prompts import complex_prompt, easy_prompt, medium_prompt from vizro import Vizro from vizro_ai import VizroAI -from prompts import easy_prompt, medium_prompt, complex_prompt df1 = px.data.gapminder() df2 = px.data.stocks() @@ -168,9 +168,17 @@ def logic( # noqa: PLR0912, PLR0915 {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors}, {"score_name": "pages_score", "weight": 0.2, "score": sum(pages_exist) / len(pages_exist)}, {"score_name": "components_score", "weight": 0.1, "score": sum(components_num) / len(components_num)}, - {"score_name": "component_types_score", "weight": 0.1, "score": sum(components_types_names) / len(components_types_names)}, + { + "score_name": "component_types_score", + "weight": 0.1, + "score": sum(components_types_names) / len(components_types_names), + }, {"score_name": "controls_score", "weight": 0.1, "score": sum(controls_num) / len(controls_num)}, - {"score_name": "controls_types_score", "weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)}, + { + "score_name": "controls_types_score", + "weight": 0.1, + "score": sum(controls_types_names) / len(controls_types_names), + }, ] scores_values = np.array([score["score"] for score in scores]) @@ -178,7 +186,16 @@ def logic( # noqa: PLR0912, PLR0915 weighted_score = np.average(scores_values, weights=weights) # csv report creation - data_rows = [datetime.now(), vizro_type, branch, python_version, model_name, prompt_tier, prompt_text, weighted_score] + data_rows = [ + datetime.now(), + vizro_type, + branch, + python_version, + model_name, + prompt_tier, + prompt_text, + weighted_score, + ] data_rows.extend(score["score"] for score in scores) with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""): @@ -187,14 +204,15 @@ def logic( # noqa: PLR0912, PLR0915 first_line = csvfile.readline() if not first_line: header_rows = [ - "timestamp", - "vizro_type", - "branch", - "python_version", - "model", - "prompt_tier", - "prompt_text", - "weighted_score"] + "timestamp", + "vizro_type", + "branch", + "python_version", + "model", + "prompt_tier", + "prompt_text", + "weighted_score", + ] header_rows.extend(score["score_name"] for score in scores) writer.writerow(header_rows) writer.writerow(data_rows) @@ -224,7 +242,8 @@ def logic( # noqa: PLR0912, PLR0915 ids=[ "gpt-4o-mini", "claude-3-5-sonnet-latest", - ]) + ], +) def test_easy_dashboard(dash_duo, model_name): dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt) @@ -259,10 +278,7 @@ def test_easy_dashboard(dash_duo, model_name): @pytest.mark.medium_dashboard -@pytest.mark.parametrize( - "model_name", - ["gpt-4o-mini"], - ids=["gpt-4o-mini"]) +@pytest.mark.parametrize("model_name", ["gpt-4o-mini"], ids=["gpt-4o-mini"]) def test_medium_dashboard(dash_duo, model_name): dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt) @@ -337,11 +353,7 @@ def test_complex_dashboard(dash_duo, model_name): Component(type="graph"), Component(type="graph"), ], - "controls": [ - Control(type="filter"), - Control(type="filter"), - Control(type="filter") - ], + "controls": [Control(type="filter"), Control(type="filter"), Control(type="filter")], }, { "components": [ From 34a7ccf9651776996f89597ea839ec0c4dafa303 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Fri, 27 Dec 2024 11:17:08 +0100 Subject: [PATCH 04/14] add anthropic creds --- .github/workflows/test-score-vizro-ai.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index 033b8fa0e..29d11287b 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -66,6 +66,8 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} VIZRO_TYPE: pypi BRANCH: ${{ github.head_ref }} PYTHON_VERSION: ${{ matrix.config.python-version }} @@ -77,6 +79,8 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }} VIZRO_TYPE: local BRANCH: ${{ github.head_ref }} PYTHON_VERSION: ${{ matrix.config.python-version }} From 7139fb5a7fbc29da75b388acd678586726c71783 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Fri, 27 Dec 2024 11:45:33 +0100 Subject: [PATCH 05/14] fix report aggregated --- .github/workflows/test-score-vizro-ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index e41928ce8..15f64fd91 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -123,7 +123,7 @@ jobs: - name: Create one csv report run: | cd /home/runner/work/vizro/vizro/ - head -n 1 Report-3.11-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv + head -n 1 Report-3.12-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv - name: Report artifacts From 459f482264f0bfd9aa6c1e4e809bba96a7f1fadc Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Thu, 16 Jan 2025 17:20:03 +0100 Subject: [PATCH 06/14] complex prompt and review fixes --- .github/workflows/test-score-vizro-ai.yml | 5 +- vizro-ai/tests/score/prompts.py | 76 +++++++++-------------- vizro-ai/tests/score/test_dashboard.py | 74 +++++++++++----------- 3 files changed, 69 insertions(+), 86 deletions(-) diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index 15f64fd91..afd4d3caa 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -123,8 +123,9 @@ jobs: - name: Create one csv report run: | cd /home/runner/work/vizro/vizro/ - head -n 1 Report-3.12-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv - gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv + ls */*.csv | head -n1 | xargs head -n1 > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv + # replace all timestamps in aggregated report to current date + gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' report-aggregated-${{ steps.date.outputs.date }}.csv - name: Report artifacts uses: actions/upload-artifact@v4 diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py index b507f051c..fac3454b3 100644 --- a/vizro-ai/tests/score/prompts.py +++ b/vizro-ai/tests/score/prompts.py @@ -1,23 +1,17 @@ easy_prompt = """ -I need a page with 1 table. +I need a page with 1 table, 1 card and 1 chart. The table shows the tech companies stock data. - -I need a second page showing 2 cards and one chart. -The first card says 'The Gapminder dataset provides historical data on countries' development indicators.' -The chart is an scatter plot showing life expectancy vs. GDP per capita by country. +The card says 'The Gapminder dataset provides historical data on countries' development indicators.' +The chart is the scatter plot which uses gapminder dataframe and showing life expectancy vs. GDP per capita by country. Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. -The second card says 'Data spans from 1952 to 2007 across various countries.' -The layout uses a grid of 3 columns and 2 rows. - -Row 1: The first row has three columns: -The first column is occupied by the first card. -The second and third columns are spanned by the chart. -Row 2: The second row mirrors the layout of the first row with respect to chart, -but the first column is occupied by the second card. +The layout uses a grid of 2 columns and 3 rows. +The first row contains card +The second row contains chart +The third row contains table Add a filter to filter the scatter plot by continent. -Add a second filter to filter the chart by year. +Add a second filter to filter the table by companies. """ medium_prompt = """ @@ -59,47 +53,33 @@ - The first column is empty. - The second column is occupied by the area for card 2. - The third column is occupied by the area for card 3. - """ +""" complex_prompt = """ -I need a page with 1 table and 3 line charts. -The chart shows the stock price trends of GOOG and AAPL. -The table shows the stock prices data details. -Add 3 filters to filter the line chart by companies. +Show me 1 table on the first page that shows tips and sorted by day +Using export button I want to export data to csv +Add filters by bill and by tip amount using range slider -I need a second page showing 1 card and 1 chart. -The card says 'The Gapminder dataset provides historical data on countries' development indicators.' -The chart is a scatter plot showing GDP per capita vs. life expectancy. -GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. -Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left. -The chart takes 2/3 of the whole space and is on the right. -Add a filter to filter the scatter plot by continent. -Add a second filter to filter the chart by year. +Second page should contain kpi cards with population trends and +two popular charts that display population per capita vs. continent. +Filter charts by GDP using slider. +Align kpi cards in one row and charts in different. +Both charts should be in tabs. -This page displays the tips dataset. use two different charts to show data -distributions. one chart should be a bar chart and the other should be a scatter plot. -first chart is on the left and the second chart is on the right. -Add a filter to filter data in the scatter plot by smoker. +Third page should contain 6 charts showing stocks. +Each should have separate filter by date. +Filter types should include dropdown, datepicker, slider, checklist and radio items. +Add parameter for any chart. -Create 3 cards on this page: -1. The first card on top says "This page combines data from various sources - including tips, stock prices, and global indicators." -2. The second card says "Insights from Gapminder dataset." -3. The third card says "Stock price trends over time." - -Layout these 3 cards in this way: -create a grid with 3 columns and 2 rows. -Row 1: The first row has three columns: -- The first column is empty. -- The second and third columns span the area for card 1. - -Row 2: The second row also has three columns: -- The first column is empty. -- The second column is occupied by the area for card 2. -- The third column is occupied by the area for card 3. - """ +Fourth page contains chart with wind data. +Table with GDP data. +Two more charts with stocks and tips representations. +Align table beautifully relative to the charts. +Every chart should have 2 filters. +Table should have 1 filter. +""" diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 5681ee89d..632ca0298 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -2,6 +2,7 @@ import csv import os +import statistics from collections import Counter from dataclasses import dataclass from datetime import datetime @@ -19,6 +20,7 @@ df1 = px.data.gapminder() df2 = px.data.stocks() df3 = px.data.tips() +df4 = px.data.wind() @dataclass @@ -38,6 +40,12 @@ def setup_test_environment(): chromedriver_autoinstaller.install() +# If len() is 0, it means that nothing was entered for this score in config, +# in this case in should be 1.0. +def score_calculator(score_name): + return statistics.mean(score_name) if len(score_name) != 0 else 1.0 + + def logic( # noqa: PLR0912, PLR0915 dashboard, model_name, @@ -91,8 +99,8 @@ def logic( # noqa: PLR0912, PLR0915 branch = "local" python_version = "local" - pages_exist = [1 if dashboard.pages else 0] - pages_exist_report = bool(pages_exist[0]) + pages_exist = [1.0 if dashboard.pages else 0][0] + pages_exist_report = bool(pages_exist) pages_num = [1 if len(dashboard.pages) == len(config["pages"]) else 0] pages_num_report = [f'{len(config["pages"])} page(s) for dashboard is {bool(pages_num[0])}'] @@ -160,24 +168,24 @@ def logic( # noqa: PLR0912, PLR0915 controls_types_names.append(controls_types) controls_types_names_report.append("page or control does not exists") - pages_exist.extend(pages_num) # Every separate score has its own weight. scores = [ {"score_name": "app_started_score", "weight": 0.4, "score": app_started}, {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors}, - {"score_name": "pages_score", "weight": 0.2, "score": sum(pages_exist) / len(pages_exist)}, - {"score_name": "components_score", "weight": 0.1, "score": sum(components_num) / len(components_num)}, + {"score_name": "pages_score", "weight": 0.3, "score": pages_exist}, + {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(score_name=pages_num)}, + {"score_name": "components_score", "weight": 0.2, "score": score_calculator(score_name=components_num)}, { "score_name": "component_types_score", - "weight": 0.1, - "score": sum(components_types_names) / len(components_types_names), + "weight": 0.2, + "score": score_calculator(score_name=components_types_names), }, - {"score_name": "controls_score", "weight": 0.1, "score": sum(controls_num) / len(controls_num)}, + {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(score_name=controls_num)}, { "score_name": "controls_types_score", - "weight": 0.1, - "score": sum(controls_types_names) / len(controls_types_names), + "weight": 0.2, + "score": score_calculator(score_name=controls_types_names), }, ] @@ -215,9 +223,7 @@ def logic( # noqa: PLR0912, PLR0915 ] header_rows.extend(score["score_name"] for score in scores) writer.writerow(header_rows) - writer.writerow(data_rows) - else: - writer.writerow(data_rows) + writer.writerow(data_rows) # Readable report for the console output print(f"App started: {app_started_report}") # noqa: T201 @@ -258,12 +264,6 @@ def test_easy_dashboard(dash_duo, model_name): { "components": [ Component(type="ag_grid"), - ], - "controls": [], - }, - { - "components": [ - Component(type="card"), Component(type="card"), Component(type="graph"), ], @@ -336,7 +336,7 @@ def test_medium_dashboard(dash_duo, model_name): ids=["gpt-4o-mini"], ) def test_complex_dashboard(dash_duo, model_name): - dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], complex_prompt) + dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3, df4], complex_prompt) logic( dashboard=dashboard, @@ -349,17 +349,17 @@ def test_complex_dashboard(dash_duo, model_name): { "components": [ Component(type="ag_grid"), - Component(type="graph"), - Component(type="graph"), - Component(type="graph"), ], - "controls": [Control(type="filter"), Control(type="filter"), Control(type="filter")], + "controls": [Control(type="filter"), Control(type="filter")], + }, + { + "components": [Component(type="graph"), Component(type="graph")], + "controls": [Control(type="filter")], }, { "components": [ - Component(type="card"), - Component(type="card"), - Component(type="card"), + Component(type="graph"), + Component(type="graph"), Component(type="graph"), Component(type="graph"), Component(type="graph"), @@ -368,27 +368,29 @@ def test_complex_dashboard(dash_duo, model_name): "controls": [ Control(type="filter"), Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), ], }, { "components": [ - Component(type="graph"), + Component(type="ag_grid"), Component(type="graph"), Component(type="graph"), Component(type="graph"), ], "controls": [ Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), ], }, - { - "components": [ - Component(type="card"), - Component(type="card"), - Component(type="card"), - ], - "controls": [], - }, ], }, ) From e60a1044d4493595d1f43ff09e9cc2855a705d1f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:21:39 +0000 Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vizro-ai/tests/score/test_dashboard.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 632ca0298..8c60217ba 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -168,7 +168,6 @@ def logic( # noqa: PLR0912, PLR0915 controls_types_names.append(controls_types) controls_types_names_report.append("page or control does not exists") - # Every separate score has its own weight. scores = [ {"score_name": "app_started_score", "weight": 0.4, "score": app_started}, From 9be7bd83fdcb9c8f7dfc3162f77d76b982c5b23a Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Thu, 16 Jan 2025 17:28:11 +0100 Subject: [PATCH 08/14] added reruns --- vizro-ai/hatch.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml index 6f4c30482..51465325c 100644 --- a/vizro-ai/hatch.toml +++ b/vizro-ai/hatch.toml @@ -51,7 +51,7 @@ prep-release = [ pypath = "hatch run python -c 'import sys; print(sys.executable)'" test = "pytest tests {args}" test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}" -test-score = "pytest -vs tests/score --headless {args}" +test-score = "pytest -vs --reruns 1 tests/score --headless {args}" test-unit = "pytest tests/unit {args}" test-unit-coverage = [ "coverage run -m pytest tests/unit {args}", From dafbb16eba17bd56e5939064fe470f1077ca1888 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Thu, 16 Jan 2025 17:36:27 +0100 Subject: [PATCH 09/14] change complex prompt --- vizro-ai/hatch.toml | 2 +- vizro-ai/tests/score/prompts.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml index 51465325c..6f4c30482 100644 --- a/vizro-ai/hatch.toml +++ b/vizro-ai/hatch.toml @@ -51,7 +51,7 @@ prep-release = [ pypath = "hatch run python -c 'import sys; print(sys.executable)'" test = "pytest tests {args}" test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}" -test-score = "pytest -vs --reruns 1 tests/score --headless {args}" +test-score = "pytest -vs tests/score --headless {args}" test-unit = "pytest tests/unit {args}" test-unit-coverage = [ "coverage run -m pytest tests/unit {args}", diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py index fac3454b3..c8cbea85f 100644 --- a/vizro-ai/tests/score/prompts.py +++ b/vizro-ai/tests/score/prompts.py @@ -77,7 +77,7 @@ Fourth page contains chart with wind data. -Table with GDP data. +Table with population data. Two more charts with stocks and tips representations. Align table beautifully relative to the charts. Every chart should have 2 filters. From 06c239aabab6097b2e3a81ad576e189256c33f21 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Thu, 16 Jan 2025 17:43:40 +0100 Subject: [PATCH 10/14] change complex prompt --- vizro-ai/tests/score/prompts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py index c8cbea85f..383e3bc8d 100644 --- a/vizro-ai/tests/score/prompts.py +++ b/vizro-ai/tests/score/prompts.py @@ -65,7 +65,7 @@ Second page should contain kpi cards with population trends and two popular charts that display population per capita vs. continent. -Filter charts by GDP using slider. +Filter charts by GDP using dropdown. Align kpi cards in one row and charts in different. Both charts should be in tabs. From 97a1019e927e378a623d21ab7861510c42702d76 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Thu, 16 Jan 2025 17:52:37 +0100 Subject: [PATCH 11/14] option for ',' separator in aggregated report --- .github/workflows/test-score-vizro-ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml index afd4d3caa..5530423a4 100644 --- a/.github/workflows/test-score-vizro-ai.yml +++ b/.github/workflows/test-score-vizro-ai.yml @@ -125,7 +125,7 @@ jobs: cd /home/runner/work/vizro/vizro/ ls */*.csv | head -n1 | xargs head -n1 > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv # replace all timestamps in aggregated report to current date - gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' report-aggregated-${{ steps.date.outputs.date }}.csv + gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv - name: Report artifacts uses: actions/upload-artifact@v4 From ff97b3109e9d72853e0f09e40559a64d865f621e Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Fri, 17 Jan 2025 12:49:12 +0100 Subject: [PATCH 12/14] review changes --- vizro-ai/tests/score/test_dashboard.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 8c60217ba..0815a48aa 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -41,9 +41,9 @@ def setup_test_environment(): # If len() is 0, it means that nothing was entered for this score in config, -# in this case in should be 1.0. -def score_calculator(score_name): - return statistics.mean(score_name) if len(score_name) != 0 else 1.0 +# in this case it should be 1. +def score_calculator(metrics_score: list[int]): + return statistics.mean(metrics_score) if len(metrics_score) != 0 else 1 def logic( # noqa: PLR0912, PLR0915 @@ -65,6 +65,7 @@ def logic( # noqa: PLR0912, PLR0915 config: json config of the expected dashboard """ + # TODO: Add layout score report_dir = "tests/score/reports" os.makedirs(report_dir, exist_ok=True) @@ -72,7 +73,7 @@ def logic( # noqa: PLR0912, PLR0915 try: dash_duo.start_server(app) - app_started = 1.0 + app_started = 1 app_started_report = "App started!" except Exception as e: app_started = 0 @@ -81,7 +82,7 @@ def logic( # noqa: PLR0912, PLR0915 try: assert dash_duo.get_logs() == [] - no_browser_console_errors = 1.0 + no_browser_console_errors = 1 no_browser_console_errors_report = "No error logs in browser console!" except AssertionError as e: no_browser_console_errors = 0 @@ -99,7 +100,7 @@ def logic( # noqa: PLR0912, PLR0915 branch = "local" python_version = "local" - pages_exist = [1.0 if dashboard.pages else 0][0] + pages_exist = [1 if dashboard.pages else 0][0] pages_exist_report = bool(pages_exist) pages_num = [1 if len(dashboard.pages) == len(config["pages"]) else 0] pages_num_report = [f'{len(config["pages"])} page(s) for dashboard is {bool(pages_num[0])}'] @@ -173,18 +174,18 @@ def logic( # noqa: PLR0912, PLR0915 {"score_name": "app_started_score", "weight": 0.4, "score": app_started}, {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors}, {"score_name": "pages_score", "weight": 0.3, "score": pages_exist}, - {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(score_name=pages_num)}, - {"score_name": "components_score", "weight": 0.2, "score": score_calculator(score_name=components_num)}, + {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(metrics_score=pages_num)}, + {"score_name": "components_score", "weight": 0.2, "score": score_calculator(metrics_score=components_num)}, { "score_name": "component_types_score", "weight": 0.2, - "score": score_calculator(score_name=components_types_names), + "score": score_calculator(metrics_score=components_types_names), }, - {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(score_name=controls_num)}, + {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(metrics_score=controls_num)}, { "score_name": "controls_types_score", "weight": 0.2, - "score": score_calculator(score_name=controls_types_names), + "score": score_calculator(metrics_score=controls_types_names), }, ] From b1ae1b6b915a81d67d51f5bb6a03dcab3b6ab4ef Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Wed, 22 Jan 2025 15:27:05 +0100 Subject: [PATCH 13/14] created one parametrized test --- vizro-ai/pyproject.toml | 6 +- vizro-ai/tests/score/data_classes.py | 12 ++ vizro-ai/tests/score/prompts.py | 260 ++++++++++++++++++------- vizro-ai/tests/score/pytest.ini | 10 - vizro-ai/tests/score/test_dashboard.py | 189 ++++-------------- 5 files changed, 241 insertions(+), 236 deletions(-) create mode 100644 vizro-ai/tests/score/data_classes.py delete mode 100644 vizro-ai/tests/score/pytest.ini diff --git a/vizro-ai/pyproject.toml b/vizro-ai/pyproject.toml index 791d2ab72..09bcf7590 100644 --- a/vizro-ai/pyproject.toml +++ b/vizro-ai/pyproject.toml @@ -66,7 +66,11 @@ filterwarnings = [ # Ignore LLMchian deprecation warning: "ignore:.*The class `LLMChain` was deprecated in LangChain 0.1.17", # Ignore warning for Pydantic v1 API and Python 3.13: - "ignore:Failing to pass a value to the 'type_params' parameter of 'typing.ForwardRef._evaluate' is deprecated:DeprecationWarning" + "ignore:Failing to pass a value to the 'type_params' parameter of 'typing.ForwardRef._evaluate' is deprecated:DeprecationWarning", + # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590: + "ignore:HTTPResponse.getheader():DeprecationWarning", + "ignore:CapturedCallable function is excluded from the schema", + "ignore:Exception in thread" ] [tool.ruff] diff --git a/vizro-ai/tests/score/data_classes.py b/vizro-ai/tests/score/data_classes.py new file mode 100644 index 000000000..047983fe6 --- /dev/null +++ b/vizro-ai/tests/score/data_classes.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass +from typing import Literal + + +@dataclass +class Component: + type: Literal["ag_grid", "card", "graph"] + + +@dataclass +class Control: + type: Literal["filter", "parameter"] diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py index 383e3bc8d..ea7b3583b 100644 --- a/vizro-ai/tests/score/prompts.py +++ b/vizro-ai/tests/score/prompts.py @@ -1,85 +1,201 @@ -easy_prompt = """ -I need a page with 1 table, 1 card and 1 chart. -The table shows the tech companies stock data. -The card says 'The Gapminder dataset provides historical data on countries' development indicators.' -The chart is the scatter plot which uses gapminder dataframe and showing life expectancy vs. GDP per capita by country. -Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. +from data_classes import Component, Control -The layout uses a grid of 2 columns and 3 rows. -The first row contains card -The second row contains chart -The third row contains table +easy_prompt = { + "tier_type": "easy", + "prompt_name": "one page + table + card + chart + 2 filters", + "prompt_text": """ + I need a page with 1 table, 1 card and 1 chart. + The table shows the tech companies stock data. + The card says 'The Gapminder dataset provides historical data on countries' development indicators.' + The chart is the scatter plot which uses gapminder dataframe + and showing life expectancy vs. GDP per capita by country. + Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent. -Add a filter to filter the scatter plot by continent. -Add a second filter to filter the table by companies. -""" + The layout uses a grid of 2 columns and 3 rows. + The first row contains card + The second row contains chart + The third row contains table -medium_prompt = """ - -I need a page with 1 table and 1 line chart. -The chart shows the stock price trends of GOOG and AAPL. -The table shows the stock prices data details. + Add a filter to filter the scatter plot by continent. + Add a second filter to filter the table by companies. + """, + "expected_config": { + "pages": [ + { + "components": [ + Component(type="ag_grid"), + Component(type="card"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + ], + }, + ], + }, +} - -I need a second page showing 3 cards and 4 charts. -The cards says 'The Gapminder dataset provides historical data on countries' development indicators.' -The charts are the scatter plots showing GDP per capita vs. life expectancy. -GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. -Layout the cards on the left and the chart on the right. -Add a filter to filter the scatter plots by continent. -Add a second filter to filter the charts by year. +medium_prompt = { + "tier_type": "medium", + "prompt_name": "4 pages with supported prompt instructions", + "prompt_text": """ + + I need a page with 1 table and 1 line chart. + The chart shows the stock price trends of GOOG and AAPL. + The table shows the stock prices data details. - -This page displays the tips dataset. use four different charts to show data -distributions. one chart should be a bar chart. the other should be a scatter plot. -next chart should be a line chart. last one should be an area plot. -first and second charts are on the left and the third and fourth charts are on the right. -Add a filter to filter data in every plot by smoker. + + I need a second page showing 3 cards and 4 charts. + The cards says 'The Gapminder dataset provides historical data on countries' development indicators.' + The charts are the scatter plots showing GDP per capita vs. life expectancy. + GDP per capita on the x axis, life expectancy on the y axis, and colored by continent. + Layout the cards on the left and the chart on the right. + Add a filter to filter the scatter plots by continent. + Add a second filter to filter the charts by year. - -Create 3 cards on this page: -1. The first card on top says "This page combines data from various sources - including tips, stock prices, and global indicators." -2. The second card says "Insights from Gapminder dataset." -3. The third card says "Stock price trends over time." + + This page displays the tips dataset. use four different charts to show data + distributions. one chart should be a bar chart. the other should be a scatter plot. + next chart should be a line chart. last one should be an area plot. + first and second charts are on the left and the third and fourth charts are on the right. + Add a filter to filter data in every plot by smoker. -Layout these 3 cards in this way: -create a grid with 3 columns and 2 rows. -Row 1: The first row has three columns: -- The first column is empty. -- The second and third columns span the area for card 1. + + Create 3 cards on this page: + 1. The first card on top says "This page combines data from various sources + including tips, stock prices, and global indicators." + 2. The second card says "Insights from Gapminder dataset." + 3. The third card says "Stock price trends over time." -Row 2: The second row also has three columns: -- The first column is empty. -- The second column is occupied by the area for card 2. -- The third column is occupied by the area for card 3. -""" + Layout these 3 cards in this way: + create a grid with 3 columns and 2 rows. + Row 1: The first row has three columns: + - The first column is empty. + - The second and third columns span the area for card 1. + Row 2: The second row also has three columns: + - The first column is empty. + - The second column is occupied by the area for card 2. + - The third column is occupied by the area for card 3. + """, + "expected_config": { + "pages": [ + { + "components": [ + Component(type="ag_grid"), + Component(type="graph"), + ], + "controls": [], + }, + { + "components": [ + Component(type="card"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="graph"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="card"), + Component(type="card"), + Component(type="card"), + ], + "controls": [], + }, + ], + }, +} -complex_prompt = """ - -Show me 1 table on the first page that shows tips and sorted by day -Using export button I want to export data to csv -Add filters by bill and by tip amount using range slider +complex_prompt = { + "tier_type": "complex", + "prompt_name": "4 pages with mix of supported and unsupported prompt instructions", + "prompt_text": """ + + Show me 1 table on the first page that shows tips and sorted by day + Using export button I want to export data to csv + Add filters by bill and by tip amount using slider - -Second page should contain kpi cards with population trends and -two popular charts that display population per capita vs. continent. -Filter charts by GDP using dropdown. -Align kpi cards in one row and charts in different. -Both charts should be in tabs. + + Second page should contain kpi cards with population trends and + two popular charts that display population per capita vs. continent. + Filter charts by GDP using dropdown. + Align kpi cards in one row and charts in different. + Both charts should be in tabs. - -Third page should contain 6 charts showing stocks. -Each should have separate filter by date. -Filter types should include dropdown, datepicker, slider, checklist and radio items. -Add parameter for any chart. + + Third page should contain 6 charts showing stocks. + Each should have separate filter by date. + Filter types should include dropdown, datepicker, slider, checklist and radio items. + Add parameter for any chart. - -Fourth page contains chart with wind data. -Table with population data. -Two more charts with stocks and tips representations. -Align table beautifully relative to the charts. -Every chart should have 2 filters. -Table should have 1 filter. -""" + + Fourth page contains chart with wind data. + Table with population per capita data. + Two more charts with stocks and tips representations. + Align table beautifully relative to the charts. + Every chart should have 2 filters. + Table should have 1 filter. + """, + "expected_config": { + "pages": [ + { + "components": [ + Component(type="ag_grid"), + ], + "controls": [Control(type="filter"), Control(type="filter")], + }, + { + "components": [Component(type="graph"), Component(type="graph")], + "controls": [Control(type="filter")], + }, + { + "components": [ + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + ], + }, + { + "components": [ + Component(type="ag_grid"), + Component(type="graph"), + Component(type="graph"), + Component(type="graph"), + ], + "controls": [ + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + Control(type="filter"), + ], + }, + ], + }, +} diff --git a/vizro-ai/tests/score/pytest.ini b/vizro-ai/tests/score/pytest.ini deleted file mode 100644 index 7f2efb67c..000000000 --- a/vizro-ai/tests/score/pytest.ini +++ /dev/null @@ -1,10 +0,0 @@ -[pytest] -markers = - easy_dashboard: mark test with easy prompt for dashboard creation. - medium_dashboard: mark test with medium prompt for dashboard creation. - complex_dashboard: mark test with complex prompt for dashboard creation. - -filterwarnings = - ignore::UserWarning - # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590: - ignore:HTTPResponse.getheader():DeprecationWarning diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 0815a48aa..5c7f4f9d5 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -4,9 +4,7 @@ import os import statistics from collections import Counter -from dataclasses import dataclass from datetime import datetime -from typing import Literal import chromedriver_autoinstaller import numpy as np @@ -23,16 +21,6 @@ df4 = px.data.wind() -@dataclass -class Component: - type: Literal["ag_grid", "card", "graph"] - - -@dataclass -class Control: - type: Literal["filter", "parameter"] - - @pytest.fixture(scope="module", autouse=True) def setup_test_environment(): # We only need to install chromedriver outside CI. @@ -46,11 +34,12 @@ def score_calculator(metrics_score: list[int]): return statistics.mean(metrics_score) if len(metrics_score) != 0 else 1 -def logic( # noqa: PLR0912, PLR0915 +def logic( # noqa: PLR0912, PLR0913, PLR0915 dashboard, model_name, dash_duo, prompt_tier, + prompt_name, prompt_text, config: dict, ): @@ -61,6 +50,7 @@ def logic( # noqa: PLR0912, PLR0915 model_name: GenAI model name dash_duo: dash_duo fixture prompt_tier: complexity of the prompt + prompt_name: short prompt description prompt_text: prompt text config: json config of the expected dashboard @@ -201,6 +191,7 @@ def logic( # noqa: PLR0912, PLR0915 python_version, model_name, prompt_tier, + prompt_name, prompt_text, weighted_score, ] @@ -218,6 +209,7 @@ def logic( # noqa: PLR0912, PLR0915 "python_version", "model", "prompt_tier", + "prompt_name", "prompt_text", "weighted_score", ] @@ -238,159 +230,50 @@ def logic( # noqa: PLR0912, PLR0915 print(f"Scores: {scores}") # noqa: T201 -@pytest.mark.easy_dashboard @pytest.mark.parametrize( "model_name", [ "gpt-4o-mini", - "claude-3-5-sonnet-latest", ], ids=[ "gpt-4o-mini", - "claude-3-5-sonnet-latest", ], ) -def test_easy_dashboard(dash_duo, model_name): - dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt) - - logic( - dashboard=dashboard, - model_name=model_name, - dash_duo=dash_duo, - prompt_tier="easy", - prompt_text=easy_prompt.replace("\n", " "), - config={ - "pages": [ - { - "components": [ - Component(type="ag_grid"), - Component(type="card"), - Component(type="graph"), - ], - "controls": [ - Control(type="filter"), - Control(type="filter"), - ], - }, - ], - }, - ) - - -@pytest.mark.medium_dashboard -@pytest.mark.parametrize("model_name", ["gpt-4o-mini"], ids=["gpt-4o-mini"]) -def test_medium_dashboard(dash_duo, model_name): - dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt) - - logic( - dashboard=dashboard, - model_name=model_name, - dash_duo=dash_duo, - prompt_tier="medium", - prompt_text=medium_prompt.replace("\n", " "), - config={ - "pages": [ - { - "components": [ - Component(type="ag_grid"), - Component(type="graph"), - ], - "controls": [], - }, - { - "components": [ - Component(type="card"), - Component(type="graph"), - ], - "controls": [ - Control(type="filter"), - Control(type="filter"), - ], - }, - { - "components": [ - Component(type="graph"), - Component(type="graph"), - ], - "controls": [ - Control(type="filter"), - ], - }, - { - "components": [ - Component(type="card"), - Component(type="card"), - Component(type="card"), - ], - "controls": [], - }, - ], - }, - ) - - -@pytest.mark.complex_dashboard @pytest.mark.parametrize( - "model_name", - ["gpt-4o-mini"], - ids=["gpt-4o-mini"], + "tier_type, prompt_name, prompt_text, expected_config, dfs", + [ + ( + easy_prompt["tier_type"], + easy_prompt["prompt_name"], + easy_prompt["prompt_text"], + easy_prompt["expected_config"], + [df1, df2], + ), + ( + medium_prompt["tier_type"], + medium_prompt["prompt_name"], + medium_prompt["prompt_text"], + medium_prompt["expected_config"], + [df1, df2, df3], + ), + ( + complex_prompt["tier_type"], + complex_prompt["prompt_name"], + complex_prompt["prompt_text"], + complex_prompt["expected_config"], + [df1, df2, df3, df4], + ), + ], ) -def test_complex_dashboard(dash_duo, model_name): - dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3, df4], complex_prompt) +def test_dashboard(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs): # noqa: PLR0913 + created_dashboard = VizroAI(model=model_name).dashboard(dfs, prompt_text) logic( - dashboard=dashboard, + dashboard=created_dashboard, model_name=model_name, dash_duo=dash_duo, - prompt_tier="complex", - prompt_text=complex_prompt.replace("\n", " "), - config={ - "pages": [ - { - "components": [ - Component(type="ag_grid"), - ], - "controls": [Control(type="filter"), Control(type="filter")], - }, - { - "components": [Component(type="graph"), Component(type="graph")], - "controls": [Control(type="filter")], - }, - { - "components": [ - Component(type="graph"), - Component(type="graph"), - Component(type="graph"), - Component(type="graph"), - Component(type="graph"), - Component(type="graph"), - ], - "controls": [ - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - ], - }, - { - "components": [ - Component(type="ag_grid"), - Component(type="graph"), - Component(type="graph"), - Component(type="graph"), - ], - "controls": [ - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - Control(type="filter"), - ], - }, - ], - }, + prompt_tier=tier_type, + prompt_name=prompt_name, + prompt_text=prompt_text.replace("\n", " "), + config=expected_config, ) From c52f3669ee6eb2da4085b5adf5d0a4e05b97f190 Mon Sep 17 00:00:00 2001 From: Alexey Snigir Date: Wed, 22 Jan 2025 19:06:15 +0100 Subject: [PATCH 14/14] separate anthropic test --- vizro-ai/tests/score/test_dashboard.py | 44 +++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py index 5c7f4f9d5..5590fb683 100644 --- a/vizro-ai/tests/score/test_dashboard.py +++ b/vizro-ai/tests/score/test_dashboard.py @@ -59,9 +59,8 @@ def logic( # noqa: PLR0912, PLR0913, PLR0915 report_dir = "tests/score/reports" os.makedirs(report_dir, exist_ok=True) - app = Vizro().build(dashboard).dash - try: + app = Vizro().build(dashboard).dash dash_duo.start_server(app) app_started = 1 app_started_report = "App started!" @@ -192,10 +191,10 @@ def logic( # noqa: PLR0912, PLR0913, PLR0915 model_name, prompt_tier, prompt_name, - prompt_text, weighted_score, ] data_rows.extend(score["score"] for score in scores) + data_rows.extend([prompt_text]) with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""): with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "r+", newline="") as csvfile: @@ -210,10 +209,10 @@ def logic( # noqa: PLR0912, PLR0913, PLR0915 "model", "prompt_tier", "prompt_name", - "prompt_text", "weighted_score", ] header_rows.extend(score["score_name"] for score in scores) + header_rows.extend(["prompt_text"]) writer.writerow(header_rows) writer.writerow(data_rows) @@ -265,7 +264,42 @@ def logic( # noqa: PLR0912, PLR0913, PLR0915 ), ], ) -def test_dashboard(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs): # noqa: PLR0913 +def test_dashboard_openai(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs): # noqa: PLR0913 + created_dashboard = VizroAI(model=model_name).dashboard(dfs, prompt_text) + + logic( + dashboard=created_dashboard, + model_name=model_name, + dash_duo=dash_duo, + prompt_tier=tier_type, + prompt_name=prompt_name, + prompt_text=prompt_text.replace("\n", " "), + config=expected_config, + ) + + +@pytest.mark.parametrize( + "model_name", + [ + "claude-3-5-sonnet-latest", + ], + ids=[ + "claude-3-5-sonnet-latest", + ], +) +@pytest.mark.parametrize( + "tier_type, prompt_name, prompt_text, expected_config, dfs", + [ + ( + easy_prompt["tier_type"], + easy_prompt["prompt_name"], + easy_prompt["prompt_text"], + easy_prompt["expected_config"], + [df1, df2], + ), + ], +) +def test_dashboard_anthropic(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs): # noqa: PLR0913 created_dashboard = VizroAI(model=model_name).dashboard(dfs, prompt_text) logic(