From 98f6f4ddf4805fdef0fec624edca474d8ffd7f6b Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Tue, 24 Dec 2024 13:55:41 +0100
Subject: [PATCH 01/14] complex prompt, update report, improvements

---
 .github/workflows/test-score-vizro-ai.yml |  25 +-
 vizro-ai/hatch.toml                       |   2 +-
 vizro-ai/tests/score/prompts.py           | 105 +++++++++
 vizro-ai/tests/score/pytest.ini           |   6 +
 vizro-ai/tests/score/test_dashboard.py    | 273 ++++++++++------------
 5 files changed, 243 insertions(+), 168 deletions(-)
 create mode 100644 vizro-ai/tests/score/prompts.py
diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml
index 813ecba8c..033b8fa0e 100644
--- a/.github/workflows/test-score-vizro-ai.yml
+++ b/.github/workflows/test-score-vizro-ai.yml
@@ -5,7 +5,13 @@ defaults:
     working-directory: vizro-ai
 
 on:
+  schedule:
+    - cron: "30 10 * * 1" # run every Monday at 10:30 UTC
   workflow_dispatch:
+  #temporary for development
+  pull_request:
+    branches:
+      - main
 
 env:
   PYTHONUNBUFFERED: 1
@@ -20,17 +26,8 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - python-version: "3.9"
-            hatch-env: all.py3.9
-          - python-version: "3.10"
-            hatch-env: all.py3.10
-          - python-version: "3.11"
-            hatch-env: all.py3.11
           - python-version: "3.12"
             hatch-env: all.py3.12
-          - python-version: "3.9"
-            hatch-env: lower-bounds
-            label: lower bounds
 
     steps:
       - uses: actions/checkout@v4
@@ -46,17 +43,9 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - python-version: "3.9"
-            hatch-env: all.py3.9
-          - python-version: "3.10"
-            hatch-env: all.py3.10
-          - python-version: "3.11"
-            hatch-env: all.py3.11
           - python-version: "3.12"
             hatch-env: all.py3.12
-          - python-version: "3.9"
-            hatch-env: lower-bounds
-            label: lower bounds
+
 
     steps:
       - uses: actions/checkout@v4
diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml
index 384d15d57..371590661 100644
--- a/vizro-ai/hatch.toml
+++ b/vizro-ai/hatch.toml
@@ -49,7 +49,7 @@ prep-release = [
 pypath = "hatch run python -c 'import sys; print(sys.executable)'"
 test = "pytest tests {args}"
 test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}"
-test-score = "pytest -vs --reruns 1 tests/score --headless {args}"
+test-score = "pytest -vs tests/score --headless {args}"
 test-unit = "pytest tests/unit {args}"
 test-unit-coverage = [
   "coverage run -m pytest tests/unit {args}",
diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py
new file mode 100644
index 000000000..47858f694
--- /dev/null
+++ b/vizro-ai/tests/score/prompts.py
@@ -0,0 +1,105 @@
+easy_prompt = """
+I need a page with 1 table.
+The table shows the tech companies stock data.
+
+I need a second page showing 2 cards and one chart.
+The first card says 'The Gapminder dataset provides historical data on countries' development indicators.'
+The chart is an scatter plot showing life expectancy vs. GDP per capita by country.
+Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent.
+The second card says 'Data spans from 1952 to 2007 across various countries.'
+The layout uses a grid of 3 columns and 2 rows.
+
+Row 1: The first row has three columns:
+The first column is occupied by the first card.
+The second and third columns are spanned by the chart.
+
+Row 2: The second row mirrors the layout of the first row with respect to chart,
+but the first column is occupied by the second card.
+
+Add a filter to filter the scatter plot by continent.
+Add a second filter to filter the chart by year.
+"""
+
+medium_prompt = """
+<Page 1>
+I need a page with 1 table and 1 line chart.
+The chart shows the stock price trends of GOOG and AAPL.
+The table shows the stock prices data details.
+
+<Page 2>
+I need a second page showing 1 card and 1 chart.
+The card says 'The Gapminder dataset provides historical data on countries' development indicators.'
+The chart is a scatter plot showing GDP per capita vs. life expectancy.
+GDP per capita on the x axis, life expectancy on the y axis, and colored by continent.
+Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left.
+The chart takes 2/3 of the whole space and is on the right.
+Add a filter to filter the scatter plot by continent.
+Add a second filter to filter the chart by year.
+
+<Page 3>
+This page displays the tips dataset. use two different charts to show data
+distributions. one chart should be a bar chart and the other should be a scatter plot.
+first chart is on the left and the second chart is on the right.
+Add a filter to filter data in the scatter plot by smoker.
+
+<Page 4>
+Create 3 cards on this page:
+1. The first card on top says "This page combines data from various sources
+ including tips, stock prices, and global indicators."
+2. The second card says "Insights from Gapminder dataset."
+3. The third card says "Stock price trends over time."
+
+Layout these 3 cards in this way:
+create a grid with 3 columns and 2 rows.
+Row 1: The first row has three columns:
+- The first column is empty.
+- The second and third columns span the area for card 1.
+
+Row 2: The second row also has three columns:
+- The first column is empty.
+- The second column is occupied by the area for card 2.
+- The third column is occupied by the area for card 3.
+    """
+
+
+complex_prompt = """
+<Page 1>
+I need a page with 1 table and 3 line charts.
+The chart shows the stock price trends of GOOG and AAPL.
+The table shows the stock prices data details.
+Add 3 filters to filter the line chart by companies.
+
+<Page 2>
+I need a second page showing 3 cards and 4 charts.
+The cards say 'The Gapminder dataset provides historical data on countries' development indicators.'
+The charts are the scatter plots showing GDP per capita vs. life expectancy.
+GDP per capita on the x axis, life expectancy on the y axis, and colored by continent.
+Layout the cards on the left and the charts on the right.
+Add a filter to filter the scatter plots by continent.
+Add a second filter to filter the charts by year.
+
+<Page 3>
+This page displays the tips dataset. use four different charts to show data
+distributions. one chart should be a bar chart. the other should be a scatter plot.
+next chart should be a line chart. last one should be an area plot.
+first and second charts are on the left and the third and fourth charts are on the right.
+Add a filter to filter data in every plot by smoker.
+
+<Page 4>
+Create 3 cards on this page:
+1. The first card on top says "This page combines data from various sources
+ including tips, stock prices, and global indicators."
+2. The second card says "Insights from Gapminder dataset."
+3. The third card says "Stock price trends over time."
+
+Layout these 3 cards in this way:
+create a grid with 3 columns and 2 rows.
+Row 1: The first row has three columns:
+- The first column is empty.
+- The second and third columns span the area for card 1.
+
+Row 2: The second row also has three columns:
+- The first column is empty.
+- The second column is occupied by the area for card 2.
+- The third column is occupied by the area for card 3.
+    """
\ No newline at end of file
diff --git a/vizro-ai/tests/score/pytest.ini b/vizro-ai/tests/score/pytest.ini
index 8b3381827..7f2efb67c 100644
--- a/vizro-ai/tests/score/pytest.ini
+++ b/vizro-ai/tests/score/pytest.ini
@@ -2,3 +2,9 @@
 markers =
     easy_dashboard: mark test with easy prompt for dashboard creation.
     medium_dashboard: mark test with medium prompt for dashboard creation.
+    complex_dashboard: mark test with complex prompt for dashboard creation.
+
+filterwarnings =
+    ignore::UserWarning
+    # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590:
+    ignore:HTTPResponse.getheader():DeprecationWarning
diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 53d2e9033..5c2bac14a 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -10,11 +10,11 @@
 import chromedriver_autoinstaller
 import pytest
 import vizro.plotly.express as px
+import numpy as np
 from vizro import Vizro
 
 from vizro_ai import VizroAI
-
-vizro_ai = VizroAI()
+from prompts import easy_prompt, medium_prompt, complex_prompt
 
 df1 = px.data.gapminder()
 df2 = px.data.stocks()
@@ -22,12 +22,12 @@
 
 
 @dataclass
-class Components:
+class Component:
     type: Literal["ag_grid", "card", "graph"]
 
 
 @dataclass
-class Controls:
+class Control:
     type: Literal["filter", "parameter"]
 
 
@@ -43,6 +43,7 @@ def logic(  # noqa: PLR0912, PLR0915
     model_name,
     dash_duo,
     prompt_tier,
+    prompt_text,
     config: dict,
 ):
     """Calculates all separate scores. Creates csv report.
@@ -52,6 +53,7 @@ def logic(  # noqa: PLR0912, PLR0915
         model_name: GenAI model name
         dash_duo: dash_duo fixture
         prompt_tier: complexity of the prompt
+        prompt_text: prompt text
         config: json config of the expected dashboard
 
     """
@@ -161,73 +163,40 @@ def logic(  # noqa: PLR0912, PLR0915
     pages_exist.extend(pages_num)
 
     # Every separate score has its own weight.
-    app_started_score = {"weight": 0.4, "score": app_started}
-    no_browser_console_errors_score = {"weight": 0.1, "score": no_browser_console_errors}
-    pages_score = {"weight": 0.2, "score": sum(pages_exist) / len(pages_exist)}
-    components_score = {"weight": 0.1, "score": sum(components_num) / len(components_num)}
-    component_types_score = {"weight": 0.1, "score": sum(components_types_names) / len(components_types_names)}
-    controls_score = {"weight": 0.1, "score": sum(controls_num) / len(controls_num)}
-    controls_types_score = {"weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)}
-
     scores = [
-        app_started_score,
-        no_browser_console_errors_score,
-        pages_score,
-        components_score,
-        component_types_score,
-        controls_score,
-        controls_types_score,
+        {"score_name": "app_started_score", "weight": 0.4, "score": app_started},
+        {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors},
+        {"score_name": "pages_score", "weight": 0.2, "score": sum(pages_exist) / len(pages_exist)},
+        {"score_name": "components_score", "weight": 0.1, "score": sum(components_num) / len(components_num)},
+        {"score_name": "component_types_score", "weight": 0.1, "score": sum(components_types_names) / len(components_types_names)},
+        {"score_name": "controls_score", "weight": 0.1, "score": sum(controls_num) / len(controls_num)},
+        {"score_name": "controls_types_score", "weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)},
     ]
-    # total_weight should be equal to 1
-    total_weight = sum(score["weight"] for score in scores)
-    # If total_weight is not equal to 1, we're recalculating weights for every separate score
-    # and calculating final weighted_score for the created dashboard
-    if total_weight != 1:
-        scores = [{"weight": score["weight"] / total_weight, "score": score["score"]} for score in scores]
-    weighted_score = round(sum(score["weight"] * score["score"] for score in scores), 1)
 
-    # csv report creation
+    scores_values = np.array([score["score"] for score in scores])
+    weights = np.array([score["weight"] for score in scores])
+    weighted_score = np.average(scores_values, weights=weights)
 
-    data_rows = [
-        datetime.now(),
-        vizro_type,
-        branch,
-        python_version,
-        model_name,
-        prompt_tier,
-        weighted_score,
-        app_started_score["score"],
-        no_browser_console_errors_score["score"],
-        pages_score["score"],
-        components_score["score"],
-        component_types_score["score"],
-        controls_score["score"],
-        controls_types_score["score"],
-    ]
+    # csv report creation
+    data_rows = [datetime.now(), vizro_type, branch, python_version, model_name, prompt_tier, prompt_text, weighted_score]
+    data_rows.extend(score["score"] for score in scores)
 
     with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""):
         with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "r+", newline="") as csvfile:
             writer = csv.writer(csvfile, delimiter=",")
             first_line = csvfile.readline()
             if not first_line:
-                writer.writerow(
-                    [
+                header_rows = [
                         "timestamp",
                         "vizro_type",
                         "branch",
                         "python_version",
                         "model",
                         "prompt_tier",
-                        "weighted_score",
-                        "app_started_score",
-                        "no_browser_console_errors_score",
-                        "pages_score",
-                        "components_score",
-                        "component_types_score",
-                        "controls_score",
-                        "controls_types_score",
-                    ]
-                )
+                        "prompt_text",
+                        "weighted_score"]
+                header_rows.extend(score["score_name"] for score in scores)
+                writer.writerow(header_rows)
                 writer.writerow(data_rows)
             else:
                 writer.writerow(data_rows)
@@ -248,59 +217,40 @@ def logic(  # noqa: PLR0912, PLR0915
 @pytest.mark.easy_dashboard
 @pytest.mark.parametrize(
     "model_name",
-    ["gpt-4o-mini"],
-    ids=["gpt-4o-mini"],
-)
-@pytest.mark.filterwarnings("ignore::langchain_core._api.beta_decorator.LangChainBetaWarning")
-@pytest.mark.filterwarnings("ignore::UserWarning")
-@pytest.mark.filterwarnings("ignore:HTTPResponse.getheader()")
+    [
+        "gpt-4o-mini",
+        "claude-3-5-sonnet-latest",
+    ],
+    ids=[
+        "gpt-4o-mini",
+        "claude-3-5-sonnet-latest",
+    ])
 def test_easy_dashboard(dash_duo, model_name):
-    input_text = """
-    I need a page with 1 table.
-    The table shows the tech companies stock data.
-
-    I need a second page showing 2 cards and one chart.
-    The first card says 'The Gapminder dataset provides historical data on countries' development indicators.'
-    The chart is an scatter plot showing life expectancy vs. GDP per capita by country.
-    Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent.
-    The second card says 'Data spans from 1952 to 2007 across various countries.'
-    The layout uses a grid of 3 columns and 2 rows.
-
-    Row 1: The first row has three columns:
-    The first column is occupied by the first card.
-    The second and third columns are spanned by the chart.
-
-    Row 2: The second row mirrors the layout of the first row with respect to chart,
-    but the first column is occupied by the second card.
-
-    Add a filter to filter the scatter plot by continent.
-    Add a second filter to filter the chart by year.
-    """
-
-    dashboard = vizro_ai.dashboard([df1, df2], input_text)
+    dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt)
 
     logic(
         dashboard=dashboard,
         model_name=model_name,
         dash_duo=dash_duo,
         prompt_tier="easy",
+        prompt_text=easy_prompt.replace("\n", " "),
         config={
             "pages": [
                 {
                     "components": [
-                        Components(type="ag_grid"),
+                        Component(type="ag_grid"),
                     ],
                     "controls": [],
                 },
                 {
                     "components": [
-                        Components(type="card"),
-                        Components(type="card"),
-                        Components(type="graph"),
+                        Component(type="card"),
+                        Component(type="card"),
+                        Component(type="graph"),
                     ],
                     "controls": [
-                        Controls(type="filter"),
-                        Controls(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
                     ],
                 },
             ],
@@ -312,93 +262,118 @@ def test_easy_dashboard(dash_duo, model_name):
 @pytest.mark.parametrize(
     "model_name",
     ["gpt-4o-mini"],
-    ids=["gpt-4o-mini"],
-)
-@pytest.mark.filterwarnings("ignore::langchain_core._api.beta_decorator.LangChainBetaWarning")
-@pytest.mark.filterwarnings("ignore::UserWarning")
-@pytest.mark.filterwarnings("ignore:HTTPResponse.getheader()")
+    ids=["gpt-4o-mini"])
 def test_medium_dashboard(dash_duo, model_name):
-    input_text = """
-    <Page 1>
-    I need a page with 1 table and 1 line chart.
-    The chart shows the stock price trends of GOOG and AAPL.
-    The table shows the stock prices data details.
-
-    <Page 2>
-    I need a second page showing 1 card and 1 chart.
-    The card says 'The Gapminder dataset provides historical data on countries' development indicators.'
-    The chart is a scatter plot showing GDP per capita vs. life expectancy.
-    GDP per capita on the x axis, life expectancy on the y axis, and colored by continent.
-    Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left.
-    The chart takes 2/3 of the whole space and is on the right.
-    Add a filter to filter the scatter plot by continent.
-    Add a second filter to filter the chart by year.
-
-    <Page 3>
-    This page displays the tips dataset. use two different charts to show data
-    distributions. one chart should be a bar chart and the other should be a scatter plot.
-    first chart is on the left and the second chart is on the right.
-    Add a filter to filter data in the scatter plot by smoker.
-
-    <Page 4>
-    Create 3 cards on this page:
-    1. The first card on top says "This page combines data from various sources
-     including tips, stock prices, and global indicators."
-    2. The second card says "Insights from Gapminder dataset."
-    3. The third card says "Stock price trends over time."
-
-    Layout these 3 cards in this way:
-    create a grid with 3 columns and 2 rows.
-    Row 1: The first row has three columns:
-    - The first column is empty.
-    - The second and third columns span the area for card 1.
-
-    Row 2: The second row also has three columns:
-    - The first column is empty.
-    - The second column is occupied by the area for card 2.
-    - The third column is occupied by the area for card 3.
-        """
-
-    dashboard = vizro_ai.dashboard([df1, df2, df3], input_text)
+    dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt)
 
     logic(
         dashboard=dashboard,
         model_name=model_name,
         dash_duo=dash_duo,
         prompt_tier="medium",
+        prompt_text=medium_prompt.replace("\n", " "),
         config={
             "pages": [
                 {
                     "components": [
-                        Components(type="ag_grid"),
-                        Components(type="graph"),
+                        Component(type="ag_grid"),
+                        Component(type="graph"),
                     ],
                     "controls": [],
                 },
                 {
                     "components": [
-                        Components(type="card"),
-                        Components(type="graph"),
+                        Component(type="card"),
+                        Component(type="graph"),
+                    ],
+                    "controls": [
+                        Control(type="filter"),
+                        Control(type="filter"),
+                    ],
+                },
+                {
+                    "components": [
+                        Component(type="graph"),
+                        Component(type="graph"),
+                    ],
+                    "controls": [
+                        Control(type="filter"),
+                    ],
+                },
+                {
+                    "components": [
+                        Component(type="card"),
+                        Component(type="card"),
+                        Component(type="card"),
+                    ],
+                    "controls": [],
+                },
+            ],
+        },
+    )
+
+
+@pytest.mark.complex_dashboard
+@pytest.mark.parametrize(
+    "model_name",
+    ["gpt-4o-mini"],
+    ids=["gpt-4o-mini"],
+)
+def test_complex_dashboard(dash_duo, model_name):
+    dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], complex_prompt)
+
+    logic(
+        dashboard=dashboard,
+        model_name=model_name,
+        dash_duo=dash_duo,
+        prompt_tier="complex",
+        prompt_text=complex_prompt.replace("\n", " "),
+        config={
+            "pages": [
+                {
+                    "components": [
+                        Component(type="ag_grid"),
+                        Component(type="graph"),
+                        Component(type="graph"),
+                        Component(type="graph"),
+                    ],
+                    "controls": [
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter")
+                    ],
+                },
+                {
+                    "components": [
+                        Component(type="card"),
+                        Component(type="card"),
+                        Component(type="card"),
+                        Component(type="graph"),
+                        Component(type="graph"),
+                        Component(type="graph"),
+                        Component(type="graph"),
                     ],
                     "controls": [
-                        Controls(type="filter"),
-                        Controls(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
                     ],
                 },
                 {
                     "components": [
-                        Components(type="graph"),
-                        Components(type="graph"),
+                        Component(type="graph"),
+                        Component(type="graph"),
+                        Component(type="graph"),
+                        Component(type="graph"),
                     ],
                     "controls": [
-                        Controls(type="filter"),
+                        Control(type="filter"),
                     ],
                 },
                 {
                     "components": [
-                        Components(type="card"),
-                        Components(type="card"),
-                        Components(type="card"),
+                        Component(type="card"),
+                        Component(type="card"),
+                        Component(type="card"),
                     ],
                     "controls": [],
                 },

From 736254e8aa8d0b350e7d3236aea41f12ed958f73 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Tue, 24 Dec 2024 13:56:20 +0100
Subject: [PATCH 02/14] changelog

---
 ..._alexey_snigir_score_tests_improvements.md | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md

diff --git a/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md b/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md
new file mode 100644
index 000000000..7c0d58d4f
--- /dev/null
+++ b/vizro-ai/changelog.d/20241224_135602_alexey_snigir_score_tests_improvements.md
@@ -0,0 +1,48 @@
+<!--
+A new scriv changelog fragment.
+
+Uncomment the section that is right (remove the HTML comment wrapper).
+-->
+
+<!--
+### Highlights ✨
+
+- A bullet item for the Highlights ✨ category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->
+<!--
+### Removed
+
+- A bullet item for the Removed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->
+<!--
+### Added
+
+- A bullet item for the Added category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->
+<!--
+### Changed
+
+- A bullet item for the Changed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->
+<!--
+### Deprecated
+
+- A bullet item for the Deprecated category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->
+<!--
+### Fixed
+
+- A bullet item for the Fixed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->
+<!--
+### Security
+
+- A bullet item for the Security category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1))
+
+-->

From 99d0fd7987ba8417fdd29ccc9348952d933fcf0c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 24 Dec 2024 12:57:28 +0000
Subject: [PATCH 03/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .github/workflows/test-score-vizro-ai.yml |  1 -
 vizro-ai/tests/score/prompts.py           |  2 +-
 vizro-ai/tests/score/test_dashboard.py    | 58 ++++++++++++++---------
 3 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml
index 033b8fa0e..a124cbc6c 100644
--- a/.github/workflows/test-score-vizro-ai.yml
+++ b/.github/workflows/test-score-vizro-ai.yml
@@ -46,7 +46,6 @@ jobs:
           - python-version: "3.12"
             hatch-env: all.py3.12
 
-
     steps:
       - uses: actions/checkout@v4
 
diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py
index 47858f694..b507f051c 100644
--- a/vizro-ai/tests/score/prompts.py
+++ b/vizro-ai/tests/score/prompts.py
@@ -102,4 +102,4 @@
 - The first column is empty.
 - The second column is occupied by the area for card 2.
 - The third column is occupied by the area for card 3.
-    """
\ No newline at end of file
+    """
diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 5c2bac14a..5681ee89d 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -8,13 +8,13 @@
 from typing import Literal
 
 import chromedriver_autoinstaller
+import numpy as np
 import pytest
 import vizro.plotly.express as px
-import numpy as np
+from prompts import complex_prompt, easy_prompt, medium_prompt
 from vizro import Vizro
 
 from vizro_ai import VizroAI
-from prompts import easy_prompt, medium_prompt, complex_prompt
 
 df1 = px.data.gapminder()
 df2 = px.data.stocks()
@@ -168,9 +168,17 @@ def logic(  # noqa: PLR0912, PLR0915
         {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors},
         {"score_name": "pages_score", "weight": 0.2, "score": sum(pages_exist) / len(pages_exist)},
         {"score_name": "components_score", "weight": 0.1, "score": sum(components_num) / len(components_num)},
-        {"score_name": "component_types_score", "weight": 0.1, "score": sum(components_types_names) / len(components_types_names)},
+        {
+            "score_name": "component_types_score",
+            "weight": 0.1,
+            "score": sum(components_types_names) / len(components_types_names),
+        },
         {"score_name": "controls_score", "weight": 0.1, "score": sum(controls_num) / len(controls_num)},
-        {"score_name": "controls_types_score", "weight": 0.1, "score": sum(controls_types_names) / len(controls_types_names)},
+        {
+            "score_name": "controls_types_score",
+            "weight": 0.1,
+            "score": sum(controls_types_names) / len(controls_types_names),
+        },
     ]
 
     scores_values = np.array([score["score"] for score in scores])
@@ -178,7 +186,16 @@ def logic(  # noqa: PLR0912, PLR0915
     weighted_score = np.average(scores_values, weights=weights)
 
     # csv report creation
-    data_rows = [datetime.now(), vizro_type, branch, python_version, model_name, prompt_tier, prompt_text, weighted_score]
+    data_rows = [
+        datetime.now(),
+        vizro_type,
+        branch,
+        python_version,
+        model_name,
+        prompt_tier,
+        prompt_text,
+        weighted_score,
+    ]
     data_rows.extend(score["score"] for score in scores)
 
     with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""):
@@ -187,14 +204,15 @@ def logic(  # noqa: PLR0912, PLR0915
             first_line = csvfile.readline()
             if not first_line:
                 header_rows = [
-                        "timestamp",
-                        "vizro_type",
-                        "branch",
-                        "python_version",
-                        "model",
-                        "prompt_tier",
-                        "prompt_text",
-                        "weighted_score"]
+                    "timestamp",
+                    "vizro_type",
+                    "branch",
+                    "python_version",
+                    "model",
+                    "prompt_tier",
+                    "prompt_text",
+                    "weighted_score",
+                ]
                 header_rows.extend(score["score_name"] for score in scores)
                 writer.writerow(header_rows)
                 writer.writerow(data_rows)
@@ -224,7 +242,8 @@ def logic(  # noqa: PLR0912, PLR0915
     ids=[
         "gpt-4o-mini",
         "claude-3-5-sonnet-latest",
-    ])
+    ],
+)
 def test_easy_dashboard(dash_duo, model_name):
     dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt)
 
@@ -259,10 +278,7 @@ def test_easy_dashboard(dash_duo, model_name):
 
 
 @pytest.mark.medium_dashboard
-@pytest.mark.parametrize(
-    "model_name",
-    ["gpt-4o-mini"],
-    ids=["gpt-4o-mini"])
+@pytest.mark.parametrize("model_name", ["gpt-4o-mini"], ids=["gpt-4o-mini"])
 def test_medium_dashboard(dash_duo, model_name):
     dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt)
 
@@ -337,11 +353,7 @@ def test_complex_dashboard(dash_duo, model_name):
                         Component(type="graph"),
                         Component(type="graph"),
                     ],
-                    "controls": [
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter")
-                    ],
+                    "controls": [Control(type="filter"), Control(type="filter"), Control(type="filter")],
                 },
                 {
                     "components": [

From 34a7ccf9651776996f89597ea839ec0c4dafa303 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Fri, 27 Dec 2024 11:17:08 +0100
Subject: [PATCH 04/14] add anthropic creds

---
 .github/workflows/test-score-vizro-ai.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml
index 033b8fa0e..29d11287b 100644
--- a/.github/workflows/test-score-vizro-ai.yml
+++ b/.github/workflows/test-score-vizro-ai.yml
@@ -66,6 +66,8 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
           VIZRO_TYPE: pypi
           BRANCH: ${{ github.head_ref }}
           PYTHON_VERSION: ${{ matrix.config.python-version }}
@@ -77,6 +79,8 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
           VIZRO_TYPE: local
           BRANCH: ${{ github.head_ref }}
           PYTHON_VERSION: ${{ matrix.config.python-version }}

From 7139fb5a7fbc29da75b388acd678586726c71783 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Fri, 27 Dec 2024 11:45:33 +0100
Subject: [PATCH 05/14] fix report aggregated

---
 .github/workflows/test-score-vizro-ai.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml
index e41928ce8..15f64fd91 100644
--- a/.github/workflows/test-score-vizro-ai.yml
+++ b/.github/workflows/test-score-vizro-ai.yml
@@ -123,7 +123,7 @@ jobs:
       - name: Create one csv report
         run: |
           cd /home/runner/work/vizro/vizro/
-          head -n 1 Report-3.11-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv
+          head -n 1 Report-3.12-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv
           gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv
 
       - name: Report artifacts

From 459f482264f0bfd9aa6c1e4e809bba96a7f1fadc Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Thu, 16 Jan 2025 17:20:03 +0100
Subject: [PATCH 06/14] complex prompt and review fixes

---
 .github/workflows/test-score-vizro-ai.yml |  5 +-
 vizro-ai/tests/score/prompts.py           | 76 +++++++++--------------
 vizro-ai/tests/score/test_dashboard.py    | 74 +++++++++++-----------
 3 files changed, 69 insertions(+), 86 deletions(-)

diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml
index 15f64fd91..afd4d3caa 100644
--- a/.github/workflows/test-score-vizro-ai.yml
+++ b/.github/workflows/test-score-vizro-ai.yml
@@ -123,8 +123,9 @@ jobs:
       - name: Create one csv report
         run: |
           cd /home/runner/work/vizro/vizro/
-          head -n 1 Report-3.12-/report_model_gpt-4o-mini_pypi.csv > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv
-          gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv
+          ls */*.csv | head -n1 | xargs head -n1 > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv
+          # replace all timestamps in aggregated report to current date
+          gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' report-aggregated-${{ steps.date.outputs.date }}.csv
 
       - name: Report artifacts
         uses: actions/upload-artifact@v4
diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py
index b507f051c..fac3454b3 100644
--- a/vizro-ai/tests/score/prompts.py
+++ b/vizro-ai/tests/score/prompts.py
@@ -1,23 +1,17 @@
 easy_prompt = """
-I need a page with 1 table.
+I need a page with 1 table, 1 card and 1 chart.
 The table shows the tech companies stock data.
-
-I need a second page showing 2 cards and one chart.
-The first card says 'The Gapminder dataset provides historical data on countries' development indicators.'
-The chart is an scatter plot showing life expectancy vs. GDP per capita by country.
+The card says 'The Gapminder dataset provides historical data on countries' development indicators.'
+The chart is the scatter plot which uses gapminder dataframe and showing life expectancy vs. GDP per capita by country.
 Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent.
-The second card says 'Data spans from 1952 to 2007 across various countries.'
-The layout uses a grid of 3 columns and 2 rows.
-
-Row 1: The first row has three columns:
-The first column is occupied by the first card.
-The second and third columns are spanned by the chart.
 
-Row 2: The second row mirrors the layout of the first row with respect to chart,
-but the first column is occupied by the second card.
+The layout uses a grid of 2 columns and 3 rows.
+The first row contains card
+The second row contains chart
+The third row contains table
 
 Add a filter to filter the scatter plot by continent.
-Add a second filter to filter the chart by year.
+Add a second filter to filter the table by companies.
 """
 
 medium_prompt = """
@@ -59,47 +53,33 @@
 - The first column is empty.
 - The second column is occupied by the area for card 2.
 - The third column is occupied by the area for card 3.
-    """
+"""
 
 
 complex_prompt = """
 <Page 1>
-I need a page with 1 table and 3 line charts.
-The chart shows the stock price trends of GOOG and AAPL.
-The table shows the stock prices data details.
-Add 3 filters to filter the line chart by companies.
+Show me 1 table on the first page that shows tips and sorted by day
+Using export button I want to export data to csv
+Add filters by bill and by tip amount using range slider
 
 <Page 2>
-I need a second page showing 1 card and 1 chart.
-The card says 'The Gapminder dataset provides historical data on countries' development indicators.'
-The chart is a scatter plot showing GDP per capita vs. life expectancy.
-GDP per capita on the x axis, life expectancy on the y axis, and colored by continent.
-Layout the card on the left and the chart on the right. The card takes 1/3 of the whole space on the left.
-The chart takes 2/3 of the whole space and is on the right.
-Add a filter to filter the scatter plot by continent.
-Add a second filter to filter the chart by year.
+Second page should contain kpi cards with population trends and
+two popular charts that display population per capita vs. continent.
+Filter charts by GDP using slider.
+Align kpi cards in one row and charts in different.
+Both charts should be in tabs.
 
 <Page 3>
-This page displays the tips dataset. use two different charts to show data
-distributions. one chart should be a bar chart and the other should be a scatter plot.
-first chart is on the left and the second chart is on the right.
-Add a filter to filter data in the scatter plot by smoker.
+Third page should contain 6 charts showing stocks.
+Each should have separate filter by date.
+Filter types should include dropdown, datepicker, slider, checklist and radio items.
+Add parameter for any chart.
 
 <Page 4>
-Create 3 cards on this page:
-1. The first card on top says "This page combines data from various sources
- including tips, stock prices, and global indicators."
-2. The second card says "Insights from Gapminder dataset."
-3. The third card says "Stock price trends over time."
-
-Layout these 3 cards in this way:
-create a grid with 3 columns and 2 rows.
-Row 1: The first row has three columns:
-- The first column is empty.
-- The second and third columns span the area for card 1.
-
-Row 2: The second row also has three columns:
-- The first column is empty.
-- The second column is occupied by the area for card 2.
-- The third column is occupied by the area for card 3.
-    """
+Fourth page contains chart with wind data.
+Table with GDP data.
+Two more charts with stocks and tips representations.
+Align table beautifully relative to the charts.
+Every chart should have 2 filters.
+Table should have 1 filter.
+"""
diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 5681ee89d..632ca0298 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -2,6 +2,7 @@
 
 import csv
 import os
+import statistics
 from collections import Counter
 from dataclasses import dataclass
 from datetime import datetime
@@ -19,6 +20,7 @@
 df1 = px.data.gapminder()
 df2 = px.data.stocks()
 df3 = px.data.tips()
+df4 = px.data.wind()
 
 
 @dataclass
@@ -38,6 +40,12 @@ def setup_test_environment():
         chromedriver_autoinstaller.install()
 
 
+# If len() is 0, it means that nothing was entered for this score in config,
+# in this case in should be 1.0.
+def score_calculator(score_name):
+    return statistics.mean(score_name) if len(score_name) != 0 else 1.0
+
+
 def logic(  # noqa: PLR0912, PLR0915
     dashboard,
     model_name,
@@ -91,8 +99,8 @@ def logic(  # noqa: PLR0912, PLR0915
         branch = "local"
         python_version = "local"
 
-    pages_exist = [1 if dashboard.pages else 0]
-    pages_exist_report = bool(pages_exist[0])
+    pages_exist = [1.0 if dashboard.pages else 0][0]
+    pages_exist_report = bool(pages_exist)
     pages_num = [1 if len(dashboard.pages) == len(config["pages"]) else 0]
     pages_num_report = [f'{len(config["pages"])} page(s) for dashboard is {bool(pages_num[0])}']
 
@@ -160,24 +168,24 @@ def logic(  # noqa: PLR0912, PLR0915
         controls_types_names.append(controls_types)
         controls_types_names_report.append("page or control does not exists")
 
-    pages_exist.extend(pages_num)
 
     # Every separate score has its own weight.
     scores = [
         {"score_name": "app_started_score", "weight": 0.4, "score": app_started},
         {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors},
-        {"score_name": "pages_score", "weight": 0.2, "score": sum(pages_exist) / len(pages_exist)},
-        {"score_name": "components_score", "weight": 0.1, "score": sum(components_num) / len(components_num)},
+        {"score_name": "pages_score", "weight": 0.3, "score": pages_exist},
+        {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(score_name=pages_num)},
+        {"score_name": "components_score", "weight": 0.2, "score": score_calculator(score_name=components_num)},
         {
             "score_name": "component_types_score",
-            "weight": 0.1,
-            "score": sum(components_types_names) / len(components_types_names),
+            "weight": 0.2,
+            "score": score_calculator(score_name=components_types_names),
         },
-        {"score_name": "controls_score", "weight": 0.1, "score": sum(controls_num) / len(controls_num)},
+        {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(score_name=controls_num)},
         {
             "score_name": "controls_types_score",
-            "weight": 0.1,
-            "score": sum(controls_types_names) / len(controls_types_names),
+            "weight": 0.2,
+            "score": score_calculator(score_name=controls_types_names),
         },
     ]
 
@@ -215,9 +223,7 @@ def logic(  # noqa: PLR0912, PLR0915
                 ]
                 header_rows.extend(score["score_name"] for score in scores)
                 writer.writerow(header_rows)
-                writer.writerow(data_rows)
-            else:
-                writer.writerow(data_rows)
+            writer.writerow(data_rows)
 
     # Readable report for the console output
     print(f"App started: {app_started_report}")  # noqa: T201
@@ -258,12 +264,6 @@ def test_easy_dashboard(dash_duo, model_name):
                 {
                     "components": [
                         Component(type="ag_grid"),
-                    ],
-                    "controls": [],
-                },
-                {
-                    "components": [
-                        Component(type="card"),
                         Component(type="card"),
                         Component(type="graph"),
                     ],
@@ -336,7 +336,7 @@ def test_medium_dashboard(dash_duo, model_name):
     ids=["gpt-4o-mini"],
 )
 def test_complex_dashboard(dash_duo, model_name):
-    dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], complex_prompt)
+    dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3, df4], complex_prompt)
 
     logic(
         dashboard=dashboard,
@@ -349,17 +349,17 @@ def test_complex_dashboard(dash_duo, model_name):
                 {
                     "components": [
                         Component(type="ag_grid"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                        Component(type="graph"),
                     ],
-                    "controls": [Control(type="filter"), Control(type="filter"), Control(type="filter")],
+                    "controls": [Control(type="filter"), Control(type="filter")],
+                },
+                {
+                    "components": [Component(type="graph"), Component(type="graph")],
+                    "controls": [Control(type="filter")],
                 },
                 {
                     "components": [
-                        Component(type="card"),
-                        Component(type="card"),
-                        Component(type="card"),
+                        Component(type="graph"),
+                        Component(type="graph"),
                         Component(type="graph"),
                         Component(type="graph"),
                         Component(type="graph"),
@@ -368,27 +368,29 @@ def test_complex_dashboard(dash_duo, model_name):
                     "controls": [
                         Control(type="filter"),
                         Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
                     ],
                 },
                 {
                     "components": [
-                        Component(type="graph"),
+                        Component(type="ag_grid"),
                         Component(type="graph"),
                         Component(type="graph"),
                         Component(type="graph"),
                     ],
                     "controls": [
                         Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
+                        Control(type="filter"),
                     ],
                 },
-                {
-                    "components": [
-                        Component(type="card"),
-                        Component(type="card"),
-                        Component(type="card"),
-                    ],
-                    "controls": [],
-                },
             ],
         },
     )

From e60a1044d4493595d1f43ff09e9cc2855a705d1f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 16 Jan 2025 16:21:39 +0000
Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 vizro-ai/tests/score/test_dashboard.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 632ca0298..8c60217ba 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -168,7 +168,6 @@ def logic(  # noqa: PLR0912, PLR0915
         controls_types_names.append(controls_types)
         controls_types_names_report.append("page or control does not exists")
 
-
     # Every separate score has its own weight.
     scores = [
         {"score_name": "app_started_score", "weight": 0.4, "score": app_started},

From 9be7bd83fdcb9c8f7dfc3162f77d76b982c5b23a Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Thu, 16 Jan 2025 17:28:11 +0100
Subject: [PATCH 08/14] added reruns

---
 vizro-ai/hatch.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml
index 6f4c30482..51465325c 100644
--- a/vizro-ai/hatch.toml
+++ b/vizro-ai/hatch.toml
@@ -51,7 +51,7 @@ prep-release = [
 pypath = "hatch run python -c 'import sys; print(sys.executable)'"
 test = "pytest tests {args}"
 test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}"
-test-score = "pytest -vs tests/score --headless {args}"
+test-score = "pytest -vs --reruns 1 tests/score --headless {args}"
 test-unit = "pytest tests/unit {args}"
 test-unit-coverage = [
   "coverage run -m pytest tests/unit {args}",

From dafbb16eba17bd56e5939064fe470f1077ca1888 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Thu, 16 Jan 2025 17:36:27 +0100
Subject: [PATCH 09/14] change complex prompt

---
 vizro-ai/hatch.toml             | 2 +-
 vizro-ai/tests/score/prompts.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vizro-ai/hatch.toml b/vizro-ai/hatch.toml
index 51465325c..6f4c30482 100644
--- a/vizro-ai/hatch.toml
+++ b/vizro-ai/hatch.toml
@@ -51,7 +51,7 @@ prep-release = [
 pypath = "hatch run python -c 'import sys; print(sys.executable)'"
 test = "pytest tests {args}"
 test-integration = "pytest -vs --reruns 1 tests/integration --headless {args}"
-test-score = "pytest -vs --reruns 1 tests/score --headless {args}"
+test-score = "pytest -vs tests/score --headless {args}"
 test-unit = "pytest tests/unit {args}"
 test-unit-coverage = [
   "coverage run -m pytest tests/unit {args}",
diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py
index fac3454b3..c8cbea85f 100644
--- a/vizro-ai/tests/score/prompts.py
+++ b/vizro-ai/tests/score/prompts.py
@@ -77,7 +77,7 @@
 
 <Page 4>
 Fourth page contains chart with wind data.
-Table with GDP data.
+Table with population data.
 Two more charts with stocks and tips representations.
 Align table beautifully relative to the charts.
 Every chart should have 2 filters.

From 06c239aabab6097b2e3a81ad576e189256c33f21 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Thu, 16 Jan 2025 17:43:40 +0100
Subject: [PATCH 10/14] change complex prompt

---
 vizro-ai/tests/score/prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py
index c8cbea85f..383e3bc8d 100644
--- a/vizro-ai/tests/score/prompts.py
+++ b/vizro-ai/tests/score/prompts.py
@@ -65,7 +65,7 @@
 <Page 2>
 Second page should contain kpi cards with population trends and
 two popular charts that display population per capita vs. continent.
-Filter charts by GDP using slider.
+Filter charts by GDP using dropdown.
 Align kpi cards in one row and charts in different.
 Both charts should be in tabs.
 

From 97a1019e927e378a623d21ab7861510c42702d76 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Thu, 16 Jan 2025 17:52:37 +0100
Subject: [PATCH 11/14] option for ',' separator in aggregated report

---
 .github/workflows/test-score-vizro-ai.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-score-vizro-ai.yml b/.github/workflows/test-score-vizro-ai.yml
index afd4d3caa..5530423a4 100644
--- a/.github/workflows/test-score-vizro-ai.yml
+++ b/.github/workflows/test-score-vizro-ai.yml
@@ -125,7 +125,7 @@ jobs:
           cd /home/runner/work/vizro/vizro/
           ls */*.csv | head -n1 | xargs head -n1 > report-aggregated-${{ steps.date.outputs.date }}.csv && tail -n+2 -q */*.csv >> report-aggregated-${{ steps.date.outputs.date }}.csv
           # replace all timestamps in aggregated report to current date
-          gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' report-aggregated-${{ steps.date.outputs.date }}.csv
+          gawk -F, -i inplace 'FNR>1 {$1="${{ steps.date.outputs.date }}"} {print}' OFS=, report-aggregated-${{ steps.date.outputs.date }}.csv
 
       - name: Report artifacts
         uses: actions/upload-artifact@v4

From ff97b3109e9d72853e0f09e40559a64d865f621e Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Fri, 17 Jan 2025 12:49:12 +0100
Subject: [PATCH 12/14] review changes

---
 vizro-ai/tests/score/test_dashboard.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 8c60217ba..0815a48aa 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -41,9 +41,9 @@ def setup_test_environment():
 
 
 # If len() is 0, it means that nothing was entered for this score in config,
-# in this case in should be 1.0.
-def score_calculator(score_name):
-    return statistics.mean(score_name) if len(score_name) != 0 else 1.0
+# in this case it should be 1.
+def score_calculator(metrics_score: list[int]):
+    return statistics.mean(metrics_score) if len(metrics_score) != 0 else 1
 
 
 def logic(  # noqa: PLR0912, PLR0915
@@ -65,6 +65,7 @@ def logic(  # noqa: PLR0912, PLR0915
         config: json config of the expected dashboard
 
     """
+    # TODO: Add layout score
     report_dir = "tests/score/reports"
     os.makedirs(report_dir, exist_ok=True)
 
@@ -72,7 +73,7 @@ def logic(  # noqa: PLR0912, PLR0915
 
     try:
         dash_duo.start_server(app)
-        app_started = 1.0
+        app_started = 1
         app_started_report = "App started!"
     except Exception as e:
         app_started = 0
@@ -81,7 +82,7 @@ def logic(  # noqa: PLR0912, PLR0915
 
     try:
         assert dash_duo.get_logs() == []
-        no_browser_console_errors = 1.0
+        no_browser_console_errors = 1
         no_browser_console_errors_report = "No error logs in browser console!"
     except AssertionError as e:
         no_browser_console_errors = 0
@@ -99,7 +100,7 @@ def logic(  # noqa: PLR0912, PLR0915
         branch = "local"
         python_version = "local"
 
-    pages_exist = [1.0 if dashboard.pages else 0][0]
+    pages_exist = [1 if dashboard.pages else 0][0]
     pages_exist_report = bool(pages_exist)
     pages_num = [1 if len(dashboard.pages) == len(config["pages"]) else 0]
     pages_num_report = [f'{len(config["pages"])} page(s) for dashboard is {bool(pages_num[0])}']
@@ -173,18 +174,18 @@ def logic(  # noqa: PLR0912, PLR0915
         {"score_name": "app_started_score", "weight": 0.4, "score": app_started},
         {"score_name": "no_browser_console_errors_score", "weight": 0.1, "score": no_browser_console_errors},
         {"score_name": "pages_score", "weight": 0.3, "score": pages_exist},
-        {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(score_name=pages_num)},
-        {"score_name": "components_score", "weight": 0.2, "score": score_calculator(score_name=components_num)},
+        {"score_name": "pages_number", "weight": 0.2, "score": score_calculator(metrics_score=pages_num)},
+        {"score_name": "components_score", "weight": 0.2, "score": score_calculator(metrics_score=components_num)},
         {
             "score_name": "component_types_score",
             "weight": 0.2,
-            "score": score_calculator(score_name=components_types_names),
+            "score": score_calculator(metrics_score=components_types_names),
         },
-        {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(score_name=controls_num)},
+        {"score_name": "controls_score", "weight": 0.2, "score": score_calculator(metrics_score=controls_num)},
         {
             "score_name": "controls_types_score",
             "weight": 0.2,
-            "score": score_calculator(score_name=controls_types_names),
+            "score": score_calculator(metrics_score=controls_types_names),
         },
     ]
 

From b1ae1b6b915a81d67d51f5bb6a03dcab3b6ab4ef Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Wed, 22 Jan 2025 15:27:05 +0100
Subject: [PATCH 13/14] created one parametrized test

---
 vizro-ai/pyproject.toml                |   6 +-
 vizro-ai/tests/score/data_classes.py   |  12 ++
 vizro-ai/tests/score/prompts.py        | 260 ++++++++++++++++++-------
 vizro-ai/tests/score/pytest.ini        |  10 -
 vizro-ai/tests/score/test_dashboard.py | 189 ++++--------------
 5 files changed, 241 insertions(+), 236 deletions(-)
 create mode 100644 vizro-ai/tests/score/data_classes.py
 delete mode 100644 vizro-ai/tests/score/pytest.ini

diff --git a/vizro-ai/pyproject.toml b/vizro-ai/pyproject.toml
index 791d2ab72..09bcf7590 100644
--- a/vizro-ai/pyproject.toml
+++ b/vizro-ai/pyproject.toml
@@ -66,7 +66,11 @@ filterwarnings = [
   # Ignore LLMchian deprecation warning:
   "ignore:.*The class `LLMChain` was deprecated in LangChain 0.1.17",
   # Ignore warning for Pydantic v1 API and Python 3.13:
-  "ignore:Failing to pass a value to the 'type_params' parameter of 'typing.ForwardRef._evaluate' is deprecated:DeprecationWarning"
+  "ignore:Failing to pass a value to the 'type_params' parameter of 'typing.ForwardRef._evaluate' is deprecated:DeprecationWarning",
+  # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590:
+  "ignore:HTTPResponse.getheader():DeprecationWarning",
+  "ignore:CapturedCallable function is excluded from the schema",
+  "ignore:Exception in thread"
 ]
 
 [tool.ruff]
diff --git a/vizro-ai/tests/score/data_classes.py b/vizro-ai/tests/score/data_classes.py
new file mode 100644
index 000000000..047983fe6
--- /dev/null
+++ b/vizro-ai/tests/score/data_classes.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+from typing import Literal
+
+
+@dataclass
+class Component:
+    type: Literal["ag_grid", "card", "graph"]
+
+
+@dataclass
+class Control:
+    type: Literal["filter", "parameter"]
diff --git a/vizro-ai/tests/score/prompts.py b/vizro-ai/tests/score/prompts.py
index 383e3bc8d..ea7b3583b 100644
--- a/vizro-ai/tests/score/prompts.py
+++ b/vizro-ai/tests/score/prompts.py
@@ -1,85 +1,201 @@
-easy_prompt = """
-I need a page with 1 table, 1 card and 1 chart.
-The table shows the tech companies stock data.
-The card says 'The Gapminder dataset provides historical data on countries' development indicators.'
-The chart is the scatter plot which uses gapminder dataframe and showing life expectancy vs. GDP per capita by country.
-Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent.
+from data_classes import Component, Control
 
-The layout uses a grid of 2 columns and 3 rows.
-The first row contains card
-The second row contains chart
-The third row contains table
+easy_prompt = {  # easy-tier case: prompt plus the dashboard structure expected from it
+    "tier_type": "easy",  # handed to logic() as prompt_tier
+    "prompt_name": "one page + table + card + chart + 2 filters",  # short description used in the report
+    "prompt_text": """
+    I need a page with 1 table, 1 card and 1 chart.
+    The table shows the tech companies stock data.
+    The card says 'The Gapminder dataset provides historical data on countries' development indicators.'
+    The chart is the scatter plot which uses gapminder dataframe
+    and showing life expectancy vs. GDP per capita by country.
+    Life expectancy on the y axis, GDP per capita on the x axis, and colored by continent.
 
-Add a filter to filter the scatter plot by continent.
-Add a second filter to filter the table by companies.
-"""
+    The layout uses a grid of 2 columns and 3 rows.
+    The first row contains card
+    The second row contains chart
+    The third row contains table
 
-medium_prompt = """
-<Page 1>
-I need a page with 1 table and 1 line chart.
-The chart shows the stock price trends of GOOG and AAPL.
-The table shows the stock prices data details.
+    Add a filter to filter the scatter plot by continent.
+    Add a second filter to filter the table by companies.
+    """,
+    "expected_config": {  # handed to logic() as config
+        "pages": [
+            {  # the single requested page: table, card and chart with two filters
+                "components": [
+                    Component(type="ag_grid"),
+                    Component(type="card"),
+                    Component(type="graph"),
+                ],
+                "controls": [
+                    Control(type="filter"),
+                    Control(type="filter"),
+                ],
+            },
+        ],
+    },
+}
 
-<Page 2>
-I need a second page showing 3 cards and 4 charts.
-The cards says 'The Gapminder dataset provides historical data on countries' development indicators.'
-The charts are the scatter plots showing GDP per capita vs. life expectancy.
-GDP per capita on the x axis, life expectancy on the y axis, and colored by continent.
-Layout the cards on the left and the chart on the right.
-Add a filter to filter the scatter plots by continent.
-Add a second filter to filter the charts by year.
+medium_prompt = {  # medium-tier case: prompt plus the dashboard structure expected from it
+    "tier_type": "medium",  # handed to logic() as prompt_tier
+    "prompt_name": "4 pages with supported prompt instructions",  # short description used in the report
+    "prompt_text": """
+    <Page 1>
+    I need a page with 1 table and 1 line chart.
+    The chart shows the stock price trends of GOOG and AAPL.
+    The table shows the stock prices data details.
 
-<Page 3>
-This page displays the tips dataset. use four different charts to show data
-distributions. one chart should be a bar chart. the other should be a scatter plot.
-next chart should be a line chart. last one should be an area plot.
-first and second charts are on the left and the third and fourth charts are on the right.
-Add a filter to filter data in every plot by smoker.
+    <Page 2>
+    I need a second page showing 3 cards and 4 charts.
+    The cards says 'The Gapminder dataset provides historical data on countries' development indicators.'
+    The charts are the scatter plots showing GDP per capita vs. life expectancy.
+    GDP per capita on the x axis, life expectancy on the y axis, and colored by continent.
+    Layout the cards on the left and the chart on the right.
+    Add a filter to filter the scatter plots by continent.
+    Add a second filter to filter the charts by year.
 
-<Page 4>
-Create 3 cards on this page:
-1. The first card on top says "This page combines data from various sources
- including tips, stock prices, and global indicators."
-2. The second card says "Insights from Gapminder dataset."
-3. The third card says "Stock price trends over time."
+    <Page 3>
+    This page displays the tips dataset. use four different charts to show data
+    distributions. one chart should be a bar chart. the other should be a scatter plot.
+    next chart should be a line chart. last one should be an area plot.
+    first and second charts are on the left and the third and fourth charts are on the right.
+    Add a filter to filter data in every plot by smoker.
 
-Layout these 3 cards in this way:
-create a grid with 3 columns and 2 rows.
-Row 1: The first row has three columns:
-- The first column is empty.
-- The second and third columns span the area for card 1.
+    <Page 4>
+    Create 3 cards on this page:
+    1. The first card on top says "This page combines data from various sources
+     including tips, stock prices, and global indicators."
+    2. The second card says "Insights from Gapminder dataset."
+    3. The third card says "Stock price trends over time."
 
-Row 2: The second row also has three columns:
-- The first column is empty.
-- The second column is occupied by the area for card 2.
-- The third column is occupied by the area for card 3.
-"""
+    Layout these 3 cards in this way:
+    create a grid with 3 columns and 2 rows.
+    Row 1: The first row has three columns:
+    - The first column is empty.
+    - The second and third columns span the area for card 1.
 
+    Row 2: The second row also has three columns:
+    - The first column is empty.
+    - The second column is occupied by the area for card 2.
+    - The third column is occupied by the area for card 3.
+    """,
+    "expected_config": {  # handed to logic() as config
+        "pages": [  # presumably ordered like the <Page N> sections of the prompt - verify against logic()
+            {  # <Page 1>: table and line chart, no controls
+                "components": [
+                    Component(type="ag_grid"),
+                    Component(type="graph"),
+                ],
+                "controls": [],
+            },
+            {  # <Page 2>; NOTE(review): prompt asks for 3 cards + 4 charts, one of each is listed - confirm
+                "components": [
+                    Component(type="card"),
+                    Component(type="graph"),
+                ],
+                "controls": [
+                    Control(type="filter"),
+                    Control(type="filter"),
+                ],
+            },
+            {  # <Page 3>; NOTE(review): prompt asks for four charts, two are listed - confirm
+                "components": [
+                    Component(type="graph"),
+                    Component(type="graph"),
+                ],
+                "controls": [
+                    Control(type="filter"),
+                ],
+            },
+            {  # <Page 4>: three cards, no controls
+                "components": [
+                    Component(type="card"),
+                    Component(type="card"),
+                    Component(type="card"),
+                ],
+                "controls": [],
+            },
+        ],
+    },
+}
 
-complex_prompt = """
-<Page 1>
-Show me 1 table on the first page that shows tips and sorted by day
-Using export button I want to export data to csv
-Add filters by bill and by tip amount using range slider
+complex_prompt = {  # complex-tier case: prompt plus the dashboard structure expected from it
+    "tier_type": "complex",  # handed to logic() as prompt_tier
+    "prompt_name": "4 pages with mix of supported and unsupported prompt instructions",  # used in the report
+    "prompt_text": """
+    <Page 1>
+    Show me 1 table on the first page that shows tips and sorted by day
+    Using export button I want to export data to csv
+    Add filters by bill and by tip amount using slider
 
-<Page 2>
-Second page should contain kpi cards with population trends and
-two popular charts that display population per capita vs. continent.
-Filter charts by GDP using dropdown.
-Align kpi cards in one row and charts in different.
-Both charts should be in tabs.
+    <Page 2>
+    Second page should contain kpi cards with population trends and
+    two popular charts that display population per capita vs. continent.
+    Filter charts by GDP using dropdown.
+    Align kpi cards in one row and charts in different.
+    Both charts should be in tabs.
 
-<Page 3>
-Third page should contain 6 charts showing stocks.
-Each should have separate filter by date.
-Filter types should include dropdown, datepicker, slider, checklist and radio items.
-Add parameter for any chart.
+    <Page 3>
+    Third page should contain 6 charts showing stocks.
+    Each should have separate filter by date.
+    Filter types should include dropdown, datepicker, slider, checklist and radio items.
+    Add parameter for any chart.
 
-<Page 4>
-Fourth page contains chart with wind data.
-Table with population data.
-Two more charts with stocks and tips representations.
-Align table beautifully relative to the charts.
-Every chart should have 2 filters.
-Table should have 1 filter.
-"""
+    <Page 4>
+    Fourth page contains chart with wind data.
+    Table with population per capita data.
+    Two more charts with stocks and tips representations.
+    Align table beautifully relative to the charts.
+    Every chart should have 2 filters.
+    Table should have 1 filter.
+    """,
+    "expected_config": {  # handed to logic() as config
+        "pages": [  # presumably ordered like the <Page N> sections of the prompt - verify against logic()
+            {  # <Page 1>: one table, two filters; the export button is not part of the expectation
+                "components": [
+                    Component(type="ag_grid"),
+                ],
+                "controls": [Control(type="filter"), Control(type="filter")],
+            },
+            {  # <Page 2>: two charts, one filter; the requested kpi cards and tabs are not listed
+                "components": [Component(type="graph"), Component(type="graph")],
+                "controls": [Control(type="filter")],
+            },
+            {  # <Page 3>: six charts and six filters; the requested parameter is not listed
+                "components": [
+                    Component(type="graph"),
+                    Component(type="graph"),
+                    Component(type="graph"),
+                    Component(type="graph"),
+                    Component(type="graph"),
+                    Component(type="graph"),
+                ],
+                "controls": [
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                ],
+            },
+            {  # <Page 4>: one table, three charts; 3 charts x 2 filters + 1 table filter = 7 filters
+                "components": [
+                    Component(type="ag_grid"),
+                    Component(type="graph"),
+                    Component(type="graph"),
+                    Component(type="graph"),
+                ],
+                "controls": [
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                    Control(type="filter"),
+                ],
+            },
+        ],
+    },
+}
diff --git a/vizro-ai/tests/score/pytest.ini b/vizro-ai/tests/score/pytest.ini
deleted file mode 100644
index 7f2efb67c..000000000
--- a/vizro-ai/tests/score/pytest.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-[pytest]
-markers =
-    easy_dashboard: mark test with easy prompt for dashboard creation.
-    medium_dashboard: mark test with medium prompt for dashboard creation.
-    complex_dashboard: mark test with complex prompt for dashboard creation.
-
-filterwarnings =
-    ignore::UserWarning
-    # Ignore deprecation warning until this is solved: https://github.com/plotly/dash/issues/2590:
-    ignore:HTTPResponse.getheader():DeprecationWarning
diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 0815a48aa..5c7f4f9d5 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -4,9 +4,7 @@
 import os
 import statistics
 from collections import Counter
-from dataclasses import dataclass
 from datetime import datetime
-from typing import Literal
 
 import chromedriver_autoinstaller
 import numpy as np
@@ -23,16 +21,6 @@
 df4 = px.data.wind()
 
 
-@dataclass
-class Component:
-    type: Literal["ag_grid", "card", "graph"]
-
-
-@dataclass
-class Control:
-    type: Literal["filter", "parameter"]
-
-
 @pytest.fixture(scope="module", autouse=True)
 def setup_test_environment():
     # We only need to install chromedriver outside CI.
@@ -46,11 +34,12 @@ def score_calculator(metrics_score: list[int]):
     return statistics.mean(metrics_score) if len(metrics_score) != 0 else 1
 
 
-def logic(  # noqa: PLR0912, PLR0915
+def logic(  # noqa: PLR0912, PLR0913, PLR0915
     dashboard,
     model_name,
     dash_duo,
     prompt_tier,
+    prompt_name,
     prompt_text,
     config: dict,
 ):
@@ -61,6 +50,7 @@ def logic(  # noqa: PLR0912, PLR0915
         model_name: GenAI model name
         dash_duo: dash_duo fixture
         prompt_tier: complexity of the prompt
+        prompt_name: short prompt description
         prompt_text: prompt text
         config: json config of the expected dashboard
 
@@ -201,6 +191,7 @@ def logic(  # noqa: PLR0912, PLR0915
         python_version,
         model_name,
         prompt_tier,
+        prompt_name,
         prompt_text,
         weighted_score,
     ]
@@ -218,6 +209,7 @@ def logic(  # noqa: PLR0912, PLR0915
                     "python_version",
                     "model",
                     "prompt_tier",
+                    "prompt_name",
                     "prompt_text",
                     "weighted_score",
                 ]
@@ -238,159 +230,50 @@ def logic(  # noqa: PLR0912, PLR0915
     print(f"Scores: {scores}")  # noqa: T201
 
 
-@pytest.mark.easy_dashboard
 @pytest.mark.parametrize(
     "model_name",
     [
         "gpt-4o-mini",
-        "claude-3-5-sonnet-latest",
     ],
     ids=[
         "gpt-4o-mini",
-        "claude-3-5-sonnet-latest",
     ],
 )
-def test_easy_dashboard(dash_duo, model_name):
-    dashboard = VizroAI(model=model_name).dashboard([df1, df2], easy_prompt)
-
-    logic(
-        dashboard=dashboard,
-        model_name=model_name,
-        dash_duo=dash_duo,
-        prompt_tier="easy",
-        prompt_text=easy_prompt.replace("\n", " "),
-        config={
-            "pages": [
-                {
-                    "components": [
-                        Component(type="ag_grid"),
-                        Component(type="card"),
-                        Component(type="graph"),
-                    ],
-                    "controls": [
-                        Control(type="filter"),
-                        Control(type="filter"),
-                    ],
-                },
-            ],
-        },
-    )
-
-
-@pytest.mark.medium_dashboard
-@pytest.mark.parametrize("model_name", ["gpt-4o-mini"], ids=["gpt-4o-mini"])
-def test_medium_dashboard(dash_duo, model_name):
-    dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3], medium_prompt)
-
-    logic(
-        dashboard=dashboard,
-        model_name=model_name,
-        dash_duo=dash_duo,
-        prompt_tier="medium",
-        prompt_text=medium_prompt.replace("\n", " "),
-        config={
-            "pages": [
-                {
-                    "components": [
-                        Component(type="ag_grid"),
-                        Component(type="graph"),
-                    ],
-                    "controls": [],
-                },
-                {
-                    "components": [
-                        Component(type="card"),
-                        Component(type="graph"),
-                    ],
-                    "controls": [
-                        Control(type="filter"),
-                        Control(type="filter"),
-                    ],
-                },
-                {
-                    "components": [
-                        Component(type="graph"),
-                        Component(type="graph"),
-                    ],
-                    "controls": [
-                        Control(type="filter"),
-                    ],
-                },
-                {
-                    "components": [
-                        Component(type="card"),
-                        Component(type="card"),
-                        Component(type="card"),
-                    ],
-                    "controls": [],
-                },
-            ],
-        },
-    )
-
-
-@pytest.mark.complex_dashboard
 @pytest.mark.parametrize(
-    "model_name",
-    ["gpt-4o-mini"],
-    ids=["gpt-4o-mini"],
+    "tier_type, prompt_name, prompt_text, expected_config, dfs",
+    [
+        (
+            easy_prompt["tier_type"],
+            easy_prompt["prompt_name"],
+            easy_prompt["prompt_text"],
+            easy_prompt["expected_config"],
+            [df1, df2],
+        ),
+        (
+            medium_prompt["tier_type"],
+            medium_prompt["prompt_name"],
+            medium_prompt["prompt_text"],
+            medium_prompt["expected_config"],
+            [df1, df2, df3],
+        ),
+        (
+            complex_prompt["tier_type"],
+            complex_prompt["prompt_name"],
+            complex_prompt["prompt_text"],
+            complex_prompt["expected_config"],
+            [df1, df2, df3, df4],
+        ),
+    ],
 )
-def test_complex_dashboard(dash_duo, model_name):
-    dashboard = VizroAI(model=model_name).dashboard([df1, df2, df3, df4], complex_prompt)
+def test_dashboard(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs):  # noqa: PLR0913
+    created_dashboard = VizroAI(model=model_name).dashboard(dfs, prompt_text)
 
     logic(
-        dashboard=dashboard,
+        dashboard=created_dashboard,
         model_name=model_name,
         dash_duo=dash_duo,
-        prompt_tier="complex",
-        prompt_text=complex_prompt.replace("\n", " "),
-        config={
-            "pages": [
-                {
-                    "components": [
-                        Component(type="ag_grid"),
-                    ],
-                    "controls": [Control(type="filter"), Control(type="filter")],
-                },
-                {
-                    "components": [Component(type="graph"), Component(type="graph")],
-                    "controls": [Control(type="filter")],
-                },
-                {
-                    "components": [
-                        Component(type="graph"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                    ],
-                    "controls": [
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                    ],
-                },
-                {
-                    "components": [
-                        Component(type="ag_grid"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                        Component(type="graph"),
-                    ],
-                    "controls": [
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                        Control(type="filter"),
-                    ],
-                },
-            ],
-        },
+        prompt_tier=tier_type,
+        prompt_name=prompt_name,
+        prompt_text=prompt_text.replace("\n", " "),
+        config=expected_config,
     )

From c52f3669ee6eb2da4085b5adf5d0a4e05b97f190 Mon Sep 17 00:00:00 2001
From: Alexey Snigir <alexey_snigir@external.mckinsey.com>
Date: Wed, 22 Jan 2025 19:06:15 +0100
Subject: [PATCH 14/14] separate anthropic test

---
 vizro-ai/tests/score/test_dashboard.py | 44 +++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/vizro-ai/tests/score/test_dashboard.py b/vizro-ai/tests/score/test_dashboard.py
index 5c7f4f9d5..5590fb683 100644
--- a/vizro-ai/tests/score/test_dashboard.py
+++ b/vizro-ai/tests/score/test_dashboard.py
@@ -59,9 +59,8 @@ def logic(  # noqa: PLR0912, PLR0913, PLR0915
     report_dir = "tests/score/reports"
     os.makedirs(report_dir, exist_ok=True)
 
-    app = Vizro().build(dashboard).dash
-
     try:
+        app = Vizro().build(dashboard).dash
         dash_duo.start_server(app)
         app_started = 1
         app_started_report = "App started!"
@@ -192,10 +191,10 @@ def logic(  # noqa: PLR0912, PLR0913, PLR0915
         model_name,
         prompt_tier,
         prompt_name,
-        prompt_text,
         weighted_score,
     ]
     data_rows.extend(score["score"] for score in scores)
+    data_rows.extend([prompt_text])
 
     with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "a", newline=""):
         with open(f"{report_dir}/report_model_{model_name}_{vizro_type}.csv", "r+", newline="") as csvfile:
@@ -210,10 +209,10 @@ def logic(  # noqa: PLR0912, PLR0913, PLR0915
                     "model",
                     "prompt_tier",
                     "prompt_name",
-                    "prompt_text",
                     "weighted_score",
                 ]
                 header_rows.extend(score["score_name"] for score in scores)
+                header_rows.extend(["prompt_text"])
                 writer.writerow(header_rows)
             writer.writerow(data_rows)
 
@@ -265,7 +264,42 @@ def logic(  # noqa: PLR0912, PLR0913, PLR0915
         ),
     ],
 )
-def test_dashboard(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs):  # noqa: PLR0913
+def test_dashboard_openai(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs):  # noqa: PLR0913
+    """Build a dashboard from the prompt and pass it to ``logic`` for scoring."""
+    flat_prompt = prompt_text.replace("\n", " ")  # newlines become spaces before reporting
+    logic(
+        dashboard=VizroAI(model=model_name).dashboard(dfs, prompt_text),
+        model_name=model_name,
+        dash_duo=dash_duo,
+        prompt_tier=tier_type,
+        prompt_name=prompt_name,
+        prompt_text=flat_prompt,
+        config=expected_config,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_name",
+    [
+        "claude-3-5-sonnet-latest",
+    ],
+    ids=[
+        "claude-3-5-sonnet-latest",
+    ],
+)
+@pytest.mark.parametrize(
+    "tier_type, prompt_name, prompt_text, expected_config, dfs",
+    [
+        (
+            easy_prompt["tier_type"],
+            easy_prompt["prompt_name"],
+            easy_prompt["prompt_text"],
+            easy_prompt["expected_config"],
+            [df1, df2],
+        ),
+    ],
+)
+def test_dashboard_anthropic(dash_duo, model_name, tier_type, prompt_name, prompt_text, expected_config, dfs):  # noqa: PLR0913
     created_dashboard = VizroAI(model=model_name).dashboard(dfs, prompt_text)
 
     logic(