From f94da5ce4ae22e1530a03feb30158e7d86961f01 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 14:04:00 -0500 Subject: [PATCH 01/10] standardized output file names, created standalone mintaka-wikidata folder, moved parseCSVFile to own file --- .../{ => mintaka-wikidata}/.gitignore | 0 .../{ => mintaka-wikidata}/README.md | 0 .../calculateMintakaEvaluationMetrics.ts | 17 +++------------ .../curatedMintakaQuestions.csv | 0 .../mintaka-wikidata/data/.gitignore | 1 + .../mintakaEvaluation.ts | 20 +++++++++--------- .../{ => mintaka-wikidata}/plot/.gitignore | 0 .../{ => mintaka-wikidata}/plot/README | 0 .../plot/requirements.txt | 0 .../plot/validation_figures.py | 4 ++-- .../prepMintakaQuestions.ts | 14 +------------ src/utils/evaluations/questions.ts | 6 ------ src/utils/parseCSVFile.ts | 21 +++++++++++++++++++ 13 files changed, 38 insertions(+), 45 deletions(-) rename src/utils/evaluations/{ => mintaka-wikidata}/.gitignore (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/README.md (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/calculateMintakaEvaluationMetrics.ts (94%) rename src/utils/evaluations/{ => mintaka-wikidata}/curatedMintakaQuestions.csv (100%) create mode 100644 src/utils/evaluations/mintaka-wikidata/data/.gitignore rename src/utils/evaluations/{ => mintaka-wikidata}/mintakaEvaluation.ts (93%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/.gitignore (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/README (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/requirements.txt (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/validation_figures.py (95%) rename src/utils/evaluations/{ => mintaka-wikidata}/prepMintakaQuestions.ts (90%) delete mode 100644 src/utils/evaluations/questions.ts create mode 100644 src/utils/parseCSVFile.ts diff --git a/src/utils/evaluations/.gitignore b/src/utils/evaluations/mintaka-wikidata/.gitignore similarity index 100% rename from 
src/utils/evaluations/.gitignore rename to src/utils/evaluations/mintaka-wikidata/.gitignore diff --git a/src/utils/evaluations/README.md b/src/utils/evaluations/mintaka-wikidata/README.md similarity index 100% rename from src/utils/evaluations/README.md rename to src/utils/evaluations/mintaka-wikidata/README.md diff --git a/src/utils/evaluations/calculateMintakaEvaluationMetrics.ts b/src/utils/evaluations/mintaka-wikidata/calculateMintakaEvaluationMetrics.ts similarity index 94% rename from src/utils/evaluations/calculateMintakaEvaluationMetrics.ts rename to src/utils/evaluations/mintaka-wikidata/calculateMintakaEvaluationMetrics.ts index 694988c..1a970ba 100644 --- a/src/utils/evaluations/calculateMintakaEvaluationMetrics.ts +++ b/src/utils/evaluations/mintaka-wikidata/calculateMintakaEvaluationMetrics.ts @@ -6,9 +6,10 @@ import fs from "fs" import papaparse from "papaparse" import { EvaluationOutputRowType } from "./mintakaEvaluation"; +import { parseCSVFile } from "utils/parseCSVFile"; -calculateMetrics("./LinkQ Evaluation Output.csv","./Plain LLM Evaluation Output.csv","./output.csv") +calculateMetrics("./data/linkq-evaluation-results.csv","./data/plainllm-evaluation-results.csv","./data/aggregated-evaluation-results.csv") type MetricType = { complexityType: string, @@ -41,6 +42,7 @@ async function calculateMetrics( parseCSVFile(linkqDataPath), parseCSVFile(plainLLMDataPath), ]) + console.log("linkqData",linkqData) console.log("Parsed data") if(linkqData.length !== plainLLMData.length) { throw new Error(`linkqData and plainLLMData lengths do not match`) @@ -182,19 +184,6 @@ function isSyntaxCorrect(row: EvaluationOutputRowType) { return value === "YES" } - -export function parseCSVFile(path:string):Promise { - return new Promise((resolve) => { - const file = fs.createReadStream(path) - papaparse.parse(file, { - header: true, - complete: function(results) { - resolve(results.data) - } - }) - }) -} - function meanAndStd(numArray: number[]) { let min = Infinity 
let max = -Infinity diff --git a/src/utils/evaluations/curatedMintakaQuestions.csv b/src/utils/evaluations/mintaka-wikidata/curatedMintakaQuestions.csv similarity index 100% rename from src/utils/evaluations/curatedMintakaQuestions.csv rename to src/utils/evaluations/mintaka-wikidata/curatedMintakaQuestions.csv diff --git a/src/utils/evaluations/mintaka-wikidata/data/.gitignore b/src/utils/evaluations/mintaka-wikidata/data/.gitignore new file mode 100644 index 0000000..16f2dc5 --- /dev/null +++ b/src/utils/evaluations/mintaka-wikidata/data/.gitignore @@ -0,0 +1 @@ +*.csv \ No newline at end of file diff --git a/src/utils/evaluations/mintakaEvaluation.ts b/src/utils/evaluations/mintaka-wikidata/mintakaEvaluation.ts similarity index 93% rename from src/utils/evaluations/mintakaEvaluation.ts rename to src/utils/evaluations/mintaka-wikidata/mintakaEvaluation.ts index 0159e11..c4505d3 100644 --- a/src/utils/evaluations/mintakaEvaluation.ts +++ b/src/utils/evaluations/mintaka-wikidata/mintakaEvaluation.ts @@ -15,15 +15,15 @@ if (process.env.HTTPS_PROXY) { import fs from "fs" import papaparse from "papaparse" -import { ChatGPTAPI } from "../ChatGPTAPI" -import { tryParsingOutQuery } from "../tryParsingOutQuery" -import { runQuery } from "../knowledgeBase/runQuery" -import { summarizeQueryResults } from "../summarizeQueryResults" -import { getEntityDataFromQuery } from "../knowledgeBase/getEntityData" -import { formatSparqlResultsAsString } from "../formatSparqlResultsAsString" +import { ChatGPTAPI } from "../../ChatGPTAPI" +import { tryParsingOutQuery } from "../../tryParsingOutQuery" +import { runQuery } from "../../knowledgeBase/runQuery" +import { summarizeQueryResults } from "../../summarizeQueryResults" +import { getEntityDataFromQuery } from "../../knowledgeBase/getEntityData" +import { formatSparqlResultsAsString } from "../../formatSparqlResultsAsString" import { QUESTIONS } from "./questions" -import { INITIAL_SYSTEM_MESSAGE } from "../knowledgeBase/prompts" 
-import { queryBuildingWorkflow } from "../queryBuildingWorkflow" +import { INITIAL_SYSTEM_MESSAGE } from "../../knowledgeBase/prompts" +import { queryBuildingWorkflow } from "../../queryBuildingWorkflow" import { loadEnv } from 'vite' const ENV = loadEnv("development","../../../") @@ -124,7 +124,7 @@ async function runMintakaEvaluation( export async function runLinkQMintakaEvaluation() { return await runMintakaEvaluation( - `LinkQ Evaluation Output ${new Date().getTime()}.csv`, + `linkq-evaluation-output-${new Date().getTime()}.csv`, async (chatGPT:ChatGPTAPI, question:string) => { //force the LLM to start the query building workflow chatGPT.messages = [ @@ -155,7 +155,7 @@ export async function runLinkQMintakaEvaluation() { export async function runPlainLLMMintakaEvaluation() { return await runMintakaEvaluation( - `Plain LLM Evaluation Output ${new Date().getTime()}.csv`, + `plainllm-evaluation-results-${new Date().getTime()}.csv`, async (chatGPT:ChatGPTAPI, question:string) => { return await chatGPT.sendMessages([ { diff --git a/src/utils/evaluations/plot/.gitignore b/src/utils/evaluations/mintaka-wikidata/plot/.gitignore similarity index 100% rename from src/utils/evaluations/plot/.gitignore rename to src/utils/evaluations/mintaka-wikidata/plot/.gitignore diff --git a/src/utils/evaluations/plot/README b/src/utils/evaluations/mintaka-wikidata/plot/README similarity index 100% rename from src/utils/evaluations/plot/README rename to src/utils/evaluations/mintaka-wikidata/plot/README diff --git a/src/utils/evaluations/plot/requirements.txt b/src/utils/evaluations/mintaka-wikidata/plot/requirements.txt similarity index 100% rename from src/utils/evaluations/plot/requirements.txt rename to src/utils/evaluations/mintaka-wikidata/plot/requirements.txt diff --git a/src/utils/evaluations/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py similarity index 95% rename from src/utils/evaluations/plot/validation_figures.py rename to 
src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 0ee4d29..a1369ea 100644 --- a/src/utils/evaluations/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -10,7 +10,7 @@ sns.set(rc={'figure.dpi': 300, 'savefig.dpi': 300}) ROOT = Path(__file__).parent -DATA = Path(ROOT / 'raw_data') +DATA = Path(ROOT) PLOTS = Path(ROOT / 'plots') def get_aggregated_accuracy_data(): @@ -21,7 +21,7 @@ def get_aggregated_accuracy_data(): def get_raw_timing_data(): timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] - linkq_df = pd.read_csv(Path(DATA, 'linq-evaluation-results.csv'), usecols=timing_columns) + linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) linkq_df['Algorithm'] = 'LinkQ' plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) plainllm_df['Algorithm'] = 'GPT' diff --git a/src/utils/evaluations/prepMintakaQuestions.ts b/src/utils/evaluations/mintaka-wikidata/prepMintakaQuestions.ts similarity index 90% rename from src/utils/evaluations/prepMintakaQuestions.ts rename to src/utils/evaluations/mintaka-wikidata/prepMintakaQuestions.ts index af7fe06..fc1bdde 100644 --- a/src/utils/evaluations/prepMintakaQuestions.ts +++ b/src/utils/evaluations/mintaka-wikidata/prepMintakaQuestions.ts @@ -4,9 +4,9 @@ //npx tsx prepMintakaQuestions.ts import fs from "fs" -import papaparse from "papaparse" import { MintakaQuestionType } from "./mintakaEvaluation"; +import { parseCSVFile } from "utils/parseCSVFile"; prepMintakaQuestions() @@ -86,15 +86,3 @@ export const QUESTIONS:MintakaQuestionType[] = ${JSON.stringify(filteredQuestion fs.writeFileSync("./questions.ts",questionsFileContent) console.log("Done prepping Mintaka questions!") } - -export function parseCSVFile(path:string):Promise { - return new Promise((resolve) => { - const file = fs.createReadStream(path) - papaparse.parse(file, { - header: true, - complete: 
function(results) { - resolve(results.data) - } - }) - }) -} \ No newline at end of file diff --git a/src/utils/evaluations/questions.ts b/src/utils/evaluations/questions.ts deleted file mode 100644 index 7bd7259..0000000 --- a/src/utils/evaluations/questions.ts +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) 2024 Massachusetts Institute of Technology -// SPDX-License-Identifier: MIT - -import { MintakaQuestionType } from "./mintakaEvaluation"; - -export const QUESTIONS:MintakaQuestionType[] = [] \ No newline at end of file diff --git a/src/utils/parseCSVFile.ts b/src/utils/parseCSVFile.ts new file mode 100644 index 0000000..8afcd91 --- /dev/null +++ b/src/utils/parseCSVFile.ts @@ -0,0 +1,21 @@ +// Copyright (c) 2024 Massachusetts Institute of Technology +// SPDX-License-Identifier: MIT + +import fs from "fs" +import papaparse from "papaparse" + +export function parseCSVFile(path:string):Promise { + return new Promise((resolve, reject) => { + if(!fs.existsSync(path)) { + return reject(new Error(`The path '${path}' does not exist`)) + } + const file = fs.createReadStream(path) + papaparse.parse(file, { + header: true, + complete: function(results) { + console.log(path, results.data[0]) + resolve(results.data) + } + }) + }) +} \ No newline at end of file From 978e582d9e36fb3dbf147dd0529176d429d35551 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 14:39:19 -0500 Subject: [PATCH 02/10] reproduce images in submission --- .../evaluations/mintaka-wikidata/plot/README | 12 ------------ .../evaluations/mintaka-wikidata/plot/README.md | 16 ++++++++++++++++ .../mintaka-wikidata/plot/validation_figures.py | 17 ++++++++++------- 3 files changed, 26 insertions(+), 19 deletions(-) delete mode 100644 src/utils/evaluations/mintaka-wikidata/plot/README create mode 100644 src/utils/evaluations/mintaka-wikidata/plot/README.md diff --git a/src/utils/evaluations/mintaka-wikidata/plot/README b/src/utils/evaluations/mintaka-wikidata/plot/README deleted file mode 100644 
index 2ccc549..0000000 --- a/src/utils/evaluations/mintaka-wikidata/plot/README +++ /dev/null @@ -1,12 +0,0 @@ -1.) Go to the evaluation results in google drive. -2.) Download as a CSV without any extraneous tables that may be in the sheet - - aggregated results - - linkq results - - plainllm results -3.) Create a new folder within this one called 'raw_data' -4.) Place the CSVs from step 2 in the 'raw_data' folder -5.) Rename the CSVs: - - 'Evaluation for CHI - Aggregated Results': 'aggregated-evaluation-results.csv' - - 'Evaluation for CHI - Plain LLM Evaluation Output': 'plainllm-evaluation-results.csv' - - 'Evaluation for CHI - LinkQ Evaluation Output': 'linq-evaluation-results.csv' -6.) Run validation_figures.py in an environment with seaborn installed. (An example requirements.txt is provided.) \ No newline at end of file diff --git a/src/utils/evaluations/mintaka-wikidata/plot/README.md b/src/utils/evaluations/mintaka-wikidata/plot/README.md new file mode 100644 index 0000000..9e7cbb3 --- /dev/null +++ b/src/utils/evaluations/mintaka-wikidata/plot/README.md @@ -0,0 +1,16 @@ +1. Download the evaluation results from TODO +2. Place the CSVs in the `../data` folder +3. Rename the CSVs, if applicable: + - 'Evaluation for CHI - Aggregated Results': 'aggregated-evaluation-results.csv' + - 'Evaluation for CHI - Plain LLM Evaluation Output': 'plainllm-evaluation-results.csv' + - 'Evaluation for CHI - LinkQ Evaluation Output': 'linkq-evaluation-results.csv' +4. Create a new conda environment, activate it, and download the requirements +``` +conda create --name linkq python=3.12 +conda activate linkq +pip install -r requirements.txt +``` +5. 
Run the script to generate the plots +``` +python validation_figures.py +``` \ No newline at end of file diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index a1369ea..ba815db 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -10,12 +10,12 @@ sns.set(rc={'figure.dpi': 300, 'savefig.dpi': 300}) ROOT = Path(__file__).parent -DATA = Path(ROOT) +DATA = Path(ROOT.parent / 'data') PLOTS = Path(ROOT / 'plots') def get_aggregated_accuracy_data(): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) - df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT', 'complexityType': 'Category', 'category': 'Domain'}) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Category', 'category': 'Domain'}) df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) return df @@ -24,7 +24,7 @@ def get_raw_timing_data(): linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) linkq_df['Algorithm'] = 'LinkQ' plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) - plainllm_df['Algorithm'] = 'GPT' + plainllm_df['Algorithm'] = 'GPT-4' combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) combined_df = combined_df.rename(columns={'complexityType': 'Category', 'category': 'Domain'}) return combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) @@ -32,18 +32,20 @@ def get_raw_timing_data(): def 
percent_formatter(x): return f'{round(x)}%' +palette = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} + def accuracy_barchart_by_category(): df = get_aggregated_accuracy_data() # Assumes same number of questions per category # If so must be int num_questions_per_category = len(df) // len(df['Category'].unique()) df['LinkQ'] = (df['LinkQ'] > 0).astype(int) - df['GPT'] = (df['GPT'] > 0).astype(int) + df['GPT-4'] = (df['GPT-4'] > 0).astype(int) df = pd.melt(df, id_vars=['id', 'Domain', 'Category', 'question'], var_name='Algorithm', value_name='Correct') - df = df.groupby(['Category', 'Algorithm']).agg({'Correct': 'sum'}).reset_index() + df = df.groupby(['Category', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by="Correct",ascending=False).reset_index() df['Fraction'] = [f'{v}/{num_questions_per_category}' for v in df['Correct']] df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Category', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT']) + ax = sns.barplot(df, x='Category', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -52,7 +54,7 @@ def accuracy_barchart_by_category(): def timing_boxplot_by_category(): df = get_raw_timing_data() - sns.boxplot(df, x='Category', y='Total Seconds', hue='Algorithm') + sns.boxplot(df, x='Category', y='Total Seconds', hue='Algorithm', palette=palette) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() @@ -60,6 +62,7 @@ def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() + print("Done creating plots!") if __name__ == '__main__': From 3a56d9c80be64ee5247ecbe5b3cc5b68fd869d6f Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 15:45:58 -0500 Subject: [PATCH 03/10] progress on correctness breakdown plots --- .../plot/validation_figures.py | 62 ++++++++++++------- 1 
file changed, 39 insertions(+), 23 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index ba815db..c4cb690 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -13,39 +13,25 @@ DATA = Path(ROOT.parent / 'data') PLOTS = Path(ROOT / 'plots') -def get_aggregated_accuracy_data(): - df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) - df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Category', 'category': 'Domain'}) - df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) - return df - -def get_raw_timing_data(): - timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] - linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) - linkq_df['Algorithm'] = 'LinkQ' - plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) - plainllm_df['Algorithm'] = 'GPT-4' - combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) - combined_df = combined_df.rename(columns={'complexityType': 'Category', 'category': 'Domain'}) - return combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) - def percent_formatter(x): return f'{round(x)}%' palette = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} def accuracy_barchart_by_category(): - df = get_aggregated_accuracy_data() + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 
'category', 'id', 'question']) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) + df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) # Assumes same number of questions per category # If so must be int - num_questions_per_category = len(df) // len(df['Category'].unique()) + num_questions_per_category = len(df) // len(df['Question Type'].unique()) df['LinkQ'] = (df['LinkQ'] > 0).astype(int) df['GPT-4'] = (df['GPT-4'] > 0).astype(int) - df = pd.melt(df, id_vars=['id', 'Domain', 'Category', 'question'], var_name='Algorithm', value_name='Correct') - df = df.groupby(['Category', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by="Correct",ascending=False).reset_index() + df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correct') + df = df.groupby(['Question Type', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by='Correct',ascending=False).reset_index() df['Fraction'] = [f'{v}/{num_questions_per_category}' for v in df['Correct']] df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Category', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) + ax = sns.barplot(df, x='Question Type', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -53,15 +39,45 @@ def accuracy_barchart_by_category(): plt.close() def timing_boxplot_by_category(): - df = get_raw_timing_data() - sns.boxplot(df, x='Category', y='Total Seconds', hue='Algorithm', palette=palette) + timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] + linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) + linkq_df['Algorithm'] = 'LinkQ' + 
plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) + plainllm_df['Algorithm'] = 'GPT-4' + combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) + combined_df = combined_df.rename(columns={'complexityType': 'Question Type'}) + + df = combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + + sns.boxplot(df, x='Question Type', y='Total Seconds', hue='Algorithm', palette=palette) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() +def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) + df = df.rename(columns={target_column_name: 'Correct', 'complexityType': 'Question Type'}) + df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + + # Assumes same number of questions per category + # If so must be int + num_questions_per_category = len(df) // len(df['Question Type'].unique()) + df[y_axis_label] = 0 + df = df.groupby(['Question Type', 'Correct']).agg( + {y_axis_label: 'count'}) + df[y_axis_label] = (df[y_axis_label] / num_questions_per_category) * 100 + ax = sns.barplot(df, x='Question Type', y=y_axis_label, hue='Correct', hue_order=[3,2,1,0], palette=palette) + + for container in ax.containers: + ax.bar_label(container, fmt=percent_formatter) + plt.savefig(Path(PLOTS, f'{output_name}.pdf'), bbox_inches='tight', format='pdf') + plt.close() + def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() + correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ 
Correctness",output_name="linkq_correctness",palette={0: '#999999', 1: '#7fa0b6', 2: '#528db4', 3: '#1f78b4'}) + correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={0: '#999999', 1: '#fff1e0', 2: '#ffd6a1', 3: '#fdbf6f'}) print("Done creating plots!") From b32121c7facac18ce79d8f0fd675916ff74d818c Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 16:12:27 -0500 Subject: [PATCH 04/10] formatted legend strings, fixed order for all plots --- .../plot/validation_figures.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index c4cb690..22cc64d 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -16,7 +16,8 @@ def percent_formatter(x): return f'{round(x)}%' -palette = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} +QUESTION_TYPE_ORDER = ['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"] +PALETTE = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} def accuracy_barchart_by_category(): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) @@ -31,7 +32,7 @@ def accuracy_barchart_by_category(): df = df.groupby(['Question Type', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by='Correct',ascending=False).reset_index() df['Fraction'] = [f'{v}/{num_questions_per_category}' for v in df['Correct']] df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Question Type', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) + ax = sns.barplot(df, x='Question Type', y='% Correct', order=['Comparative', 'Yes/No', 'Generic', 
'MultiHop', "Intersection"], hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=PALETTE) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -49,23 +50,26 @@ def timing_boxplot_by_category(): df = combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) - sns.boxplot(df, x='Question Type', y='Total Seconds', hue='Algorithm', palette=palette) + sns.boxplot(df, x='Question Type', y='Total Seconds', order=QUESTION_TYPE_ORDER, hue='Algorithm', palette=PALETTE) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) - df = df.rename(columns={target_column_name: 'Correct', 'complexityType': 'Question Type'}) + df = df.rename(columns={target_column_name: 'Correctness', 'complexityType': 'Question Type'}) df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) # Assumes same number of questions per category # If so must be int num_questions_per_category = len(df) // len(df['Question Type'].unique()) df[y_axis_label] = 0 - df = df.groupby(['Question Type', 'Correct']).agg( + df['Correctness'] = df['Correctness'].apply(lambda x: f'{x}/3') + print(df) + df = df.groupby(['Question Type', 'Correctness']).agg( {y_axis_label: 'count'}) df[y_axis_label] = (df[y_axis_label] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Question Type', y=y_axis_label, hue='Correct', hue_order=[3,2,1,0], palette=palette) + print + ax = sns.barplot(df, x='Question Type', y=y_axis_label, order=QUESTION_TYPE_ORDER, hue='Correctness', 
hue_order=["3/3","2/3","1/3","0/3"], palette=palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -76,8 +80,8 @@ def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() - correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={0: '#999999', 1: '#7fa0b6', 2: '#528db4', 3: '#1f78b4'}) - correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={0: '#999999', 1: '#fff1e0', 2: '#ffd6a1', 3: '#fdbf6f'}) + correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) + correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) print("Done creating plots!") From 9b2792bfe20c953d1d572dc5638b9bb481fe3beb Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 14:10:14 -0500 Subject: [PATCH 05/10] stack bar charts checkpoint --- .../plot/validation_figures.py | 157 +++++++++++++++++- 1 file changed, 151 insertions(+), 6 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 22cc64d..0eb15de 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -6,6 +6,7 @@ import pandas as pd import seaborn as sns import matplotlib.pyplot as plt +import numpy as np sns.set(rc={'figure.dpi': 300, 'savefig.dpi': 300}) @@ -18,11 +19,12 @@ def percent_formatter(x): QUESTION_TYPE_ORDER = ['Comparative', 
'Yes/No', 'Generic', 'MultiHop', "Intersection"] PALETTE = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} +TO_REPLACE = {'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'} def accuracy_barchart_by_category(): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) - df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + df = df.replace(to_replace=TO_REPLACE) # Assumes same number of questions per category # If so must be int num_questions_per_category = len(df) // len(df['Question Type'].unique()) @@ -48,16 +50,16 @@ def timing_boxplot_by_category(): combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) combined_df = combined_df.rename(columns={'complexityType': 'Question Type'}) - df = combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + df = combined_df.replace(to_replace=TO_REPLACE) sns.boxplot(df, x='Question Type', y='Total Seconds', order=QUESTION_TYPE_ORDER, hue='Algorithm', palette=PALETTE) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() -def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): +def correctness_barchart_by_algorithm(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={target_column_name: 'Correctness', 'complexityType': 
'Question Type'}) - df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + df = df.replace(to_replace=TO_REPLACE) # Assumes same number of questions per category # If so must be int @@ -76,12 +78,155 @@ def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,outp plt.savefig(Path(PLOTS, f'{output_name}.pdf'), bbox_inches='tight', format='pdf') plt.close() +linkq_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4'} +plainllm_palette = {"GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} +tmp_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} +def correctness_barchart(): + df1 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df1 = df1.rename(columns={'linkqAnswerCorrect': 'Correctness', 'complexityType': 'Question Type'}) + df1 = df1.replace(to_replace=TO_REPLACE) + num_questions_per_category = len(df1) // len(df1['Question Type'].unique()) + df1 = df1.loc[df1['Correctness'] != 0] + df1['Correctness'] = df1['Correctness'].apply(lambda x: f'LinkQ {x}/3') + + + df2 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df2 = df2.rename(columns={'plainLLMAnswerCorrect':'Correctness', 'complexityType': 'Question Type'}) + df2 = df2.replace(to_replace=TO_REPLACE) + df2 = df2.loc[df2['Correctness'] != 0] + df2['Correctness'] = df2['Correctness'].apply(lambda x: f'GPT-4 {x}/3') + + df = df1._append(df2, ignore_index=True) + + # Assumes same number of questions per category + # If so must be int + df['tmp'] = 0 + print(df) 
+ df = df.groupby(['Question Type', 'Correctness']).agg( + {'tmp': 'count'}) + df['tmp'] = (df['tmp'] / num_questions_per_category) * 100 + print("-----------------------------------------------------------------------------") + print(df) + + + ax = sns.barplot(df, x='Question Type', y="tmp", order=QUESTION_TYPE_ORDER, hue='Correctness', + hue_order=["LinkQ 3/3","LinkQ 2/3","LinkQ 1/3","GPT-4 3/3","GPT-4 2/3","GPT-4 1/3"], + palette=tmp_palette) + + for container in ax.containers: + ax.bar_label(container, fmt=percent_formatter) + plt.savefig(Path(PLOTS, f'correctness.pdf'), bbox_inches='tight', format='pdf') + plt.show() + plt.close() + +def correctness_stacked_barchart(): + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) + df = df.replace(to_replace=TO_REPLACE) + + # custom sort the question types + df["Question Type"] = pd.Categorical(df["Question Type"], categories=QUESTION_TYPE_ORDER, ordered=True) + df = df.sort_values("Question Type") + + df['LinkQ'] = df['LinkQ'].apply(lambda x: f'{x}/3') + df['GPT-4'] = df['GPT-4'].apply(lambda x: f'{x}/3') + + # Assumes same number of questions per category + # If so must be int + num_questions_per_category = len(df) // len(df['Question Type'].unique()) + print("num_questions_per_category",num_questions_per_category) + + + df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correctness') + + + df['Count'] = 0 + # print(df) + df = df.groupby(['Question Type', 'Algorithm', 'Correctness']).agg( + {'Count': 'count'}).unstack(fill_value=0).stack().reset_index() + df['Count'] = (df['Count'] / num_questions_per_category) * 100 + print(df) + print("-----------------------------------------------------------------------------") + 
+ question_type = df['Question Type'].unique() + algorithms = ['LinkQ', 'GPT-4'] + width = 0.35 # Width of the bar + x = np.arange(len(question_type)) # X-axis positions for question_type + + # Plot side-by-side stacked bars + fig, ax = plt.subplots() + + for i, algorithm in enumerate(algorithms): + print("-----------------------------------------------------------------------------") + # Filter data for the current algorithm + algorithm_data = df[df['Algorithm'] == algorithm] + three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '3/3']['Count'].reset_index(drop=True) + two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) + one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) + color3 = tmp_palette[f'{algorithm} 3/3'] + color2 = tmp_palette[f'{algorithm} 2/3'] + color1 = tmp_palette[f'{algorithm} 1/3'] + + print("algorithm",algorithm) + print("three_three_data",three_three_data) + print("two_three_data",two_three_data) + print("added",two_three_data+three_three_data) + bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') + bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') + bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') + ax.set_xlabel('Question Type') + ax.set_ylabel('% Correct') + ax.set_title('Side-by-Side Stacked Bar Chart') + ax.set_xticks(x) + ax.set_xticklabels(question_type) + ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.tight_layout() + plt.savefig(Path(PLOTS, f'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') + plt.close() + + + + fig, ax = plt.subplots() + for i, algorithm in enumerate(algorithms): + 
print("-----------------------------------------------------------------------------") + # Filter data for the current algorithm + algorithm_data = df[df['Algorithm'] == algorithm] + three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '3/3']['Count'].reset_index(drop=True) + two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) + one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) + zero_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '0/3']['Count'].reset_index(drop=True) + color3 = tmp_palette[f'{algorithm} 3/3'] + color2 = tmp_palette[f'{algorithm} 2/3'] + color1 = tmp_palette[f'{algorithm} 1/3'] + color0 = tmp_palette[f'{algorithm} 0/3'] + + print("algorithm",algorithm) + print("three_three_data",three_three_data) + print("two_three_data",two_three_data) + print("added",two_three_data+three_three_data) + bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') + bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') + bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') + bar0 = ax.bar(x + (i - 0.5) * width, zero_three_data, width, color=color0, bottom=one_three_data+two_three_data+three_three_data, label=f'{algorithm} 0/3 Correct') + ax.set_xlabel('Question Type') + ax.set_ylabel('% Correct') + ax.set_title('Side-by-Side Stacked Bar Chart') + ax.set_xticks(x) + ax.set_xticklabels(question_type) + ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.tight_layout() + plt.savefig(Path(PLOTS, f'correctness_stacked_zeros.pdf'), bbox_inches='tight', format='pdf') + plt.show() + plt.close() + def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() - 
correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) - correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) + # correctness_barchart_by_algorithm(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) + # correctness_barchart_by_algorithm(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) + # correctness_barchart() + correctness_stacked_barchart() print("Done creating plots!") From 43fcbaa5582a3e49bc137e144e4b5334254d5190 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 14:20:30 -0500 Subject: [PATCH 06/10] filled in missing data --- .../mintaka-wikidata/plot/validation_figures.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 0eb15de..0efe2a6 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -103,20 +103,20 @@ def correctness_barchart(): df['tmp'] = 0 print(df) df = df.groupby(['Question Type', 'Correctness']).agg( - {'tmp': 'count'}) + {'tmp': 'count'}).unstack(fill_value=0).stack().reset_index() df['tmp'] = (df['tmp'] / num_questions_per_category) * 100 print("-----------------------------------------------------------------------------") print(df) ax = sns.barplot(df, 
x='Question Type', y="tmp", order=QUESTION_TYPE_ORDER, hue='Correctness', - hue_order=["LinkQ 3/3","LinkQ 2/3","LinkQ 1/3","GPT-4 3/3","GPT-4 2/3","GPT-4 1/3"], + hue_order=["LinkQ 3/3","GPT-4 3/3","LinkQ 2/3","GPT-4 2/3","LinkQ 1/3","GPT-4 1/3"], palette=tmp_palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) plt.savefig(Path(PLOTS, f'correctness.pdf'), bbox_inches='tight', format='pdf') - plt.show() + # plt.show() plt.close() def correctness_stacked_barchart(): @@ -216,7 +216,7 @@ def correctness_stacked_barchart(): ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.tight_layout() plt.savefig(Path(PLOTS, f'correctness_stacked_zeros.pdf'), bbox_inches='tight', format='pdf') - plt.show() + # plt.show() plt.close() def main(): @@ -225,7 +225,7 @@ def main(): timing_boxplot_by_category() # correctness_barchart_by_algorithm(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) # correctness_barchart_by_algorithm(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) - # correctness_barchart() + correctness_barchart() correctness_stacked_barchart() print("Done creating plots!") From c77f6103815774802d303a5aecf4517342ac5be4 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 15:47:17 -0500 Subject: [PATCH 07/10] cleaned up and commented --- .../plot/validation_figures.py | 209 +++++++----------- 1 file changed, 76 insertions(+), 133 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 0efe2a6..004dddf 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ 
b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -3,6 +3,7 @@ from pathlib import Path +from functools import reduce import pandas as pd import seaborn as sns import matplotlib.pyplot as plt @@ -17,23 +18,30 @@ def percent_formatter(x): return f'{round(x)}%' +CORRECTNESS_PALETTE = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} QUESTION_TYPE_ORDER = ['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"] PALETTE = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} TO_REPLACE = {'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'} def accuracy_barchart_by_category(): + # Load the data and rename certain columns and values df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) df = df.replace(to_replace=TO_REPLACE) - # Assumes same number of questions per category - # If so must be int - num_questions_per_category = len(df) // len(df['Question Type'].unique()) + + num_questions_per_type = len(df) // len(df['Question Type'].unique()) # Assumes same number of questions per category df['LinkQ'] = (df['LinkQ'] > 0).astype(int) df['GPT-4'] = (df['GPT-4'] > 0).astype(int) + + # Unpivot the LinkQ and GPT-4 columns into Algorithm and Correctness columns df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correct') + + # Count the correctness values and convert them into percentages df = df.groupby(['Question Type', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by='Correct',ascending=False).reset_index() - df['Fraction'] = 
[f'{v}/{num_questions_per_category}' for v in df['Correct']] - df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 + df['Fraction'] = [f'{v}/{num_questions_per_type}' for v in df['Correct']] + df['% Correct'] = (df['Correct'] / num_questions_per_type) * 100 + + # Plot the data ax = sns.barplot(df, x='Question Type', y='% Correct', order=['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"], hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=PALETTE) for container in ax.containers: @@ -41,190 +49,125 @@ def accuracy_barchart_by_category(): plt.savefig(Path(PLOTS, 'accuracy_barchart_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() + def timing_boxplot_by_category(): + # Load the data and rename certain columns and values timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) linkq_df['Algorithm'] = 'LinkQ' plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) plainllm_df['Algorithm'] = 'GPT-4' - combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) - combined_df = combined_df.rename(columns={'complexityType': 'Question Type'}) - - df = combined_df.replace(to_replace=TO_REPLACE) + df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) + df = df.rename(columns={'complexityType': 'Question Type'}) + df = df.replace(to_replace=TO_REPLACE) sns.boxplot(df, x='Question Type', y='Total Seconds', order=QUESTION_TYPE_ORDER, hue='Algorithm', palette=PALETTE) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() -def correctness_barchart_by_algorithm(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): - df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) - df = 
df.rename(columns={target_column_name: 'Correctness', 'complexityType': 'Question Type'}) - df = df.replace(to_replace=TO_REPLACE) - - # Assumes same number of questions per category - # If so must be int - num_questions_per_category = len(df) // len(df['Question Type'].unique()) - df[y_axis_label] = 0 - df['Correctness'] = df['Correctness'].apply(lambda x: f'{x}/3') - print(df) - df = df.groupby(['Question Type', 'Correctness']).agg( - {y_axis_label: 'count'}) - df[y_axis_label] = (df[y_axis_label] / num_questions_per_category) * 100 - print - ax = sns.barplot(df, x='Question Type', y=y_axis_label, order=QUESTION_TYPE_ORDER, hue='Correctness', hue_order=["3/3","2/3","1/3","0/3"], palette=palette) - - for container in ax.containers: - ax.bar_label(container, fmt=percent_formatter) - plt.savefig(Path(PLOTS, f'{output_name}.pdf'), bbox_inches='tight', format='pdf') - plt.close() -linkq_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4'} -plainllm_palette = {"GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} -tmp_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} def correctness_barchart(): - df1 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'complexityType', 'category', 'id', 'question']) - df1 = df1.rename(columns={'linkqAnswerCorrect': 'Correctness', 'complexityType': 'Question Type'}) - df1 = df1.replace(to_replace=TO_REPLACE) - num_questions_per_category = len(df1) // len(df1['Question Type'].unique()) - df1 = df1.loc[df1['Correctness'] != 0] - df1['Correctness'] = df1['Correctness'].apply(lambda x: f'LinkQ {x}/3') - - - df2 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) 
- df2 = df2.rename(columns={'plainLLMAnswerCorrect':'Correctness', 'complexityType': 'Question Type'}) - df2 = df2.replace(to_replace=TO_REPLACE) - df2 = df2.loc[df2['Correctness'] != 0] - df2['Correctness'] = df2['Correctness'].apply(lambda x: f'GPT-4 {x}/3') + # Load the data and rename certain columns and values + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) + df = df.replace(to_replace=TO_REPLACE) + df['LinkQ'] = df['LinkQ'].apply(lambda x: f'LinkQ {x}/3') + df['GPT-4'] = df['GPT-4'].apply(lambda x: f'GPT-4 {x}/3') - df = df1._append(df2, ignore_index=True) - # Assumes same number of questions per category - # If so must be int - df['tmp'] = 0 - print(df) - df = df.groupby(['Question Type', 'Correctness']).agg( - {'tmp': 'count'}).unstack(fill_value=0).stack().reset_index() - df['tmp'] = (df['tmp'] / num_questions_per_category) * 100 - print("-----------------------------------------------------------------------------") - print(df) + num_questions_per_type = len(df) // len(df['Question Type'].unique()) # Assumes same number of questions per category + + # Unpivot the LinkQ and GPT-4 columns into Algorithm and Correctness columns + df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correctness') + # Count the correctness values and convert them into percentages + df['Value'] = 0 + df = df.groupby(['Question Type', 'Correctness']).agg( + {'Value': 'count'}).unstack(fill_value=0).stack(future_stack=True).reset_index() + df['Value'] = (df['Value'] / num_questions_per_type) * 100 - ax = sns.barplot(df, x='Question Type', y="tmp", order=QUESTION_TYPE_ORDER, hue='Correctness', + # Plot the data + ax = sns.barplot(df, x='Question Type', y="Value", 
order=QUESTION_TYPE_ORDER, hue='Correctness', hue_order=["LinkQ 3/3","GPT-4 3/3","LinkQ 2/3","GPT-4 2/3","LinkQ 1/3","GPT-4 1/3"], - palette=tmp_palette) + palette=CORRECTNESS_PALETTE) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) plt.savefig(Path(PLOTS, f'correctness.pdf'), bbox_inches='tight', format='pdf') - # plt.show() plt.close() + def correctness_stacked_barchart(): + # Load the data and rename certain columns and values df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) df = df.replace(to_replace=TO_REPLACE) + df['LinkQ'] = df['LinkQ'].apply(lambda x: f'{x}/3') + df['GPT-4'] = df['GPT-4'].apply(lambda x: f'{x}/3') - # custom sort the question types + # Custom sort the question types to keep all the plots consistent df["Question Type"] = pd.Categorical(df["Question Type"], categories=QUESTION_TYPE_ORDER, ordered=True) df = df.sort_values("Question Type") - df['LinkQ'] = df['LinkQ'].apply(lambda x: f'{x}/3') - df['GPT-4'] = df['GPT-4'].apply(lambda x: f'{x}/3') - # Assumes same number of questions per category - # If so must be int - num_questions_per_category = len(df) // len(df['Question Type'].unique()) - print("num_questions_per_category",num_questions_per_category) - + num_questions_per_type = len(df) // len(df['Question Type'].unique()) + # Unpivot the LinkQ and GPT-4 columns into Algorithm and Correctness columns df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correctness') - - df['Count'] = 0 - # print(df) - df = df.groupby(['Question Type', 'Algorithm', 'Correctness']).agg( - {'Count': 'count'}).unstack(fill_value=0).stack().reset_index() - df['Count'] = (df['Count'] / num_questions_per_category) * 100 - print(df) - 
print("-----------------------------------------------------------------------------") - - question_type = df['Question Type'].unique() - algorithms = ['LinkQ', 'GPT-4'] + # Count the correctness values and convert them into percentages + df['Value'] = 0 + df = df.groupby(['Question Type', 'Algorithm', 'Correctness'],observed=False).agg( + {'Value': 'count'}).unstack(fill_value=0).stack(future_stack=True).reset_index() + df['Value'] = (df['Value'] / num_questions_per_type) * 100 + + # Prep the plot data + question_types = df['Question Type'].unique() + x = np.arange(len(question_types)) # X-axis positions for question_types + algorithms = ['LinkQ', 'GPT-4'] # this list determines left to right ordering of the algorithms + correctness = ['3/3','2/3','1/3'] # this list determines bottom to top stacking order of correctness width = 0.35 # Width of the bar - x = np.arange(len(question_type)) # X-axis positions for question_type # Plot side-by-side stacked bars fig, ax = plt.subplots() - - for i, algorithm in enumerate(algorithms): - print("-----------------------------------------------------------------------------") + for alg_idx, algorithm in enumerate(algorithms): # Filter data for the current algorithm algorithm_data = df[df['Algorithm'] == algorithm] - three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '3/3']['Count'].reset_index(drop=True) - two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) - one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) - color3 = tmp_palette[f'{algorithm} 3/3'] - color2 = tmp_palette[f'{algorithm} 2/3'] - color1 = tmp_palette[f'{algorithm} 1/3'] + # Filter again by correctness + filtered_values = list(map( + lambda x: algorithm_data.loc[algorithm_data['Correctness'] == x]['Value'].reset_index(drop=True), + correctness)) - print("algorithm",algorithm) - print("three_three_data",three_three_data) - 
print("two_three_data",two_three_data) - print("added",two_three_data+three_three_data) - bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') - bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') - bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') - ax.set_xlabel('Question Type') - ax.set_ylabel('% Correct') - ax.set_title('Side-by-Side Stacked Bar Chart') - ax.set_xticks(x) - ax.set_xticklabels(question_type) - ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') - plt.tight_layout() - plt.savefig(Path(PLOTS, f'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') - plt.close() + # Loop over all the correctness to stack the bars on top of each other + bottom = None # The first correctness bars will be stacked from the bottom + for correct_idx, correct in enumerate(correctness): + values = filtered_values[correct_idx] # Series containing the values for this algorithm + correctness, by question type + color = CORRECTNESS_PALETTE[f'{algorithm} {correct}'] # Get the color palette for this algorithm + correctness + # Stack the bars for this correctness + bar = ax.bar( + x=x + (alg_idx - 0.5) * width, + height=values, + width=width, + color=color, + label=f'{algorithm} {correct} Correct', + bottom=bottom) + # For the next set of stacked bars, we need to add these count values so we know where we should stack from + bottom = values if (bottom is None) else (bottom + values) - - - fig, ax = plt.subplots() - for i, algorithm in enumerate(algorithms): - print("-----------------------------------------------------------------------------") - # Filter data for the current algorithm - algorithm_data = df[df['Algorithm'] == algorithm] - three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == 
'3/3']['Count'].reset_index(drop=True) - two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) - one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) - zero_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '0/3']['Count'].reset_index(drop=True) - color3 = tmp_palette[f'{algorithm} 3/3'] - color2 = tmp_palette[f'{algorithm} 2/3'] - color1 = tmp_palette[f'{algorithm} 1/3'] - color0 = tmp_palette[f'{algorithm} 0/3'] - - print("algorithm",algorithm) - print("three_three_data",three_three_data) - print("two_three_data",two_three_data) - print("added",two_three_data+three_three_data) - bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') - bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') - bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') - bar0 = ax.bar(x + (i - 0.5) * width, zero_three_data, width, color=color0, bottom=one_three_data+two_three_data+three_three_data, label=f'{algorithm} 0/3 Correct') ax.set_xlabel('Question Type') ax.set_ylabel('% Correct') ax.set_title('Side-by-Side Stacked Bar Chart') ax.set_xticks(x) - ax.set_xticklabels(question_type) + ax.set_xticklabels(question_types) ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.tight_layout() - plt.savefig(Path(PLOTS, f'correctness_stacked_zeros.pdf'), bbox_inches='tight', format='pdf') - # plt.show() + plt.savefig(Path(PLOTS, 'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') plt.close() def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() - # correctness_barchart_by_algorithm(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ 
Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) - # correctness_barchart_by_algorithm(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) correctness_barchart() correctness_stacked_barchart() print("Done creating plots!") From d5ed9abb4dcb0ef2c83eca88e46c45ca69779d58 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 16:16:40 -0500 Subject: [PATCH 08/10] summed percentages --- .../plot/validation_figures.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 004dddf..3729cfb 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -138,28 +138,38 @@ def correctness_stacked_barchart(): lambda x: algorithm_data.loc[algorithm_data['Correctness'] == x]['Value'].reset_index(drop=True), correctness)) + plot_x = x + (alg_idx - 0.5) * width + bottom = np.zeros(len(question_types)) # The first correctness bars will be stacked from the bottom # Loop over all the correctness to stack the bars on top of each other - bottom = None # The first correctness bars will be stacked from the bottom for correct_idx, correct in enumerate(correctness): values = filtered_values[correct_idx] # Series containing the values for this algorithm + correctness, by question type color = CORRECTNESS_PALETTE[f'{algorithm} {correct}'] # Get the color palette for this algorithm + correctness # Stack the bars for this correctness bar = ax.bar( - x=x + (alg_idx - 0.5) * width, + x=plot_x, height=values, width=width, color=color, - label=f'{algorithm} {correct} Correct', + label=f'{algorithm} {correct}', 
bottom=bottom) + + # for xpos, value, y in zip(plot_x, values, bottom): + # if value != 0.0: + # ax.text(x=xpos, y=y + value/2, s=percent_formatter(value), ha='center', va='center', fontsize=10) + # For the next set of stacked bars, we need to add these count values so we know where we should stack from - bottom = values if (bottom is None) else (bottom + values) + bottom += values + + # Label the percentage sums + for xpos, total in zip(plot_x, bottom): + ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=10) ax.set_xlabel('Question Type') ax.set_ylabel('% Correct') - ax.set_title('Side-by-Side Stacked Bar Chart') + # ax.set_title('Side-by-Side Stacked Bar Chart') ax.set_xticks(x) ax.set_xticklabels(question_types) - ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + ax.legend(title="# Correct / 3 Attempts", title_fontsize=10, bbox_to_anchor=(1, 1), loc='upper left') plt.tight_layout() plt.savefig(Path(PLOTS, 'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') plt.close() From 6e8fe8ca6a6974a764b6e1f5148070831223f3af Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 2 Dec 2024 09:30:10 -0500 Subject: [PATCH 09/10] tweaked stacked bar chart --- .../evaluations/mintaka-wikidata/plot/validation_figures.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 3729cfb..dbedf84 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -126,10 +126,11 @@ def correctness_stacked_barchart(): x = np.arange(len(question_types)) # X-axis positions for question_types algorithms = ['LinkQ', 'GPT-4'] # this list determines left to right ordering of the algorithms correctness = ['3/3','2/3','1/3'] # this list determines bottom to top stacking order of correctness - width 
= 0.35 # Width of the bar + width = 0.38 # Width of the bar # Plot side-by-side stacked bars fig, ax = plt.subplots() + fig.set_figwidth(7) for alg_idx, algorithm in enumerate(algorithms): # Filter data for the current algorithm algorithm_data = df[df['Algorithm'] == algorithm] @@ -162,7 +163,7 @@ def correctness_stacked_barchart(): # Label the percentage sums for xpos, total in zip(plot_x, bottom): - ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=10) + ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=11) ax.set_xlabel('Question Type') ax.set_ylabel('% Correct') @@ -170,6 +171,7 @@ def correctness_stacked_barchart(): ax.set_xticks(x) ax.set_xticklabels(question_types) ax.legend(title="# Correct / 3 Attempts", title_fontsize=10, bbox_to_anchor=(1, 1), loc='upper left') + plt.grid(axis='x', which='both', visible=False) plt.tight_layout() plt.savefig(Path(PLOTS, 'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') plt.close() From 2248580f06847273eb3cd6ebdd49e358c967a797 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 2 Dec 2024 15:11:41 -0500 Subject: [PATCH 10/10] updated stacked bar chart visuals --- .../mintaka-wikidata/plot/validation_figures.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index dbedf84..7bb4eb4 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -16,7 +16,7 @@ PLOTS = Path(ROOT / 'plots') def percent_formatter(x): - return f'{round(x)}%' + return f'{'{:.1%}'.format(x/100)}' CORRECTNESS_PALETTE = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": 
'#fdbf6f'} QUESTION_TYPE_ORDER = ['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"] @@ -130,7 +130,7 @@ def correctness_stacked_barchart(): # Plot side-by-side stacked bars fig, ax = plt.subplots() - fig.set_figwidth(7) + fig.set_figwidth(8) for alg_idx, algorithm in enumerate(algorithms): # Filter data for the current algorithm algorithm_data = df[df['Algorithm'] == algorithm] @@ -152,6 +152,8 @@ def correctness_stacked_barchart(): width=width, color=color, label=f'{algorithm} {correct}', + edgecolor="black", + linewidth=0.5, bottom=bottom) # for xpos, value, y in zip(plot_x, values, bottom): @@ -163,7 +165,7 @@ def correctness_stacked_barchart(): # Label the percentage sums for xpos, total in zip(plot_x, bottom): - ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=11) + ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=9) ax.set_xlabel('Question Type') ax.set_ylabel('% Correct')