From f94da5ce4ae22e1530a03feb30158e7d86961f01 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 14:04:00 -0500 Subject: [PATCH 01/10] standardized output file names, created standalone mintaka-wikidata folder, moved parseCSVFile to own file --- .../{ => mintaka-wikidata}/.gitignore | 0 .../{ => mintaka-wikidata}/README.md | 0 .../calculateMintakaEvaluationMetrics.ts | 17 +++------------ .../curatedMintakaQuestions.csv | 0 .../mintaka-wikidata/data/.gitignore | 1 + .../mintakaEvaluation.ts | 20 +++++++++--------- .../{ => mintaka-wikidata}/plot/.gitignore | 0 .../{ => mintaka-wikidata}/plot/README | 0 .../plot/requirements.txt | 0 .../plot/validation_figures.py | 4 ++-- .../prepMintakaQuestions.ts | 14 +------------ src/utils/evaluations/questions.ts | 6 ------ src/utils/parseCSVFile.ts | 21 +++++++++++++++++++ 13 files changed, 38 insertions(+), 45 deletions(-) rename src/utils/evaluations/{ => mintaka-wikidata}/.gitignore (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/README.md (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/calculateMintakaEvaluationMetrics.ts (94%) rename src/utils/evaluations/{ => mintaka-wikidata}/curatedMintakaQuestions.csv (100%) create mode 100644 src/utils/evaluations/mintaka-wikidata/data/.gitignore rename src/utils/evaluations/{ => mintaka-wikidata}/mintakaEvaluation.ts (93%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/.gitignore (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/README (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/requirements.txt (100%) rename src/utils/evaluations/{ => mintaka-wikidata}/plot/validation_figures.py (95%) rename src/utils/evaluations/{ => mintaka-wikidata}/prepMintakaQuestions.ts (90%) delete mode 100644 src/utils/evaluations/questions.ts create mode 100644 src/utils/parseCSVFile.ts diff --git a/src/utils/evaluations/.gitignore b/src/utils/evaluations/mintaka-wikidata/.gitignore similarity index 100% rename from 
src/utils/evaluations/.gitignore rename to src/utils/evaluations/mintaka-wikidata/.gitignore diff --git a/src/utils/evaluations/README.md b/src/utils/evaluations/mintaka-wikidata/README.md similarity index 100% rename from src/utils/evaluations/README.md rename to src/utils/evaluations/mintaka-wikidata/README.md diff --git a/src/utils/evaluations/calculateMintakaEvaluationMetrics.ts b/src/utils/evaluations/mintaka-wikidata/calculateMintakaEvaluationMetrics.ts similarity index 94% rename from src/utils/evaluations/calculateMintakaEvaluationMetrics.ts rename to src/utils/evaluations/mintaka-wikidata/calculateMintakaEvaluationMetrics.ts index 694988c..1a970ba 100644 --- a/src/utils/evaluations/calculateMintakaEvaluationMetrics.ts +++ b/src/utils/evaluations/mintaka-wikidata/calculateMintakaEvaluationMetrics.ts @@ -6,9 +6,10 @@ import fs from "fs" import papaparse from "papaparse" import { EvaluationOutputRowType } from "./mintakaEvaluation"; +import { parseCSVFile } from "utils/parseCSVFile"; -calculateMetrics("./LinkQ Evaluation Output.csv","./Plain LLM Evaluation Output.csv","./output.csv") +calculateMetrics("./data/linkq-evaluation-results.csv","./data/plainllm-evaluation-results.csv","./data/aggregated-evaluation-results.csv") type MetricType = { complexityType: string, @@ -41,6 +42,7 @@ async function calculateMetrics( parseCSVFile(linkqDataPath), parseCSVFile(plainLLMDataPath), ]) + console.log("linkqData",linkqData) console.log("Parsed data") if(linkqData.length !== plainLLMData.length) { throw new Error(`linkqData and plainLLMData lengths do not match`) @@ -182,19 +184,6 @@ function isSyntaxCorrect(row: EvaluationOutputRowType) { return value === "YES" } - -export function parseCSVFile(path:string):Promise { - return new Promise((resolve) => { - const file = fs.createReadStream(path) - papaparse.parse(file, { - header: true, - complete: function(results) { - resolve(results.data) - } - }) - }) -} - function meanAndStd(numArray: number[]) { let min = Infinity 
let max = -Infinity diff --git a/src/utils/evaluations/curatedMintakaQuestions.csv b/src/utils/evaluations/mintaka-wikidata/curatedMintakaQuestions.csv similarity index 100% rename from src/utils/evaluations/curatedMintakaQuestions.csv rename to src/utils/evaluations/mintaka-wikidata/curatedMintakaQuestions.csv diff --git a/src/utils/evaluations/mintaka-wikidata/data/.gitignore b/src/utils/evaluations/mintaka-wikidata/data/.gitignore new file mode 100644 index 0000000..16f2dc5 --- /dev/null +++ b/src/utils/evaluations/mintaka-wikidata/data/.gitignore @@ -0,0 +1 @@ +*.csv \ No newline at end of file diff --git a/src/utils/evaluations/mintakaEvaluation.ts b/src/utils/evaluations/mintaka-wikidata/mintakaEvaluation.ts similarity index 93% rename from src/utils/evaluations/mintakaEvaluation.ts rename to src/utils/evaluations/mintaka-wikidata/mintakaEvaluation.ts index 0159e11..c4505d3 100644 --- a/src/utils/evaluations/mintakaEvaluation.ts +++ b/src/utils/evaluations/mintaka-wikidata/mintakaEvaluation.ts @@ -15,15 +15,15 @@ if (process.env.HTTPS_PROXY) { import fs from "fs" import papaparse from "papaparse" -import { ChatGPTAPI } from "../ChatGPTAPI" -import { tryParsingOutQuery } from "../tryParsingOutQuery" -import { runQuery } from "../knowledgeBase/runQuery" -import { summarizeQueryResults } from "../summarizeQueryResults" -import { getEntityDataFromQuery } from "../knowledgeBase/getEntityData" -import { formatSparqlResultsAsString } from "../formatSparqlResultsAsString" +import { ChatGPTAPI } from "../../ChatGPTAPI" +import { tryParsingOutQuery } from "../../tryParsingOutQuery" +import { runQuery } from "../../knowledgeBase/runQuery" +import { summarizeQueryResults } from "../../summarizeQueryResults" +import { getEntityDataFromQuery } from "../../knowledgeBase/getEntityData" +import { formatSparqlResultsAsString } from "../../formatSparqlResultsAsString" import { QUESTIONS } from "./questions" -import { INITIAL_SYSTEM_MESSAGE } from "../knowledgeBase/prompts" 
-import { queryBuildingWorkflow } from "../queryBuildingWorkflow" +import { INITIAL_SYSTEM_MESSAGE } from "../../knowledgeBase/prompts" +import { queryBuildingWorkflow } from "../../queryBuildingWorkflow" import { loadEnv } from 'vite' const ENV = loadEnv("development","../../../") @@ -124,7 +124,7 @@ async function runMintakaEvaluation( export async function runLinkQMintakaEvaluation() { return await runMintakaEvaluation( - `LinkQ Evaluation Output ${new Date().getTime()}.csv`, + `linkq-evaluation-output-${new Date().getTime()}.csv`, async (chatGPT:ChatGPTAPI, question:string) => { //force the LLM to start the query building workflow chatGPT.messages = [ @@ -155,7 +155,7 @@ export async function runLinkQMintakaEvaluation() { export async function runPlainLLMMintakaEvaluation() { return await runMintakaEvaluation( - `Plain LLM Evaluation Output ${new Date().getTime()}.csv`, + `plainllm-evaluation-results-${new Date().getTime()}.csv`, async (chatGPT:ChatGPTAPI, question:string) => { return await chatGPT.sendMessages([ { diff --git a/src/utils/evaluations/plot/.gitignore b/src/utils/evaluations/mintaka-wikidata/plot/.gitignore similarity index 100% rename from src/utils/evaluations/plot/.gitignore rename to src/utils/evaluations/mintaka-wikidata/plot/.gitignore diff --git a/src/utils/evaluations/plot/README b/src/utils/evaluations/mintaka-wikidata/plot/README similarity index 100% rename from src/utils/evaluations/plot/README rename to src/utils/evaluations/mintaka-wikidata/plot/README diff --git a/src/utils/evaluations/plot/requirements.txt b/src/utils/evaluations/mintaka-wikidata/plot/requirements.txt similarity index 100% rename from src/utils/evaluations/plot/requirements.txt rename to src/utils/evaluations/mintaka-wikidata/plot/requirements.txt diff --git a/src/utils/evaluations/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py similarity index 95% rename from src/utils/evaluations/plot/validation_figures.py rename to 
src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 0ee4d29..a1369ea 100644 --- a/src/utils/evaluations/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -10,7 +10,7 @@ sns.set(rc={'figure.dpi': 300, 'savefig.dpi': 300}) ROOT = Path(__file__).parent -DATA = Path(ROOT / 'raw_data') +DATA = Path(ROOT) PLOTS = Path(ROOT / 'plots') def get_aggregated_accuracy_data(): @@ -21,7 +21,7 @@ def get_aggregated_accuracy_data(): def get_raw_timing_data(): timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] - linkq_df = pd.read_csv(Path(DATA, 'linq-evaluation-results.csv'), usecols=timing_columns) + linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) linkq_df['Algorithm'] = 'LinkQ' plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) plainllm_df['Algorithm'] = 'GPT' diff --git a/src/utils/evaluations/prepMintakaQuestions.ts b/src/utils/evaluations/mintaka-wikidata/prepMintakaQuestions.ts similarity index 90% rename from src/utils/evaluations/prepMintakaQuestions.ts rename to src/utils/evaluations/mintaka-wikidata/prepMintakaQuestions.ts index af7fe06..fc1bdde 100644 --- a/src/utils/evaluations/prepMintakaQuestions.ts +++ b/src/utils/evaluations/mintaka-wikidata/prepMintakaQuestions.ts @@ -4,9 +4,9 @@ //npx tsx prepMintakaQuestions.ts import fs from "fs" -import papaparse from "papaparse" import { MintakaQuestionType } from "./mintakaEvaluation"; +import { parseCSVFile } from "utils/parseCSVFile"; prepMintakaQuestions() @@ -86,15 +86,3 @@ export const QUESTIONS:MintakaQuestionType[] = ${JSON.stringify(filteredQuestion fs.writeFileSync("./questions.ts",questionsFileContent) console.log("Done prepping Mintaka questions!") } - -export function parseCSVFile(path:string):Promise { - return new Promise((resolve) => { - const file = fs.createReadStream(path) - papaparse.parse(file, { - header: true, - complete: 
function(results) { - resolve(results.data) - } - }) - }) -} \ No newline at end of file diff --git a/src/utils/evaluations/questions.ts b/src/utils/evaluations/questions.ts deleted file mode 100644 index 7bd7259..0000000 --- a/src/utils/evaluations/questions.ts +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) 2024 Massachusetts Institute of Technology -// SPDX-License-Identifier: MIT - -import { MintakaQuestionType } from "./mintakaEvaluation"; - -export const QUESTIONS:MintakaQuestionType[] = [] \ No newline at end of file diff --git a/src/utils/parseCSVFile.ts b/src/utils/parseCSVFile.ts new file mode 100644 index 0000000..8afcd91 --- /dev/null +++ b/src/utils/parseCSVFile.ts @@ -0,0 +1,21 @@ +// Copyright (c) 2024 Massachusetts Institute of Technology +// SPDX-License-Identifier: MIT + +import fs from "fs" +import papaparse from "papaparse" + +export function parseCSVFile(path:string):Promise { + return new Promise((resolve, reject) => { + if(!fs.existsSync(path)) { + return reject(new Error(`The path '${path}' does not exist`)) + } + const file = fs.createReadStream(path) + papaparse.parse(file, { + header: true, + complete: function(results) { + console.log(path, results.data[0]) + resolve(results.data) + } + }) + }) +} \ No newline at end of file From 978e582d9e36fb3dbf147dd0529176d429d35551 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 14:39:19 -0500 Subject: [PATCH 02/10] reproduce images in submission --- .../evaluations/mintaka-wikidata/plot/README | 12 ------------ .../evaluations/mintaka-wikidata/plot/README.md | 16 ++++++++++++++++ .../mintaka-wikidata/plot/validation_figures.py | 17 ++++++++++------- 3 files changed, 26 insertions(+), 19 deletions(-) delete mode 100644 src/utils/evaluations/mintaka-wikidata/plot/README create mode 100644 src/utils/evaluations/mintaka-wikidata/plot/README.md diff --git a/src/utils/evaluations/mintaka-wikidata/plot/README b/src/utils/evaluations/mintaka-wikidata/plot/README deleted file mode 100644 
index 2ccc549..0000000 --- a/src/utils/evaluations/mintaka-wikidata/plot/README +++ /dev/null @@ -1,12 +0,0 @@ -1.) Go to the evaluation results in google drive. -2.) Download as a CSV without any extraneous tables that may be in the sheet - - aggregated results - - linkq results - - plainllm results -3.) Create a new folder within this one called 'raw_data' -4.) Place the CSVs from step 2 in the 'raw_data' folder -5.) Rename the CSVs: - - 'Evaluation for CHI - Aggregated Results': 'aggregated-evaluation-results.csv' - - 'Evaluation for CHI - Plain LLM Evaluation Output': 'plainllm-evaluation-results.csv' - - 'Evaluation for CHI - LinkQ Evaluation Output': 'linq-evaluation-results.csv' -6.) Run validation_figures.py in an environment with seaborn installed. (An example requirements.txt is provided.) \ No newline at end of file diff --git a/src/utils/evaluations/mintaka-wikidata/plot/README.md b/src/utils/evaluations/mintaka-wikidata/plot/README.md new file mode 100644 index 0000000..9e7cbb3 --- /dev/null +++ b/src/utils/evaluations/mintaka-wikidata/plot/README.md @@ -0,0 +1,16 @@ +1. Download the evaluation results from TODO +2. Place the CSVs in the `../data` folder +3. Rename the CSVs, if applicable: + - 'Evaluation for CHI - Aggregated Results': 'aggregated-evaluation-results.csv' + - 'Evaluation for CHI - Plain LLM Evaluation Output': 'plainllm-evaluation-results.csv' + - 'Evaluation for CHI - LinkQ Evaluation Output': 'linkq-evaluation-results.csv' +4. Create a new conda environment, activate it, and download the requirements +``` +conda create --name linkq python=3.12 +conda activate linkq +pip install -r requirements.txt +``` +5. 
Run the script to generate the plots +``` +python validation_figures.py +``` \ No newline at end of file diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index a1369ea..ba815db 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -10,12 +10,12 @@ sns.set(rc={'figure.dpi': 300, 'savefig.dpi': 300}) ROOT = Path(__file__).parent -DATA = Path(ROOT) +DATA = Path(ROOT.parent / 'data') PLOTS = Path(ROOT / 'plots') def get_aggregated_accuracy_data(): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) - df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT', 'complexityType': 'Category', 'category': 'Domain'}) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Category', 'category': 'Domain'}) df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) return df @@ -24,7 +24,7 @@ def get_raw_timing_data(): linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) linkq_df['Algorithm'] = 'LinkQ' plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) - plainllm_df['Algorithm'] = 'GPT' + plainllm_df['Algorithm'] = 'GPT-4' combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) combined_df = combined_df.rename(columns={'complexityType': 'Category', 'category': 'Domain'}) return combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) @@ -32,18 +32,20 @@ def get_raw_timing_data(): def 
percent_formatter(x): return f'{round(x)}%' +palette = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} + def accuracy_barchart_by_category(): df = get_aggregated_accuracy_data() # Assumes same number of questions per category # If so must be int num_questions_per_category = len(df) // len(df['Category'].unique()) df['LinkQ'] = (df['LinkQ'] > 0).astype(int) - df['GPT'] = (df['GPT'] > 0).astype(int) + df['GPT-4'] = (df['GPT-4'] > 0).astype(int) df = pd.melt(df, id_vars=['id', 'Domain', 'Category', 'question'], var_name='Algorithm', value_name='Correct') - df = df.groupby(['Category', 'Algorithm']).agg({'Correct': 'sum'}).reset_index() + df = df.groupby(['Category', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by="Correct",ascending=False).reset_index() df['Fraction'] = [f'{v}/{num_questions_per_category}' for v in df['Correct']] df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Category', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT']) + ax = sns.barplot(df, x='Category', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -52,7 +54,7 @@ def accuracy_barchart_by_category(): def timing_boxplot_by_category(): df = get_raw_timing_data() - sns.boxplot(df, x='Category', y='Total Seconds', hue='Algorithm') + sns.boxplot(df, x='Category', y='Total Seconds', hue='Algorithm', palette=palette) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() @@ -60,6 +62,7 @@ def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() + print("Done creating plots!") if __name__ == '__main__': From 3a56d9c80be64ee5247ecbe5b3cc5b68fd869d6f Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 15:45:58 -0500 Subject: [PATCH 03/10] progress on correctness breakdown plots --- .../plot/validation_figures.py | 62 ++++++++++++------- 1 
file changed, 39 insertions(+), 23 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index ba815db..c4cb690 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -13,39 +13,25 @@ DATA = Path(ROOT.parent / 'data') PLOTS = Path(ROOT / 'plots') -def get_aggregated_accuracy_data(): - df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) - df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Category', 'category': 'Domain'}) - df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) - return df - -def get_raw_timing_data(): - timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] - linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) - linkq_df['Algorithm'] = 'LinkQ' - plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) - plainllm_df['Algorithm'] = 'GPT-4' - combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) - combined_df = combined_df.rename(columns={'complexityType': 'Category', 'category': 'Domain'}) - return combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) - def percent_formatter(x): return f'{round(x)}%' palette = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} def accuracy_barchart_by_category(): - df = get_aggregated_accuracy_data() + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 
'category', 'id', 'question']) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) + df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) # Assumes same number of questions per category # If so must be int - num_questions_per_category = len(df) // len(df['Category'].unique()) + num_questions_per_category = len(df) // len(df['Question Type'].unique()) df['LinkQ'] = (df['LinkQ'] > 0).astype(int) df['GPT-4'] = (df['GPT-4'] > 0).astype(int) - df = pd.melt(df, id_vars=['id', 'Domain', 'Category', 'question'], var_name='Algorithm', value_name='Correct') - df = df.groupby(['Category', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by="Correct",ascending=False).reset_index() + df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correct') + df = df.groupby(['Question Type', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by='Correct',ascending=False).reset_index() df['Fraction'] = [f'{v}/{num_questions_per_category}' for v in df['Correct']] df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Category', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) + ax = sns.barplot(df, x='Question Type', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -53,15 +39,45 @@ def accuracy_barchart_by_category(): plt.close() def timing_boxplot_by_category(): - df = get_raw_timing_data() - sns.boxplot(df, x='Category', y='Total Seconds', hue='Algorithm', palette=palette) + timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] + linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) + linkq_df['Algorithm'] = 'LinkQ' + 
plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) + plainllm_df['Algorithm'] = 'GPT-4' + combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) + combined_df = combined_df.rename(columns={'complexityType': 'Question Type'}) + + df = combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + + sns.boxplot(df, x='Question Type', y='Total Seconds', hue='Algorithm', palette=palette) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() +def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) + df = df.rename(columns={target_column_name: 'Correct', 'complexityType': 'Question Type'}) + df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + + # Assumes same number of questions per category + # If so must be int + num_questions_per_category = len(df) // len(df['Question Type'].unique()) + df[y_axis_label] = 0 + df = df.groupby(['Question Type', 'Correct']).agg( + {y_axis_label: 'count'}) + df[y_axis_label] = (df[y_axis_label] / num_questions_per_category) * 100 + ax = sns.barplot(df, x='Question Type', y=y_axis_label, hue='Correct', hue_order=[3,2,1,0], palette=palette) + + for container in ax.containers: + ax.bar_label(container, fmt=percent_formatter) + plt.savefig(Path(PLOTS, f'{output_name}.pdf'), bbox_inches='tight', format='pdf') + plt.close() + def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() + correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ 
Correctness",output_name="linkq_correctness",palette={0: '#999999', 1: '#7fa0b6', 2: '#528db4', 3: '#1f78b4'}) + correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={0: '#999999', 1: '#fff1e0', 2: '#ffd6a1', 3: '#fdbf6f'}) print("Done creating plots!") From b32121c7facac18ce79d8f0fd675916ff74d818c Mon Sep 17 00:00:00 2001 From: Harry Li Date: Tue, 19 Nov 2024 16:12:27 -0500 Subject: [PATCH 04/10] formatted legend strings, fixed order for all plots --- .../plot/validation_figures.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index c4cb690..22cc64d 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -16,7 +16,8 @@ def percent_formatter(x): return f'{round(x)}%' -palette = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} +QUESTION_TYPE_ORDER = ['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"] +PALETTE = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} def accuracy_barchart_by_category(): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) @@ -31,7 +32,7 @@ def accuracy_barchart_by_category(): df = df.groupby(['Question Type', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by='Correct',ascending=False).reset_index() df['Fraction'] = [f'{v}/{num_questions_per_category}' for v in df['Correct']] df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Question Type', y='% Correct', hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=palette) + ax = sns.barplot(df, x='Question Type', y='% Correct', order=['Comparative', 'Yes/No', 'Generic', 
'MultiHop', "Intersection"], hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=PALETTE) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -49,23 +50,26 @@ def timing_boxplot_by_category(): df = combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) - sns.boxplot(df, x='Question Type', y='Total Seconds', hue='Algorithm', palette=palette) + sns.boxplot(df, x='Question Type', y='Total Seconds', order=QUESTION_TYPE_ORDER, hue='Algorithm', palette=PALETTE) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) - df = df.rename(columns={target_column_name: 'Correct', 'complexityType': 'Question Type'}) + df = df.rename(columns={target_column_name: 'Correctness', 'complexityType': 'Question Type'}) df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) # Assumes same number of questions per category # If so must be int num_questions_per_category = len(df) // len(df['Question Type'].unique()) df[y_axis_label] = 0 - df = df.groupby(['Question Type', 'Correct']).agg( + df['Correctness'] = df['Correctness'].apply(lambda x: f'{x}/3') + print(df) + df = df.groupby(['Question Type', 'Correctness']).agg( {y_axis_label: 'count'}) df[y_axis_label] = (df[y_axis_label] / num_questions_per_category) * 100 - ax = sns.barplot(df, x='Question Type', y=y_axis_label, hue='Correct', hue_order=[3,2,1,0], palette=palette) + print + ax = sns.barplot(df, x='Question Type', y=y_axis_label, order=QUESTION_TYPE_ORDER, hue='Correctness', 
hue_order=["3/3","2/3","1/3","0/3"], palette=palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) @@ -76,8 +80,8 @@ def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() - correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={0: '#999999', 1: '#7fa0b6', 2: '#528db4', 3: '#1f78b4'}) - correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={0: '#999999', 1: '#fff1e0', 2: '#ffd6a1', 3: '#fdbf6f'}) + correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) + correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) print("Done creating plots!") From 9b2792bfe20c953d1d572dc5638b9bb481fe3beb Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 14:10:14 -0500 Subject: [PATCH 05/10] stack bar charts checkpoint --- .../plot/validation_figures.py | 157 +++++++++++++++++- 1 file changed, 151 insertions(+), 6 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 22cc64d..0eb15de 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -6,6 +6,7 @@ import pandas as pd import seaborn as sns import matplotlib.pyplot as plt +import numpy as np sns.set(rc={'figure.dpi': 300, 'savefig.dpi': 300}) @@ -18,11 +19,12 @@ def percent_formatter(x): QUESTION_TYPE_ORDER = ['Comparative', 
'Yes/No', 'Generic', 'MultiHop', "Intersection"] PALETTE = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} +TO_REPLACE = {'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'} def accuracy_barchart_by_category(): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) - df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + df = df.replace(to_replace=TO_REPLACE) # Assumes same number of questions per category # If so must be int num_questions_per_category = len(df) // len(df['Question Type'].unique()) @@ -48,16 +50,16 @@ def timing_boxplot_by_category(): combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) combined_df = combined_df.rename(columns={'complexityType': 'Question Type'}) - df = combined_df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + df = combined_df.replace(to_replace=TO_REPLACE) sns.boxplot(df, x='Question Type', y='Total Seconds', order=QUESTION_TYPE_ORDER, hue='Algorithm', palette=PALETTE) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() -def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): +def correctness_barchart_by_algorithm(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={target_column_name: 'Correctness', 'complexityType': 
'Question Type'}) - df = df.replace(to_replace={'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'}) + df = df.replace(to_replace=TO_REPLACE) # Assumes same number of questions per category # If so must be int @@ -76,12 +78,155 @@ def correctness_boxplot_by_category(target_column_name:str,y_axis_label:str,outp plt.savefig(Path(PLOTS, f'{output_name}.pdf'), bbox_inches='tight', format='pdf') plt.close() +linkq_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4'} +plainllm_palette = {"GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} +tmp_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} +def correctness_barchart(): + df1 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df1 = df1.rename(columns={'linkqAnswerCorrect': 'Correctness', 'complexityType': 'Question Type'}) + df1 = df1.replace(to_replace=TO_REPLACE) + num_questions_per_category = len(df1) // len(df1['Question Type'].unique()) + df1 = df1.loc[df1['Correctness'] != 0] + df1['Correctness'] = df1['Correctness'].apply(lambda x: f'LinkQ {x}/3') + + + df2 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df2 = df2.rename(columns={'plainLLMAnswerCorrect':'Correctness', 'complexityType': 'Question Type'}) + df2 = df2.replace(to_replace=TO_REPLACE) + df2 = df2.loc[df2['Correctness'] != 0] + df2['Correctness'] = df2['Correctness'].apply(lambda x: f'GPT-4 {x}/3') + + df = df1._append(df2, ignore_index=True) + + # Assumes same number of questions per category + # If so must be int + df['tmp'] = 0 + print(df) 
+ df = df.groupby(['Question Type', 'Correctness']).agg( + {'tmp': 'count'}) + df['tmp'] = (df['tmp'] / num_questions_per_category) * 100 + print("-----------------------------------------------------------------------------") + print(df) + + + ax = sns.barplot(df, x='Question Type', y="tmp", order=QUESTION_TYPE_ORDER, hue='Correctness', + hue_order=["LinkQ 3/3","LinkQ 2/3","LinkQ 1/3","GPT-4 3/3","GPT-4 2/3","GPT-4 1/3"], + palette=tmp_palette) + + for container in ax.containers: + ax.bar_label(container, fmt=percent_formatter) + plt.savefig(Path(PLOTS, f'correctness.pdf'), bbox_inches='tight', format='pdf') + plt.show() + plt.close() + +def correctness_stacked_barchart(): + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) + df = df.replace(to_replace=TO_REPLACE) + + # custom sort the question types + df["Question Type"] = pd.Categorical(df["Question Type"], categories=QUESTION_TYPE_ORDER, ordered=True) + df = df.sort_values("Question Type") + + df['LinkQ'] = df['LinkQ'].apply(lambda x: f'{x}/3') + df['GPT-4'] = df['GPT-4'].apply(lambda x: f'{x}/3') + + # Assumes same number of questions per category + # If so must be int + num_questions_per_category = len(df) // len(df['Question Type'].unique()) + print("num_questions_per_category",num_questions_per_category) + + + df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correctness') + + + df['Count'] = 0 + # print(df) + df = df.groupby(['Question Type', 'Algorithm', 'Correctness']).agg( + {'Count': 'count'}).unstack(fill_value=0).stack().reset_index() + df['Count'] = (df['Count'] / num_questions_per_category) * 100 + print(df) + print("-----------------------------------------------------------------------------") + 
+ question_type = df['Question Type'].unique() + algorithms = ['LinkQ', 'GPT-4'] + width = 0.35 # Width of the bar + x = np.arange(len(question_type)) # X-axis positions for question_type + + # Plot side-by-side stacked bars + fig, ax = plt.subplots() + + for i, algorithm in enumerate(algorithms): + print("-----------------------------------------------------------------------------") + # Filter data for the current algorithm + algorithm_data = df[df['Algorithm'] == algorithm] + three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '3/3']['Count'].reset_index(drop=True) + two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) + one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) + color3 = tmp_palette[f'{algorithm} 3/3'] + color2 = tmp_palette[f'{algorithm} 2/3'] + color1 = tmp_palette[f'{algorithm} 1/3'] + + print("algorithm",algorithm) + print("three_three_data",three_three_data) + print("two_three_data",two_three_data) + print("added",two_three_data+three_three_data) + bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') + bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') + bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') + ax.set_xlabel('Question Type') + ax.set_ylabel('% Correct') + ax.set_title('Side-by-Side Stacked Bar Chart') + ax.set_xticks(x) + ax.set_xticklabels(question_type) + ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.tight_layout() + plt.savefig(Path(PLOTS, f'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') + plt.close() + + + + fig, ax = plt.subplots() + for i, algorithm in enumerate(algorithms): + 
print("-----------------------------------------------------------------------------") + # Filter data for the current algorithm + algorithm_data = df[df['Algorithm'] == algorithm] + three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '3/3']['Count'].reset_index(drop=True) + two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) + one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) + zero_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '0/3']['Count'].reset_index(drop=True) + color3 = tmp_palette[f'{algorithm} 3/3'] + color2 = tmp_palette[f'{algorithm} 2/3'] + color1 = tmp_palette[f'{algorithm} 1/3'] + color0 = tmp_palette[f'{algorithm} 0/3'] + + print("algorithm",algorithm) + print("three_three_data",three_three_data) + print("two_three_data",two_three_data) + print("added",two_three_data+three_three_data) + bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') + bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') + bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') + bar0 = ax.bar(x + (i - 0.5) * width, zero_three_data, width, color=color0, bottom=one_three_data+two_three_data+three_three_data, label=f'{algorithm} 0/3 Correct') + ax.set_xlabel('Question Type') + ax.set_ylabel('% Correct') + ax.set_title('Side-by-Side Stacked Bar Chart') + ax.set_xticks(x) + ax.set_xticklabels(question_type) + ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + plt.tight_layout() + plt.savefig(Path(PLOTS, f'correctness_stacked_zeros.pdf'), bbox_inches='tight', format='pdf') + plt.show() + plt.close() + def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() - 
correctness_boxplot_by_category(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) - correctness_boxplot_by_category(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) + # correctness_barchart_by_algorithm(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) + # correctness_barchart_by_algorithm(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) + # correctness_barchart() + correctness_stacked_barchart() print("Done creating plots!") From 43fcbaa5582a3e49bc137e144e4b5334254d5190 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 14:20:30 -0500 Subject: [PATCH 06/10] filled in missing data --- .../mintaka-wikidata/plot/validation_figures.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 0eb15de..0efe2a6 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -103,20 +103,20 @@ def correctness_barchart(): df['tmp'] = 0 print(df) df = df.groupby(['Question Type', 'Correctness']).agg( - {'tmp': 'count'}) + {'tmp': 'count'}).unstack(fill_value=0).stack().reset_index() df['tmp'] = (df['tmp'] / num_questions_per_category) * 100 print("-----------------------------------------------------------------------------") print(df) ax = sns.barplot(df, 
x='Question Type', y="tmp", order=QUESTION_TYPE_ORDER, hue='Correctness', - hue_order=["LinkQ 3/3","LinkQ 2/3","LinkQ 1/3","GPT-4 3/3","GPT-4 2/3","GPT-4 1/3"], + hue_order=["LinkQ 3/3","GPT-4 3/3","LinkQ 2/3","GPT-4 2/3","LinkQ 1/3","GPT-4 1/3"], palette=tmp_palette) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) plt.savefig(Path(PLOTS, f'correctness.pdf'), bbox_inches='tight', format='pdf') - plt.show() + # plt.show() plt.close() def correctness_stacked_barchart(): @@ -216,7 +216,7 @@ def correctness_stacked_barchart(): ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.tight_layout() plt.savefig(Path(PLOTS, f'correctness_stacked_zeros.pdf'), bbox_inches='tight', format='pdf') - plt.show() + # plt.show() plt.close() def main(): @@ -225,7 +225,7 @@ def main(): timing_boxplot_by_category() # correctness_barchart_by_algorithm(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) # correctness_barchart_by_algorithm(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) - # correctness_barchart() + correctness_barchart() correctness_stacked_barchart() print("Done creating plots!") From c77f6103815774802d303a5aecf4517342ac5be4 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 15:47:17 -0500 Subject: [PATCH 07/10] cleaned up and commented --- .../plot/validation_figures.py | 209 +++++++----------- 1 file changed, 76 insertions(+), 133 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 0efe2a6..004dddf 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ 
b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -3,6 +3,7 @@ from pathlib import Path +from functools import reduce import pandas as pd import seaborn as sns import matplotlib.pyplot as plt @@ -17,23 +18,30 @@ def percent_formatter(x): return f'{round(x)}%' +CORRECTNESS_PALETTE = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} QUESTION_TYPE_ORDER = ['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"] PALETTE = {'LinkQ': '#1f78b4', 'GPT-4': '#fdbf6f'} TO_REPLACE = {'multihop': 'MultiHop', 'generic': 'Generic', 'intersection': 'Intersection', 'yesno': 'Yes/No', 'comparative': 'Comparative'} def accuracy_barchart_by_category(): + # Load the data and rename certain columns and values df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) df = df.replace(to_replace=TO_REPLACE) - # Assumes same number of questions per category - # If so must be int - num_questions_per_category = len(df) // len(df['Question Type'].unique()) + + num_questions_per_type = len(df) // len(df['Question Type'].unique()) # Assumes same number of questions per category df['LinkQ'] = (df['LinkQ'] > 0).astype(int) df['GPT-4'] = (df['GPT-4'] > 0).astype(int) + + # Unpivot the LinkQ and GPT-4 columns into Algorithm and Correctness columns df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correct') + + # Count the correctness values and convert them into percentages df = df.groupby(['Question Type', 'Algorithm']).agg({'Correct': 'sum'}).sort_values(by='Correct',ascending=False).reset_index() - df['Fraction'] = 
[f'{v}/{num_questions_per_category}' for v in df['Correct']] - df['% Correct'] = (df['Correct'] / num_questions_per_category) * 100 + df['Fraction'] = [f'{v}/{num_questions_per_type}' for v in df['Correct']] + df['% Correct'] = (df['Correct'] / num_questions_per_type) * 100 + + # Plot the data ax = sns.barplot(df, x='Question Type', y='% Correct', order=['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"], hue='Algorithm', hue_order=['LinkQ', 'GPT-4'], palette=PALETTE) for container in ax.containers: @@ -41,190 +49,125 @@ def accuracy_barchart_by_category(): plt.savefig(Path(PLOTS, 'accuracy_barchart_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() + def timing_boxplot_by_category(): + # Load the data and rename certain columns and values timing_columns = ['Total Seconds', 'id', 'complexityType', 'category'] linkq_df = pd.read_csv(Path(DATA, 'linkq-evaluation-results.csv'), usecols=timing_columns) linkq_df['Algorithm'] = 'LinkQ' plainllm_df = pd.read_csv(Path(DATA, 'plainllm-evaluation-results.csv'), usecols=timing_columns) plainllm_df['Algorithm'] = 'GPT-4' - combined_df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) - combined_df = combined_df.rename(columns={'complexityType': 'Question Type'}) - - df = combined_df.replace(to_replace=TO_REPLACE) + df = pd.concat([linkq_df, plainllm_df]).reset_index(drop=True) + df = df.rename(columns={'complexityType': 'Question Type'}) + df = df.replace(to_replace=TO_REPLACE) sns.boxplot(df, x='Question Type', y='Total Seconds', order=QUESTION_TYPE_ORDER, hue='Algorithm', palette=PALETTE) plt.savefig(Path(PLOTS, 'timing_boxplot_by_category.pdf'), bbox_inches='tight', format='pdf') plt.close() -def correctness_barchart_by_algorithm(target_column_name:str,y_axis_label:str,output_name:str,palette:dict): - df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=[target_column_name, 'complexityType', 'category', 'id', 'question']) - df = 
df.rename(columns={target_column_name: 'Correctness', 'complexityType': 'Question Type'}) - df = df.replace(to_replace=TO_REPLACE) - - # Assumes same number of questions per category - # If so must be int - num_questions_per_category = len(df) // len(df['Question Type'].unique()) - df[y_axis_label] = 0 - df['Correctness'] = df['Correctness'].apply(lambda x: f'{x}/3') - print(df) - df = df.groupby(['Question Type', 'Correctness']).agg( - {y_axis_label: 'count'}) - df[y_axis_label] = (df[y_axis_label] / num_questions_per_category) * 100 - print - ax = sns.barplot(df, x='Question Type', y=y_axis_label, order=QUESTION_TYPE_ORDER, hue='Correctness', hue_order=["3/3","2/3","1/3","0/3"], palette=palette) - - for container in ax.containers: - ax.bar_label(container, fmt=percent_formatter) - plt.savefig(Path(PLOTS, f'{output_name}.pdf'), bbox_inches='tight', format='pdf') - plt.close() -linkq_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4'} -plainllm_palette = {"GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} -tmp_palette = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": '#fdbf6f'} def correctness_barchart(): - df1 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'complexityType', 'category', 'id', 'question']) - df1 = df1.rename(columns={'linkqAnswerCorrect': 'Correctness', 'complexityType': 'Question Type'}) - df1 = df1.replace(to_replace=TO_REPLACE) - num_questions_per_category = len(df1) // len(df1['Question Type'].unique()) - df1 = df1.loc[df1['Correctness'] != 0] - df1['Correctness'] = df1['Correctness'].apply(lambda x: f'LinkQ {x}/3') - - - df2 = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) 
- df2 = df2.rename(columns={'plainLLMAnswerCorrect':'Correctness', 'complexityType': 'Question Type'}) - df2 = df2.replace(to_replace=TO_REPLACE) - df2 = df2.loc[df2['Correctness'] != 0] - df2['Correctness'] = df2['Correctness'].apply(lambda x: f'GPT-4 {x}/3') + # Load the data and rename certain columns and values + df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) + df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) + df = df.replace(to_replace=TO_REPLACE) + df['LinkQ'] = df['LinkQ'].apply(lambda x: f'LinkQ {x}/3') + df['GPT-4'] = df['GPT-4'].apply(lambda x: f'GPT-4 {x}/3') - df = df1._append(df2, ignore_index=True) - # Assumes same number of questions per category - # If so must be int - df['tmp'] = 0 - print(df) - df = df.groupby(['Question Type', 'Correctness']).agg( - {'tmp': 'count'}).unstack(fill_value=0).stack().reset_index() - df['tmp'] = (df['tmp'] / num_questions_per_category) * 100 - print("-----------------------------------------------------------------------------") - print(df) + num_questions_per_type = len(df) // len(df['Question Type'].unique()) # Assumes same number of questions per category + + # Unpivot the LinkQ and GPT-4 columns into Algorithm and Correctness columns + df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correctness') + # Count the correctness values and convert them into percentages + df['Value'] = 0 + df = df.groupby(['Question Type', 'Correctness']).agg( + {'Value': 'count'}).unstack(fill_value=0).stack(future_stack=True).reset_index() + df['Value'] = (df['Value'] / num_questions_per_type) * 100 - ax = sns.barplot(df, x='Question Type', y="tmp", order=QUESTION_TYPE_ORDER, hue='Correctness', + # Plot the data + ax = sns.barplot(df, x='Question Type', y="Value", 
order=QUESTION_TYPE_ORDER, hue='Correctness', hue_order=["LinkQ 3/3","GPT-4 3/3","LinkQ 2/3","GPT-4 2/3","LinkQ 1/3","GPT-4 1/3"], - palette=tmp_palette) + palette=CORRECTNESS_PALETTE) for container in ax.containers: ax.bar_label(container, fmt=percent_formatter) plt.savefig(Path(PLOTS, f'correctness.pdf'), bbox_inches='tight', format='pdf') - # plt.show() plt.close() + def correctness_stacked_barchart(): + # Load the data and rename certain columns and values df = pd.read_csv(Path(DATA, 'aggregated-evaluation-results.csv'), usecols=['linkqAnswerCorrect', 'plainLLMAnswerCorrect', 'complexityType', 'category', 'id', 'question']) df = df.rename(columns={'linkqAnswerCorrect': 'LinkQ', 'plainLLMAnswerCorrect': 'GPT-4', 'complexityType': 'Question Type'}) df = df.replace(to_replace=TO_REPLACE) + df['LinkQ'] = df['LinkQ'].apply(lambda x: f'{x}/3') + df['GPT-4'] = df['GPT-4'].apply(lambda x: f'{x}/3') - # custom sort the question types + # Custom sort the question types to keep all the plots consistent df["Question Type"] = pd.Categorical(df["Question Type"], categories=QUESTION_TYPE_ORDER, ordered=True) df = df.sort_values("Question Type") - df['LinkQ'] = df['LinkQ'].apply(lambda x: f'{x}/3') - df['GPT-4'] = df['GPT-4'].apply(lambda x: f'{x}/3') - # Assumes same number of questions per category - # If so must be int - num_questions_per_category = len(df) // len(df['Question Type'].unique()) - print("num_questions_per_category",num_questions_per_category) - + num_questions_per_type = len(df) // len(df['Question Type'].unique()) + # Unpivot the LinkQ and GPT-4 columns into Algorithm and Correctness columns df = pd.melt(df, id_vars=['id', 'category', 'Question Type', 'question'], var_name='Algorithm', value_name='Correctness') - - df['Count'] = 0 - # print(df) - df = df.groupby(['Question Type', 'Algorithm', 'Correctness']).agg( - {'Count': 'count'}).unstack(fill_value=0).stack().reset_index() - df['Count'] = (df['Count'] / num_questions_per_category) * 100 - print(df) - 
print("-----------------------------------------------------------------------------") - - question_type = df['Question Type'].unique() - algorithms = ['LinkQ', 'GPT-4'] + # Count the correctness values and convert them into percentages + df['Value'] = 0 + df = df.groupby(['Question Type', 'Algorithm', 'Correctness'],observed=False).agg( + {'Value': 'count'}).unstack(fill_value=0).stack(future_stack=True).reset_index() + df['Value'] = (df['Value'] / num_questions_per_type) * 100 + + # Prep the plot data + question_types = df['Question Type'].unique() + x = np.arange(len(question_types)) # X-axis positions for question_types + algorithms = ['LinkQ', 'GPT-4'] # this list determines left to right ordering of the algorithms + correctness = ['3/3','2/3','1/3'] # this list determines bottom to top stacking order of correctness width = 0.35 # Width of the bar - x = np.arange(len(question_type)) # X-axis positions for question_type # Plot side-by-side stacked bars fig, ax = plt.subplots() - - for i, algorithm in enumerate(algorithms): - print("-----------------------------------------------------------------------------") + for alg_idx, algorithm in enumerate(algorithms): # Filter data for the current algorithm algorithm_data = df[df['Algorithm'] == algorithm] - three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '3/3']['Count'].reset_index(drop=True) - two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) - one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) - color3 = tmp_palette[f'{algorithm} 3/3'] - color2 = tmp_palette[f'{algorithm} 2/3'] - color1 = tmp_palette[f'{algorithm} 1/3'] + # Filter again by correctness + filtered_values = list(map( + lambda x: algorithm_data.loc[algorithm_data['Correctness'] == x]['Value'].reset_index(drop=True), + correctness)) - print("algorithm",algorithm) - print("three_three_data",three_three_data) - 
print("two_three_data",two_three_data) - print("added",two_three_data+three_three_data) - bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') - bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') - bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') - ax.set_xlabel('Question Type') - ax.set_ylabel('% Correct') - ax.set_title('Side-by-Side Stacked Bar Chart') - ax.set_xticks(x) - ax.set_xticklabels(question_type) - ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') - plt.tight_layout() - plt.savefig(Path(PLOTS, f'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') - plt.close() + # Loop over all the correctness to stack the bars on top of each other + bottom = None # The first correctness bars will be stacked from the bottom + for correct_idx, correct in enumerate(correctness): + values = filtered_values[correct_idx] # Series containing the values for this algorithm + correctness, by question type + color = CORRECTNESS_PALETTE[f'{algorithm} {correct}'] # Get the color palette for this algorithm + correctness + # Stack the bars for this correctness + bar = ax.bar( + x=x + (alg_idx - 0.5) * width, + height=values, + width=width, + color=color, + label=f'{algorithm} {correct} Correct', + bottom=bottom) + # For the next set of stacked bars, we need to add these count values so we know where we should stack from + bottom = values if (bottom is None) else (bottom + values) - - - fig, ax = plt.subplots() - for i, algorithm in enumerate(algorithms): - print("-----------------------------------------------------------------------------") - # Filter data for the current algorithm - algorithm_data = df[df['Algorithm'] == algorithm] - three_three_data = algorithm_data.loc[algorithm_data['Correctness'] == 
'3/3']['Count'].reset_index(drop=True) - two_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '2/3']['Count'].reset_index(drop=True) - one_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '1/3']['Count'].reset_index(drop=True) - zero_three_data = algorithm_data.loc[algorithm_data['Correctness'] == '0/3']['Count'].reset_index(drop=True) - color3 = tmp_palette[f'{algorithm} 3/3'] - color2 = tmp_palette[f'{algorithm} 2/3'] - color1 = tmp_palette[f'{algorithm} 1/3'] - color0 = tmp_palette[f'{algorithm} 0/3'] - - print("algorithm",algorithm) - print("three_three_data",three_three_data) - print("two_three_data",two_three_data) - print("added",two_three_data+three_three_data) - bar3 = ax.bar(x + (i - 0.5) * width, three_three_data, width, color=color3, label=f'{algorithm} 3/3 Correct') - bar2 = ax.bar(x + (i - 0.5) * width, two_three_data, width, color=color2, bottom=three_three_data, label=f'{algorithm} 2/3 Correct') - bar1 = ax.bar(x + (i - 0.5) * width, one_three_data, width, color=color1, bottom=two_three_data+three_three_data, label=f'{algorithm} 1/3 Correct') - bar0 = ax.bar(x + (i - 0.5) * width, zero_three_data, width, color=color0, bottom=one_three_data+two_three_data+three_three_data, label=f'{algorithm} 0/3 Correct') ax.set_xlabel('Question Type') ax.set_ylabel('% Correct') ax.set_title('Side-by-Side Stacked Bar Chart') ax.set_xticks(x) - ax.set_xticklabels(question_type) + ax.set_xticklabels(question_types) ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') plt.tight_layout() - plt.savefig(Path(PLOTS, f'correctness_stacked_zeros.pdf'), bbox_inches='tight', format='pdf') - # plt.show() + plt.savefig(Path(PLOTS, 'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') plt.close() def main(): PLOTS.mkdir(exist_ok=True) accuracy_barchart_by_category() timing_boxplot_by_category() - # correctness_barchart_by_algorithm(target_column_name="linkqAnswerCorrect",y_axis_label="LinkQ 
Correctness",output_name="linkq_correctness",palette={"0/3": '#999999', "1/3": '#c8ddec', "2/3": '#72aad0', "3/3": '#1f78b4'}) - # correctness_barchart_by_algorithm(target_column_name="plainLLMAnswerCorrect",y_axis_label="GPT-4 Correctness",output_name="plainllm_correctness",palette={"0/3": '#999999', "1/3": '#fff4e5', "2/3": '#ffdeb3', "3/3": '#fdbf6f'}) correctness_barchart() correctness_stacked_barchart() print("Done creating plots!") From d5ed9abb4dcb0ef2c83eca88e46c45ca69779d58 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 25 Nov 2024 16:16:40 -0500 Subject: [PATCH 08/10] summed percentages --- .../plot/validation_figures.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 004dddf..3729cfb 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -138,28 +138,38 @@ def correctness_stacked_barchart(): lambda x: algorithm_data.loc[algorithm_data['Correctness'] == x]['Value'].reset_index(drop=True), correctness)) + plot_x = x + (alg_idx - 0.5) * width + bottom = np.zeros(len(question_types)) # The first correctness bars will be stacked from the bottom # Loop over all the correctness to stack the bars on top of each other - bottom = None # The first correctness bars will be stacked from the bottom for correct_idx, correct in enumerate(correctness): values = filtered_values[correct_idx] # Series containing the values for this algorithm + correctness, by question type color = CORRECTNESS_PALETTE[f'{algorithm} {correct}'] # Get the color palette for this algorithm + correctness # Stack the bars for this correctness bar = ax.bar( - x=x + (alg_idx - 0.5) * width, + x=plot_x, height=values, width=width, color=color, - label=f'{algorithm} {correct} Correct', + label=f'{algorithm} {correct}', 
bottom=bottom) + + # for xpos, value, y in zip(plot_x, values, bottom): + # if value != 0.0: + # ax.text(x=xpos, y=y + value/2, s=percent_formatter(value), ha='center', va='center', fontsize=10) + # For the next set of stacked bars, we need to add these count values so we know where we should stack from - bottom = values if (bottom is None) else (bottom + values) + bottom += values + + # Label the percentage sums + for xpos, total in zip(plot_x, bottom): + ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=10) ax.set_xlabel('Question Type') ax.set_ylabel('% Correct') - ax.set_title('Side-by-Side Stacked Bar Chart') + # ax.set_title('Side-by-Side Stacked Bar Chart') ax.set_xticks(x) ax.set_xticklabels(question_types) - ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + ax.legend(title="# Correct / 3 Attempts", title_fontsize=10, bbox_to_anchor=(1, 1), loc='upper left') plt.tight_layout() plt.savefig(Path(PLOTS, 'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') plt.close() From 6e8fe8ca6a6974a764b6e1f5148070831223f3af Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 2 Dec 2024 09:30:10 -0500 Subject: [PATCH 09/10] tweaked stacked bar chart --- .../evaluations/mintaka-wikidata/plot/validation_figures.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index 3729cfb..dbedf84 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -126,10 +126,11 @@ def correctness_stacked_barchart(): x = np.arange(len(question_types)) # X-axis positions for question_types algorithms = ['LinkQ', 'GPT-4'] # this list determines left to right ordering of the algorithms correctness = ['3/3','2/3','1/3'] # this list determines bottom to top stacking order of correctness - width 
= 0.35 # Width of the bar + width = 0.38 # Width of the bar # Plot side-by-side stacked bars fig, ax = plt.subplots() + fig.set_figwidth(7) for alg_idx, algorithm in enumerate(algorithms): # Filter data for the current algorithm algorithm_data = df[df['Algorithm'] == algorithm] @@ -162,7 +163,7 @@ def correctness_stacked_barchart(): # Label the percentage sums for xpos, total in zip(plot_x, bottom): - ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=10) + ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=11) ax.set_xlabel('Question Type') ax.set_ylabel('% Correct') @@ -170,6 +171,7 @@ def correctness_stacked_barchart(): ax.set_xticks(x) ax.set_xticklabels(question_types) ax.legend(title="# Correct / 3 Attempts", title_fontsize=10, bbox_to_anchor=(1, 1), loc='upper left') + plt.grid(axis='x', which='both', visible=False) plt.tight_layout() plt.savefig(Path(PLOTS, 'correctness_stacked.pdf'), bbox_inches='tight', format='pdf') plt.close() From 2248580f06847273eb3cd6ebdd49e358c967a797 Mon Sep 17 00:00:00 2001 From: Harry Li Date: Mon, 2 Dec 2024 15:11:41 -0500 Subject: [PATCH 10/10] updated stacked bar chart visuals --- .../mintaka-wikidata/plot/validation_figures.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py index dbedf84..7bb4eb4 100644 --- a/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py +++ b/src/utils/evaluations/mintaka-wikidata/plot/validation_figures.py @@ -16,7 +16,7 @@ PLOTS = Path(ROOT / 'plots') def percent_formatter(x): - return f'{round(x)}%' + return f'{'{:.1%}'.format(x/100)}' CORRECTNESS_PALETTE = {"LinkQ 0/3": '#999999', "LinkQ 1/3": '#c8ddec', "LinkQ 2/3": '#72aad0', "LinkQ 3/3": '#1f78b4', "GPT-4 0/3": '#999999', "GPT-4 1/3": '#fff4e5', "GPT-4 2/3": '#ffdeb3', "GPT-4 3/3": 
'#fdbf6f'} QUESTION_TYPE_ORDER = ['Comparative', 'Yes/No', 'Generic', 'MultiHop', "Intersection"] @@ -130,7 +130,7 @@ def correctness_stacked_barchart(): # Plot side-by-side stacked bars fig, ax = plt.subplots() - fig.set_figwidth(7) + fig.set_figwidth(8) for alg_idx, algorithm in enumerate(algorithms): # Filter data for the current algorithm algorithm_data = df[df['Algorithm'] == algorithm] @@ -152,6 +152,8 @@ def correctness_stacked_barchart(): width=width, color=color, label=f'{algorithm} {correct}', + edgecolor="black", + linewidth=0.5, bottom=bottom) # for xpos, value, y in zip(plot_x, values, bottom): @@ -163,7 +165,7 @@ def correctness_stacked_barchart(): # Label the percentage sums for xpos, total in zip(plot_x, bottom): - ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=11) + ax.text(x=xpos, y=total + 0.5, s=percent_formatter(total), ha='center', va='bottom', fontsize=9) ax.set_xlabel('Question Type') ax.set_ylabel('% Correct')