diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 00000000..a1d0ad70
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,40 @@
+
+# Evaluations
+
+## LLM Output Evaluator
+
+The `evals.py` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.
+
+It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format.
+
+### Usage
+
+This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
+
+Ensure the `lighteval` library is installed and that any model SDKs (e.g., OpenAI, Anthropic) are configured with the appropriate API keys.
+
+
+```bash
+python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv
+```
+
+The arguments to the script are:
+
+- `--config`: Path to the config CSV file. Must include the columns "Model Name" and "Query".
+- `--reference`: Path to the reference CSV file. Must include a "Context" column; a "Reference" column may also be present for future reference-based metrics, but it is not currently used.
+- `--output`: Path where the evaluation results will be saved.
+
+
+The output CSV contains one row per model/query/context combination. Each row includes the model output along with:
+
+* Extractiveness metrics:
+
+  * Extractiveness Coverage
+  * Extractiveness Density
+  * Extractiveness Compression
+
+* Usage and cost:
+
+  * Token usage (input/output)
+  * Estimated cost in USD
+  * Duration (in seconds)
diff --git a/evaluation/evals.py b/evaluation/evals.py
new file mode 100644
index 00000000..a263d3bc
--- /dev/null
+++ b/evaluation/evals.py
@@ -0,0 +1,156 @@
+"""
+Evaluate LLM outputs using multiple metrics and compute associated costs
+"""
+
+# TODO: Run this script with uv to manage dependencies
+
+# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
+
+import sys
+import os
+
+# Ensure the parent directory is in the path to import ModelFactory
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import argparse
+import logging
+
+import pandas as pd
+from lighteval.tasks.requests import Doc
+from lighteval.metrics.metrics_sample import Extractiveness
+
+from server.api.services.llm_services import ModelFactory
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+def evaluate_response(
+    model_name: str, query: str, context: str
+) -> pd.DataFrame:
+    """
+    Evaluates a model's response to a given query and context, and computes extractiveness metrics, token usage, and cost
+
+    Args:
+        model_name (str): The name of the model to be used for evaluation.
+        query (str): The user query to be processed.
+        context (str): The context or document content to be used.
+
+    Returns:
+        pd.DataFrame: A single-row DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
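+
+    Example (illustrative only; assumes the relevant API key is configured and
+    the model name is one of the keys in ModelFactory.HANDLERS):
+
+        df = evaluate_response(
+            model_name="GPT_4O_MINI",
+            query="Summarize the key findings.",
+            context="Full text of the paper...",
+        )
+        print(df["Cost (USD)"].iloc[0])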
+    """
+
+    handler = ModelFactory.get_handler(model_name)
+
+    # TODO: Add error handling for unsupported models
+
+    output_text, token_usage, pricing, duration = handler.handle_request(query, context)
+
+    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
+    extractiveness = Extractiveness().compute(
+        formatted_doc=doc, predictions=[output_text]
+    )
+
+    input_cost_dollars = (pricing["input"] / 1_000_000) * token_usage.input_tokens
+    output_cost_dollars = (pricing["output"] / 1_000_000) * token_usage.output_tokens
+
+    total_cost_dollars = input_cost_dollars + output_cost_dollars
+
+    return pd.DataFrame(
+        [
+            {
+                "Output Text": output_text,
+                "Extractiveness Coverage": extractiveness["summarization_coverage"],
+                "Extractiveness Density": extractiveness["summarization_density"],
+                "Extractiveness Compression": extractiveness[
+                    "summarization_compression"
+                ],
+                "Input Token Usage": token_usage.input_tokens,
+                "Output Token Usage": token_usage.output_tokens,
+                "Cost (USD)": total_cost_dollars,
+                "Duration (s)": duration,
+            }
+        ]
+    )
+
+
+if __name__ == "__main__":
+    # TODO: Add CLI argument to specify the metrics to be computed
+    parser = argparse.ArgumentParser(
+        description="Evaluate LLM outputs using multiple metrics and compute associated costs"
+    )
+    parser.add_argument("--config", "-c", required=True, help="Path to config CSV file")
+    parser.add_argument(
+        "--reference", "-r", required=True, help="Path to reference CSV file"
+    )
+    parser.add_argument("--output", "-o", required=True, help="Path to output CSV file")
+
+    args = parser.parse_args()
+
+    df_config = pd.read_csv(args.config)
+    logging.info(f"Config DataFrame shape: {df_config.shape}")
+    logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}")
+
+    # Strip leading/trailing whitespace from column names
+    df_config.columns = df_config.columns.str.strip()
+
+    # Check if the required columns are present
+    # TODO: Make this more flexible by allowing the user to use default instructions
+    required_columns = ["Model Name", "Query"]
+    if not all(col in df_config.columns for col in required_columns):
+        raise ValueError(
+            f"Config DataFrame must contain the following columns: {required_columns}"
+        )
+
+    # Check if all models in the config are supported by ModelFactory
+    if not all(
+        model in ModelFactory.HANDLERS.keys()
+        for model in df_config["Model Name"].unique()
+    ):
+        raise ValueError(
+            f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}"
+        )
+
+    df_reference = pd.read_csv(args.reference)
+    logging.info(f"Reference DataFrame shape: {df_reference.shape}")
+    logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}")
+
+    # Strip leading/trailing whitespace from column names
+    df_reference.columns = df_reference.columns.str.strip()
+
+    # Check if the required columns are present
+    required_columns = ["Context"]
+    if not all(col in df_reference.columns for col in required_columns):
+        raise ValueError(
+            f"Reference DataFrame must contain the following columns: {required_columns}"
+        )
+
+    # Cross join the config and reference DataFrames
+    df_in = df_config.merge(df_reference, how="cross")
+
+    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
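+    # A possible approach for the TODO above (a sketch only; it assumes the model
+    # handlers and their SDK clients are safe to call from multiple threads and
+    # that provider rate limits allow concurrent requests):
+    #
+    #     from concurrent.futures import ThreadPoolExecutor
+    #
+    #     with ThreadPoolExecutor(max_workers=4) as pool:
+    #         frames = pool.map(
+    #             lambda row: evaluate_response(
+    #                 row["Model Name"], row["Query"], row["Context"]
+    #             ),
+    #             (row for _, row in df_in.iterrows()),
+    #         )
+    #     df_evals = pd.concat(frames, ignore_index=True)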
+    df_evals = pd.DataFrame()
+    for index, row in df_in.iterrows():
+        df_evals = pd.concat(
+            [
+                df_evals,
+                evaluate_response(
+                    row["Model Name"], row["Query"], row["Context"]
+                ),
+            ],
+            axis=0,
+        )
+
+        logging.info(f"Processed row {index + 1}/{len(df_in)}")
+
+    # Concatenate the input and evaluations DataFrames
+    df_out = pd.concat(
+        [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
+    )
+
+    df_out.to_csv(args.output, index=False)
+    logging.info(f"Output DataFrame shape: {df_out.shape}")
+    logging.info(f"Results saved to {args.output}")
+    logging.info("Evaluation completed successfully.")
diff --git a/evaluation/test_evals.py b/evaluation/test_evals.py
new file mode 100644
index 00000000..f41817c6
--- /dev/null
+++ b/evaluation/test_evals.py
@@ -0,0 +1,53 @@
+
+from unittest.mock import patch, MagicMock
+
+import pytest
+import pandas as pd
+
+from evals import evaluate_response
+
+class MockTokenUsage:
+    def __init__(self, input_tokens, output_tokens):
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+
+@patch("evals.ModelFactory.get_handler")
+@patch("evals.Extractiveness.compute")
+def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):
+
+    # Mock BaseModelHandler
+    mock_handler = MagicMock()
+    mock_handler.handle_request.return_value = (
+        "This is a summary.",
+        MockTokenUsage(input_tokens=100, output_tokens=50),
+        {"input": 15.0, "output": 30.0},  # $15 and $30 per 1M tokens
+        1.23,  # duration
+    )
+
+    mock_get_handler.return_value = mock_handler
+
+    mock_extractiveness_compute.return_value = {
+        "summarization_coverage": 0.8,
+        "summarization_density": 1.5,
+        "summarization_compression": 2.0,
+    }
+
+    df = evaluate_response(
+        model_name="mock-model",
+        query="What is the summary?",
+        context="This is a long article about something important.",
+    )
+
+    assert isinstance(df, pd.DataFrame)
+    assert df.shape == (1, 8)
+    assert df["Output Text"].iloc[0] == "This is a summary."
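+
+    # The expected values below mirror the mocked Extractiveness.compute result
+    # and the mocked handler's token usage, pricing, and duration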
+ assert df["Extractiveness Coverage"].iloc[0] == 0.8 + assert df["Extractiveness Density"].iloc[0] == 1.5 + assert df["Extractiveness Compression"].iloc[0] == 2.0 + assert df["Input Token Usage"].iloc[0] == 100 + assert df["Output Token Usage"].iloc[0] == 50 + + expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50 + assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost + assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23 \ No newline at end of file diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py new file mode 100644 index 00000000..049451f1 --- /dev/null +++ b/server/api/services/llm_services.py @@ -0,0 +1,283 @@ +""" +This module contains functions to interact with different AI models +""" + +import os +import time +import logging +from abc import ABC, abstractmethod + +import anthropic +import openai + + +class BaseModelHandler(ABC): + @abstractmethod + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + pass + + +class ClaudeHaiku35CitationsHandler(BaseModelHandler): + MODEL = "claude-3-5-haiku-20241022" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3.5 model with citations enabled + + Args: + query: The user query to be processed + context: The context or document content to be used for citations + + """ + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": True}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure + + text = [] + cited_text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + if "citations" in content.keys(): + text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}>" + for citation in content["citations"] + ] + ) + ) + cited_text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}" + for citation in content["citations"] + ] + ) + ) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class ClaudeHaiku3Handler(BaseModelHandler): + MODEL = "claude-3-haiku-20240307" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3 model with citations disabled + + Args: + query: The user query to be processed + context: The 
context or document content to be used + + """ + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": False}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT4OMiniHandler(BaseModelHandler): + MODEL = "gpt-4o-mini" + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} + + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4o Mini model + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + response = self.client.responses.create( + model=self.MODEL, + instructions=query, + input=context, + ) + duration = time.time() - start_time + + return ( + response.output_text, + response.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT41NanoHandler(BaseModelHandler): + MODEL = "gpt-4.1-nano" + + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40} + + # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide + + # Long context performance can degrade as more items are required to be retrieved, + # or perform complex reasoning that requires knowledge of the state of the entire context + + INSTRUCTIONS = """ + + # Role and Objective + + - You are a seasoned physician or medical professional who treats patients with bipolar disorder + - You are analyzing medical research by processing peer-reviewed papers to extract key details + + # Instructions + + - Identify rules for medication inclusion or exclusion based on medical history or concerns + + - Only use retrieved context and never rely on your own knowledge for any of these questions. + - Always follow the provided output format for new messages including citations for any factual statements + + # Output Format + + - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). 
+
+
+    """
+
+    def __init__(self) -> None:
+        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+    def handle_request(
+        self, query: str, context: str
+    ) -> tuple[str, dict[str, int], dict[str, float], float]:
+        """
+        Handles the request to the GPT-4.1 Nano model
+
+        Args:
+            query: The user query to be processed
+            context: The context or document content to be used
+
+        """
+
+        # If no query is provided, use the default instructions
+        if not query:
+            query = self.INSTRUCTIONS
+
+        start_time = time.time()
+        # TODO: Add error handling for API requests and invalid responses
+        response = self.client.responses.create(
+            model=self.MODEL,
+            instructions=query,
+            input=context,
+        )
+        duration = time.time() - start_time
+
+        return (
+            response.output_text,
+            response.usage,
+            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
+            duration,
+        )
+
+
+class ModelFactory:
+    HANDLERS = {
+        "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler,
+        "CLAUDE_HAIKU_3": ClaudeHaiku3Handler,
+        "GPT_4O_MINI": GPT4OMiniHandler,
+        "GPT_41_NANO": GPT41NanoHandler,
+    }
+
+    # HANDLERS doesn't vary per instance so we can use a class method
+    @classmethod
+    def get_handler(cls, model_name: str) -> BaseModelHandler | None:
+        """
+        Factory method to get the appropriate model handler based on the model name
+
+        Args:
+            model_name (str): The name of the model for which to get the handler.
+
+        Returns:
+            BaseModelHandler: An instance of the appropriate model handler class, or None if the model name is not supported.
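+
+        Example (illustrative; assumes the relevant API key environment variable is set for the chosen handler):
+
+            handler = ModelFactory.get_handler("GPT_4O_MINI")
+            text, usage, pricing, duration = handler.handle_request(
+                "Summarize the key findings.", "Full document text..."
+            )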