diff --git a/evaluation/README.md b/evaluation/README.md
new file mode 100644
index 00000000..a1d0ad70
--- /dev/null
+++ b/evaluation/README.md
@@ -0,0 +1,40 @@
+
+# Evaluations
+
+## LLM Output Evaluator
+
+The `evals.py` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.
+
+It supports batch evaluation via a configuration CSV and produces a detailed metrics report in CSV format.
+
+### Usage
+
+This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
+
+Ensure the `lighteval` library is installed and that any model SDKs (e.g., OpenAI, Anthropic) are configured with the appropriate API keys.
+
+
+```bash
+python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv
+```
+
+The arguments to the script are:
+
+- `--config`: Path to the config CSV file. Must include the columns "Model Name" and "Query".
+- `--reference`: Path to the reference CSV file. Must include a "Context" column; a "Reference" column may also be present for future reference-based metrics, but it is not currently used.
+- `--output`: Path where the evaluation results will be saved.
+
+
+The output CSV contains one row per model/query/context combination. Each row includes the model output along with:
+
+* Extractiveness metrics:
+
+  * Extractiveness Coverage
+  * Extractiveness Density
+  * Extractiveness Compression
+
+* Usage and cost:
+
+  * Token usage (input/output)
+  * Estimated cost in USD
+  * Duration (in seconds)
diff --git a/evaluation/evals.py b/evaluation/evals.py
new file mode 100644
index 00000000..a263d3bc
--- /dev/null
+++ b/evaluation/evals.py
@@ -0,0 +1,156 @@
+"""
+Evaluate LLM outputs using multiple metrics and compute associated costs
+"""
+
+# TODO: Run this script with uv to manage dependencies
+
+# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
+
+import sys
+import os
+
+# Ensure the parent directory is in the path to import ModelFactory
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+import argparse
+import logging
+
+import pandas as pd
+from lighteval.tasks.requests import Doc
+from lighteval.metrics.metrics_sample import Extractiveness
+
+from server.api.services.llm_services import ModelFactory
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+def evaluate_response(
+    model_name: str, query: str, context: str
+) -> pd.DataFrame:
+    """
+    Evaluates a model's response to a given query and context, and computes extractiveness metrics, token usage, and cost
+
+    Args:
+        model_name (str): The name of the model to be used for evaluation.
+        query (str): The user query to be processed.
+        context (str): The context or document content to be used.
+
+    Returns:
+        pd.DataFrame: A single-row DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
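+
+    Example (illustrative only; assumes the relevant API key is configured and
+    the model name is one of the keys in ModelFactory.HANDLERS):
+
+        df = evaluate_response(
+            model_name="GPT_4O_MINI",
+            query="Summarize the key findings.",
+            context="Full text of the paper...",
+        )
+        print(df["Cost (USD)"].iloc[0])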
+    """
+
+    handler = ModelFactory.get_handler(model_name)
+
+    # TODO: Add error handling for unsupported models
+
+    output_text, token_usage, pricing, duration = handler.handle_request(query, context)
+
+    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
+    extractiveness = Extractiveness().compute(
+        formatted_doc=doc, predictions=[output_text]
+    )
+
+    input_cost_dollars = (pricing["input"] / 1_000_000) * token_usage.input_tokens
+    output_cost_dollars = (pricing["output"] / 1_000_000) * token_usage.output_tokens
+
+    total_cost_dollars = input_cost_dollars + output_cost_dollars
+
+    return pd.DataFrame(
+        [
+            {
+                "Output Text": output_text,
+                "Extractiveness Coverage": extractiveness["summarization_coverage"],
+                "Extractiveness Density": extractiveness["summarization_density"],
+                "Extractiveness Compression": extractiveness[
+                    "summarization_compression"
+                ],
+                "Input Token Usage": token_usage.input_tokens,
+                "Output Token Usage": token_usage.output_tokens,
+                "Cost (USD)": total_cost_dollars,
+                "Duration (s)": duration,
+            }
+        ]
+    )
+
+
+if __name__ == "__main__":
+    # TODO: Add CLI argument to specify the metrics to be computed
+    parser = argparse.ArgumentParser(
+        description="Evaluate LLM outputs using multiple metrics and compute associated costs"
+    )
+    parser.add_argument("--config", "-c", required=True, help="Path to config CSV file")
+    parser.add_argument(
+        "--reference", "-r", required=True, help="Path to reference CSV file"
+    )
+    parser.add_argument("--output", "-o", required=True, help="Path to output CSV file")
+
+    args = parser.parse_args()
+
+    df_config = pd.read_csv(args.config)
+    logging.info(f"Config DataFrame shape: {df_config.shape}")
+    logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}")
+
+    # Strip leading/trailing whitespace from column names
+    df_config.columns = df_config.columns.str.strip()
+
+    # Check if the required columns are present
+    # TODO: Make this more flexible by allowing the user to use default instructions
+    required_columns = ["Model Name", "Query"]
+    if not all(col in df_config.columns for col in required_columns):
+        raise ValueError(
+            f"Config DataFrame must contain the following columns: {required_columns}"
+        )
+
+    # Check if all models in the config are supported by ModelFactory
+    if not all(
+        model in ModelFactory.HANDLERS.keys()
+        for model in df_config["Model Name"].unique()
+    ):
+        raise ValueError(
+            f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}"
+        )
+
+    df_reference = pd.read_csv(args.reference)
+    logging.info(f"Reference DataFrame shape: {df_reference.shape}")
+    logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}")
+
+    # Strip leading/trailing whitespace from column names
+    df_reference.columns = df_reference.columns.str.strip()
+
+    # Check if the required columns are present
+    required_columns = ["Context"]
+    if not all(col in df_reference.columns for col in required_columns):
+        raise ValueError(
+            f"Reference DataFrame must contain the following columns: {required_columns}"
+        )
+
+    # Cross join the config and reference DataFrames
+    df_in = df_config.merge(df_reference, how="cross")
+
+    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
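+    # A possible approach for the TODO above (a sketch only; it assumes the model
+    # handlers and their SDK clients are safe to call from multiple threads and
+    # that provider rate limits allow concurrent requests):
+    #
+    #     from concurrent.futures import ThreadPoolExecutor
+    #
+    #     with ThreadPoolExecutor(max_workers=4) as pool:
+    #         frames = pool.map(
+    #             lambda row: evaluate_response(
+    #                 row["Model Name"], row["Query"], row["Context"]
+    #             ),
+    #             (row for _, row in df_in.iterrows()),
+    #         )
+    #     df_evals = pd.concat(frames, ignore_index=True)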
+    df_evals = pd.DataFrame()
+    for index, row in df_in.iterrows():
+        df_evals = pd.concat(
+            [
+                df_evals,
+                evaluate_response(
+                    row["Model Name"], row["Query"], row["Context"]
+                ),
+            ],
+            axis=0,
+        )
+
+        logging.info(f"Processed row {index + 1}/{len(df_in)}")
+
+    # Concatenate the input and evaluations DataFrames
+    df_out = pd.concat(
+        [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
+    )
+
+    df_out.to_csv(args.output, index=False)
+    logging.info(f"Output DataFrame shape: {df_out.shape}")
+    logging.info(f"Results saved to {args.output}")
+    logging.info("Evaluation completed successfully.")
diff --git a/evaluation/test_evals.py b/evaluation/test_evals.py
new file mode 100644
index 00000000..f41817c6
--- /dev/null
+++ b/evaluation/test_evals.py
@@ -0,0 +1,53 @@
+
+from unittest.mock import patch, MagicMock
+
+import pytest
+import pandas as pd
+
+from evals import evaluate_response
+
+class MockTokenUsage:
+    def __init__(self, input_tokens, output_tokens):
+        self.input_tokens = input_tokens
+        self.output_tokens = output_tokens
+
+@patch("evals.ModelFactory.get_handler")
+@patch("evals.Extractiveness.compute")
+def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):
+
+    # Mock BaseModelHandler
+    mock_handler = MagicMock()
+    mock_handler.handle_request.return_value = (
+        "This is a summary.",
+        MockTokenUsage(input_tokens=100, output_tokens=50),
+        {"input": 15.0, "output": 30.0},  # $15 and $30 per 1M tokens
+        1.23,  # duration
+    )
+
+    mock_get_handler.return_value = mock_handler
+
+    mock_extractiveness_compute.return_value = {
+        "summarization_coverage": 0.8,
+        "summarization_density": 1.5,
+        "summarization_compression": 2.0,
+    }
+
+    df = evaluate_response(
+        model_name="mock-model",
+        query="What is the summary?",
+        context="This is a long article about something important.",
+    )
+
+    assert isinstance(df, pd.DataFrame)
+    assert df.shape == (1, 8)
+    assert df["Output Text"].iloc[0] == "This is a summary."
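+
+    # The expected values below mirror the mocked Extractiveness.compute result
+    # and the mocked handler's token usage, pricing, and duration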
+ assert df["Extractiveness Coverage"].iloc[0] == 0.8 + assert df["Extractiveness Density"].iloc[0] == 1.5 + assert df["Extractiveness Compression"].iloc[0] == 2.0 + assert df["Input Token Usage"].iloc[0] == 100 + assert df["Output Token Usage"].iloc[0] == 50 + + expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50 + assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost + assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23 \ No newline at end of file diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py new file mode 100644 index 00000000..049451f1 --- /dev/null +++ b/server/api/services/llm_services.py @@ -0,0 +1,283 @@ +""" +This module contains functions to interact with different AI models +""" + +import os +import time +import logging +from abc import ABC, abstractmethod + +import anthropic +import openai + + +class BaseModelHandler(ABC): + @abstractmethod + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + pass + + +class ClaudeHaiku35CitationsHandler(BaseModelHandler): + MODEL = "claude-3-5-haiku-20241022" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3.5 model with citations enabled + + Args: + query: The user query to be processed + context: The context or document content to be used for citations + + """ + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": True}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure + + text = [] + cited_text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + if "citations" in content.keys(): + text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}>" + for citation in content["citations"] + ] + ) + ) + cited_text.append( + " ".join( + [ + f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}" + for citation in content["citations"] + ] + ) + ) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class ClaudeHaiku3Handler(BaseModelHandler): + MODEL = "claude-3-haiku-20240307" + # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25} + + def __init__(self) -> None: + self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the Claude Haiku 3 model with citations disabled + + Args: + query: The user query to be processed + context: The 
context or document content to be used + + """ + + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + message = self.client.messages.create( + model=self.MODEL, + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + { + "type": "document", + "source": {"type": "content", "content": context}, + "citations": {"enabled": False}, + }, + ], + } + ], + ) + duration = time.time() - start_time + + text = [] + for content in message.to_dict()["content"]: + text.append(content["text"]) + + full_text = " ".join(text) + + return ( + full_text, + message.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT4OMiniHandler(BaseModelHandler): + MODEL = "gpt-4o-mini" + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60} + + def __init__(self) -> None: + self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) + + def handle_request( + self, query: str, context: str + ) -> tuple[str, dict[str, int], dict[str, float], float]: + """ + Handles the request to the GPT-4o Mini model + + Args: + query: The user query to be processed + context: The context or document content to be used + + """ + start_time = time.time() + # TODO: Add error handling for API requests and invalid responses + response = self.client.responses.create( + model=self.MODEL, + instructions=query, + input=context, + ) + duration = time.time() - start_time + + return ( + response.output_text, + response.usage, + self.PRICING_DOLLARS_PER_MILLION_TOKENS, + duration, + ) + + +class GPT41NanoHandler(BaseModelHandler): + MODEL = "gpt-4.1-nano" + + # Model Pricing: https://platform.openai.com/docs/pricing + PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40} + + # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide + + # Long context performance can degrade as more items are required to be retrieved, + # or perform complex reasoning that requires knowledge of the state of the entire context + + INSTRUCTIONS = """ + + # Role and Objective + + - You are a seasoned physician or medical professional who treats patients with bipolar disorder + - You are analyzing medical research by processing peer-reviewed papers to extract key details + + # Instructions + + - Identify rules for medication inclusion or exclusion based on medical history or concerns + + - Only use retrieved context and never rely on your own knowledge for any of these questions. + - Always follow the provided output format for new messages including citations for any factual statements + + # Output Format + + - When providing factual information from retrieved context, always include citations immediately after the relevant statement(s). 
+
+
+    """
+
+    def __init__(self) -> None:
+        self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+    def handle_request(
+        self, query: str, context: str
+    ) -> tuple[str, dict[str, int], dict[str, float], float]:
+        """
+        Handles the request to the GPT-4.1 Nano model
+
+        Args:
+            query: The user query to be processed
+            context: The context or document content to be used
+
+        """
+
+        # If no query is provided, use the default instructions
+        if not query:
+            query = self.INSTRUCTIONS
+
+        start_time = time.time()
+        # TODO: Add error handling for API requests and invalid responses
+        response = self.client.responses.create(
+            model=self.MODEL,
+            instructions=query,
+            input=context,
+        )
+        duration = time.time() - start_time
+
+        return (
+            response.output_text,
+            response.usage,
+            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
+            duration,
+        )
+
+
+class ModelFactory:
+    HANDLERS = {
+        "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler,
+        "CLAUDE_HAIKU_3": ClaudeHaiku3Handler,
+        "GPT_4O_MINI": GPT4OMiniHandler,
+        "GPT_41_NANO": GPT41NanoHandler,
+    }
+
+    # HANDLERS doesn't vary per instance so we can use a class method
+    @classmethod
+    def get_handler(cls, model_name: str) -> BaseModelHandler | None:
+        """
+        Factory method to get the appropriate model handler based on the model name
+
+        Args:
+            model_name (str): The name of the model for which to get the handler.
+
+        Returns:
+            BaseModelHandler: An instance of the appropriate model handler class, or None if the model name is not supported.
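+
+        Example (illustrative; assumes the relevant API key environment variable is set for the chosen handler):
+
+            handler = ModelFactory.get_handler("GPT_4O_MINI")
+            text, usage, pricing, duration = handler.handle_request(
+                "Summarize the key findings.", "Full document text..."
+            )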