diff --git a/docs/docs.json b/docs/docs.json
index 9b69ae9af8..a865ca8043 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -36,7 +36,8 @@
         "gateway/prompt-integration",
         "features/advanced-usage/llm-security",
         "features/advanced-usage/moderations",
-        "features/advanced-usage/retries"
+        "features/advanced-usage/retries",
+        "features/advanced-usage/token-limit-exception-handlers"
       ]
     },
     {
diff --git a/docs/features/advanced-usage/token-limit-exception-handlers.mdx b/docs/features/advanced-usage/token-limit-exception-handlers.mdx
new file mode 100644
index 0000000000..d54c4a5b4e
--- /dev/null
+++ b/docs/features/advanced-usage/token-limit-exception-handlers.mdx
@@ -0,0 +1,122 @@
+---
+title: "Token Limit Exception Handlers"
+sidebarTitle: "Large Context"
+description: "Automatically handle requests that exceed a model's context window using truncate, middle-out, or fallback strategies."
+"twitter:title": "Large Context - Token Limit Exception Handlers"
+---
+
+import QuestionsSection from "/snippets/questions-section.mdx";
+
+When prompts get large, requests can exceed the model's maximum context window. Helicone can automatically apply strategies to keep your request within limits or switch to a fallback model — without changing your app code.
+
+## What This Does
+
+- Estimates tokens for your request based on model and content
+- Accounts for reserved output tokens (e.g., `max_tokens`, `max_output_tokens`)
+- Applies a chosen strategy only when the estimated input exceeds the allowed context
+
+Helicone uses provider-aware heuristics to estimate tokens; estimation is best-effort across different request shapes.
+
+## Strategies
+
+- Truncate (`truncate`): Normalize and trim message content to reduce the token count.
+- Middle-out (`middle-out`): Preserve the beginning and end of messages while trimming middle content to fit the limit.
+- Fallback (`fallback`): Switch to an alternate model when the request is too large. Provide multiple candidates in the request body's `model` field as a comma-separated list (the first is the primary, the second is the fallback).
+
+For `fallback`, Helicone picks the second candidate when the request exceeds the limit. When under the limit, Helicone normalizes the `model` to the primary. If your body lacks `model`, set `Helicone-Model-Override`.
+
+## Quick Start
+
+Add the `Helicone-Token-Limit-Exception-Handler` header to enable a strategy.
+
+```typescript Node.js
+import { OpenAI } from "openai";
+
+const client = new OpenAI({
+  baseURL: "https://ai-gateway.helicone.ai/v1",
+  apiKey: process.env.HELICONE_API_KEY,
+});
+
+// Middle-out strategy
+await client.chat.completions.create(
+  {
+    model: "gpt-4o", // or "gpt-4o, gpt-4o-mini" for fallback
+    messages: [
+      { role: "user", content: "A very long prompt ..." }
} + ], + max_tokens: 256 + }, + { + headers: { + "Helicone-Token-Limit-Exception-Handler": "middle-out" + } + } +); +``` + +```python Python +from openai import OpenAI +import os + +client = OpenAI( + base_url="https://ai-gateway.helicone.ai/v1", + api_key=os.getenv("HELICONE_API_KEY"), +) + +# Fallback strategy with model candidates +resp = client.chat.completions.create( + model="gpt-4o, gpt-4o-mini", + messages=[{"role": "user", "content": "A very long prompt ..."}], + max_tokens=256, + extra_headers={ + "Helicone-Token-Limit-Exception-Handler": "fallback", + } +) +``` + +```bash cURL +curl --request POST \ + --url https://ai-gateway.helicone.ai/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $HELICONE_API_KEY" \ + -H "Helicone-Token-Limit-Exception-Handler: truncate" \ + --data '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "A very long prompt ..."}], + "max_tokens": 256 + }' +``` + + + +## Configuration + +Enable and control via headers: + + + One of: `truncate`, `middle-out`, `fallback`. + + + + Optional. Used for token estimation and model selection when the request body doesn't include a `model` or you need to override it. + + +### Fallback Model Selection + +- Provide candidates in the body: `model: "primary, fallback"` +- Helicone chooses the fallback when input exceeds the allowed context +- When under the limit, Helicone normalizes the `model` to the primary + +## Notes + +- Token estimation is heuristic and provider-aware; behavior is best-effort across request shapes. +- Allowed context accounts for requested completion tokens (e.g., `max_tokens`). +- Changes are applied before the provider call; your logged request reflects the applied strategy. + + + diff --git a/docs/helicone-headers/header-directory.mdx b/docs/helicone-headers/header-directory.mdx index 0233151151..c5f330aa6f 100644 --- a/docs/helicone-headers/header-directory.mdx +++ b/docs/helicone-headers/header-directory.mdx @@ -150,6 +150,17 @@ The URL to proxy the request to when using _gateway.helicone.ai_. For example, ` Whether to exclude the request from the response. Set to `true` or `false`. + + Control how Helicone handles requests that would exceed a model's context window. Accepted values: + + - `truncate` — Best-effort normalization and trimming of message content to reduce token count. + - `middle-out` — Preserve the beginning and end of messages while removing middle content to fit within the limit. Uses token estimation to keep high-value context. + - `fallback` — Switch to an alternate model when input exceeds the context limit. Provide multiple candidates in the request body's `model` field as a comma-separated list (e.g., `"gpt-4o, gpt-4o-mini"`). Helicone picks the second model as the fallback when needed. When under the limit, Helicone normalizes the `model` field to the primary model. + + + If your request body does not include a `model` or you need to override it for estimation, set `Helicone-Model-Override`. For fallbacks, specify multiple `model` candidates in the body; only the first two are considered. + + Whether to cache your responses. Set to `true` or `false`. You can customize the behavior of the cache feature by setting additional headers in your request. 
diff --git a/worker/src/lib/models/HeliconeHeaders.ts b/worker/src/lib/models/HeliconeHeaders.ts index 117fce12ab..91753312d2 100644 --- a/worker/src/lib/models/HeliconeHeaders.ts +++ b/worker/src/lib/models/HeliconeHeaders.ts @@ -12,6 +12,11 @@ export type HeliconeFallback = { }; export type HeliconeBearerKeyType = "standard" | "rate-limited"; +export enum HeliconeTokenLimitExceptionHandler { + Truncate = "truncate", + MiddleOut = "middle-out", + Fallback = "fallback", +} export interface IHeliconeHeaders { heliconeAuth: Nullable; @@ -58,6 +63,7 @@ export interface IHeliconeHeaders { omitResponse: boolean; omitRequest: boolean; }; + tokenLimitExceptionHandler: Nullable; sessionHeaders: { sessionId: Nullable; path: Nullable; @@ -125,6 +131,7 @@ export class HeliconeHeaders implements IHeliconeHeaders { promptName: Nullable; userId: Nullable; omitHeaders: { omitResponse: boolean; omitRequest: boolean }; + tokenLimitExceptionHandler: Nullable; sessionHeaders: { sessionId: Nullable; path: Nullable; @@ -178,6 +185,8 @@ export class HeliconeHeaders implements IHeliconeHeaders { }; this.promptName = heliconeHeaders.promptName; this.omitHeaders = heliconeHeaders.omitHeaders; + this.tokenLimitExceptionHandler = + heliconeHeaders.tokenLimitExceptionHandler; this.sessionHeaders = heliconeHeaders.sessionHeaders; this.userId = heliconeHeaders.userId; this.heliconeProperties = this.getHeliconeProperties(heliconeHeaders); @@ -364,6 +373,7 @@ export class HeliconeHeaders implements IHeliconeHeaders { omitResponse: this.headers.get("Helicone-Omit-Response") === "true", omitRequest: this.headers.get("Helicone-Omit-Request") === "true", }, + tokenLimitExceptionHandler: this.getTokenLimitExceptionHandler(), sessionHeaders: { sessionId: this.headers.get("Helicone-Session-Id") ?? null, path: this.headers.get("Helicone-Session-Path") ?? 
null, @@ -403,6 +413,25 @@ export class HeliconeHeaders implements IHeliconeHeaders { }; } + private getTokenLimitExceptionHandler(): Nullable { + const handler = this.headers.get("Helicone-Token-Limit-Exception-Handler"); + if (!handler) { + return null; + } + + const normalized = handler.toLowerCase(); + switch (normalized) { + case HeliconeTokenLimitExceptionHandler.Truncate: + return HeliconeTokenLimitExceptionHandler.Truncate; + case HeliconeTokenLimitExceptionHandler.MiddleOut: + return HeliconeTokenLimitExceptionHandler.MiddleOut; + case HeliconeTokenLimitExceptionHandler.Fallback: + return HeliconeTokenLimitExceptionHandler.Fallback; + default: + return null; + } + } + private getRetryHeaders(): IHeliconeHeaders["retryHeaders"] { const retryEnabled = this.headers.get("helicone-retry-enabled"); if (retryEnabled === null) { diff --git a/worker/src/lib/models/HeliconeProxyRequest.ts b/worker/src/lib/models/HeliconeProxyRequest.ts index a70a70e3f6..fb95478a8c 100644 --- a/worker/src/lib/models/HeliconeProxyRequest.ts +++ b/worker/src/lib/models/HeliconeProxyRequest.ts @@ -5,7 +5,10 @@ import { approvedDomains } from "@helicone-package/cost/providers/mappings"; import { RequestWrapper } from "../RequestWrapper"; import { buildTargetUrl } from "../clients/ProviderClient"; import { Result, ok } from "../util/results"; -import { IHeliconeHeaders } from "./HeliconeHeaders"; +import { + HeliconeTokenLimitExceptionHandler, + IHeliconeHeaders, +} from "./HeliconeHeaders"; import { parseJSXObject } from "@helicone/prompts"; import { TemplateWithInputs } from "@helicone/prompts/dist/objectParser"; @@ -26,6 +29,16 @@ export type RetryOptions = { export type HeliconeProperties = Record; type Nullable = T | null; +import { + applyFallbackStrategy, + applyMiddleOutStrategy, + applyTruncateStrategy, + estimateTokenCount, + getModelTokenLimit, + parseRequestPayload, + resolvePrimaryModel, +} from "../util/tokenLimitException"; + // This neatly formats and holds all of the state that a request can come into Helicone export interface HeliconeProxyRequest { provider: Provider; @@ -162,6 +175,16 @@ export class HeliconeProxyRequestMapper { body = await this.request.unsafeGetBodyText(); } + // Apply token limit exception handler here and update buffer if changed + const bodyWithTokenLimitExceptionHandler = + this.applyTokenLimitExceptionHandler(body); + if (typeof bodyWithTokenLimitExceptionHandler === "string") { + body = bodyWithTokenLimitExceptionHandler; + await this.request.requestBodyBuffer.tempSetBody( + bodyWithTokenLimitExceptionHandler + ); + } + return { data: { heliconePromptTemplate: await this.getHeliconeTemplate(), @@ -197,6 +220,90 @@ export class HeliconeProxyRequestMapper { }; } + public applyTokenLimitExceptionHandler( + body: ValidRequestBody + ): ValidRequestBody | undefined { + const handler = this.request.heliconeHeaders.tokenLimitExceptionHandler; + if (!handler) { + return; + } + + const parsedBody = parseRequestPayload(body); + if (!parsedBody) { + return; + } + + const primaryModel = resolvePrimaryModel( + parsedBody, + this.request.heliconeHeaders.modelOverride + ); + const estimatedTokens = estimateTokenCount(parsedBody, primaryModel); + + if (!primaryModel) { + return; + } + + const modelContextLimit = getModelTokenLimit(this.provider, primaryModel); + + // Extract requested completion/output limit directly here (provider-agnostic best-effort) + const anyBody = parsedBody as any; + const completionCandidates: Array = [ + anyBody?.max_completion_tokens, + anyBody?.max_tokens, + 
anyBody?.max_output_tokens,
+      anyBody?.maxOutputTokens,
+      anyBody?.response?.max_tokens,
+      anyBody?.response?.max_output_tokens,
+      anyBody?.response?.maxOutputTokens,
+      anyBody?.generation_config?.max_output_tokens,
+      anyBody?.generation_config?.maxOutputTokens,
+      anyBody?.generationConfig?.max_output_tokens,
+      anyBody?.generationConfig?.maxOutputTokens,
+    ];
+    const requestedCompletionTokens = (() => {
+      for (const val of completionCandidates) {
+        if (typeof val === "number" && Number.isFinite(val) && val > 0) {
+          return Math.floor(val);
+        }
+      }
+      return 0;
+    })();
+    const tokenLimit =
+      modelContextLimit === null
+        ? null
+        : Math.max(
+            0,
+            modelContextLimit -
+              (requestedCompletionTokens || modelContextLimit * 0.1)
+          );
+
+    if (
+      estimatedTokens === null ||
+      tokenLimit === null ||
+      (estimatedTokens <= tokenLimit &&
+        handler != HeliconeTokenLimitExceptionHandler.Fallback) // fallback must still run below the limit so the comma-separated model list is normalized to the primary
+    ) {
+      return;
+    }
+
+    // TODO: Add some indicator as to what was applied so users understand why their request looks different
+    switch (handler) {
+      case HeliconeTokenLimitExceptionHandler.Truncate:
+        return applyTruncateStrategy(parsedBody);
+      case HeliconeTokenLimitExceptionHandler.MiddleOut:
+        return applyMiddleOutStrategy(parsedBody, primaryModel, tokenLimit);
+      case HeliconeTokenLimitExceptionHandler.Fallback:
+        return applyFallbackStrategy(
+          parsedBody,
+          primaryModel,
+          estimatedTokens,
+          tokenLimit
+        );
+      default:
+        return;
+    }
+  }
+
   private validateApiConfiguration(api_base: string | undefined): boolean {
     return (
       api_base === undefined ||
diff --git a/worker/src/lib/util/tokenLimitException.ts b/worker/src/lib/util/tokenLimitException.ts
new file mode 100644
index 0000000000..f6f17ca422
--- /dev/null
+++ b/worker/src/lib/util/tokenLimitException.ts
@@ -0,0 +1,714 @@
+import { Provider } from "@helicone-package/llm-mapper/types";
+import { registry } from "@helicone-package/cost/models/registry";
+import { heliconeProviderToModelProviderName } from "@helicone-package/cost/models/provider-helpers";
+import type { ModelProviderName } from "@helicone-package/cost/models/providers";
+import type { ModelProviderConfig } from "@helicone-package/cost/models/types";
+import { ValidRequestBody } from "../../RequestBodyBuffer/IRequestBodyBuffer";
+
+export type LLMMessage = {
+  role?: string;
+  content?: unknown;
+  [key: string]: unknown;
+};
+
+export type ParsedRequestPayload = {
+  model?: string;
+  messages?: LLMMessage[];
+  tools?: unknown;
+};
+
+const DEFAULT_TOKEN_HEURISTIC = 0.25;
+
+const MODEL_TOKEN_HEURISTICS: Record<string, number> = {
+  "gpt-4o": 0.25,
+  "gpt-3.5-turbo": 0.2,
+  "gpt-4o-mini": 0.25,
+  "gpt-4o-nano": 0.15,
+  "gpt-o3": 0.25,
+};
+
+const NORMALIZATION_PATTERNS: Array<[RegExp, string]> = [
+  [/<[^>]*>/g, ""],
+  [/\b(id|uuid):[a-f0-9-]{36}\b/gi, ""],
+  [/\s*,\s*/g, ","],
+  [/\s*\.\s*/g, "."],
+  [/\s*:\s*/g, ":"],
+  [/\s*;\s*/g, ";"],
+  [/\s*\(\s*/g, "("],
+  [/\s*\)\s*/g, ")"],
+  [/\s*\{\s*/g, "{"],
+  [/\s*\}\s*/g, "}"],
+  [/\s*\[\s*/g, "["],
+  [/\s*\]\s*/g, "]"],
+  [/\s*=\s*/g, "="],
+  [/\s*>\s*/g, ">"],
+  [/\s*<\s*/g, "<"],
+];
+
+export function truncateAndNormalizeText(
+  input: string | null | undefined
+): string {
+  if (!input) {
+    return "";
+  }
+
+  let normalized = input;
+
+  for (const [pattern, replacement] of NORMALIZATION_PATTERNS) {
+    normalized = normalized.replace(pattern, replacement);
+  }
+
+  normalized = normalized.replace(/\s+/g, " ").trim();
+
+  return normalized;
+}
+
+export function middleOutMessagesToFitLimit<T extends LLMMessage>(
+  messages: T[],
maxTokens: number, + estimateTokens: (candidate: T[]) => number | null +): T[] { + if (!Array.isArray(messages) || messages.length === 0) { + return []; + } + + if (!Number.isFinite(maxTokens) || maxTokens <= 0) { + return messages.slice(0, Math.min(messages.length, 1)); + } + + type Chunk = { + messageIndex: number; + order: number; + content: string; + }; + + const original: T[] = messages.slice(); + + const DEFAULT_CHUNK_SIZE = 1000; + const DEFAULT_CHUNK_OVERLAP = 0; + // Important: avoid char-level splitting (""), which explodes chunk counts + // and severely hurts performance for large inputs. Keep word/line separators only. + const DEFAULT_SEPARATORS = ["\n\n", "\n", ".", " "]; + + function splitTextRecursive( + text: string, + chunkSize: number = DEFAULT_CHUNK_SIZE, + chunkOverlap: number = DEFAULT_CHUNK_OVERLAP, + separators: string[] = DEFAULT_SEPARATORS + ): string[] { + if (chunkSize <= 0) return [text]; + if (text.length <= chunkSize) return [text]; + + let chosenSep = separators.find((s) => s !== "" && text.includes(s)); + if (chosenSep === undefined) + chosenSep = separators[separators.length - 1] ?? " "; + + const splits = text.split(chosenSep); + const chunks: string[] = []; + const joiner = chosenSep; + + let current: string[] = []; + let currentLen = 0; + + for (const piece of splits) { + const extra = current.length > 0 && joiner ? joiner.length : 0; + if (currentLen + extra + piece.length > chunkSize && current.length > 0) { + const chunk = current.join(joiner); + if (chunk.length > chunkSize) { + const nextSeps = separators.slice( + Math.max(0, separators.indexOf(chosenSep) + 1) + ); + const subs = splitTextRecursive( + chunk, + chunkSize, + chunkOverlap, + nextSeps + ); + chunks.push(...subs); + } else { + chunks.push(chunk); + } + + if (chunkOverlap > 0) { + let remaining = chunkOverlap; + const overlapped: string[] = []; + for (let i = current.length - 1; i >= 0 && remaining > 0; i--) { + const token = current[i]; + const tokenLen = + token.length + (i > 0 && joiner ? 
joiner.length : 0); + overlapped.unshift(token); + remaining -= tokenLen; + } + current = overlapped; + currentLen = overlapped.join(joiner).length; + } else { + current = []; + currentLen = 0; + } + } + + if (piece.length > 0) { + if (current.length > 0 && joiner) currentLen += joiner.length; + current.push(piece); + currentLen += piece.length; + } + } + + if (current.length > 0) { + const chunk = current.join(joiner); + if (chunk.length > chunkSize) { + const nextSeps = separators.slice( + Math.max(0, separators.indexOf(chosenSep) + 1) + ); + const subs = splitTextRecursive( + chunk, + chunkSize, + chunkOverlap, + nextSeps + ); + chunks.push(...subs); + } else { + chunks.push(chunk); + } + } + + return chunks; + } + + const chunks: Chunk[] = []; + const stringMessageIndexes = new Set(); + + for (let i = 0; i < original.length; i++) { + const m = original[i]; + if (typeof m?.content === "string" && m.content.length > 0) { + stringMessageIndexes.add(i); + const parts = splitTextRecursive(m.content); + for (let order = 0; order < parts.length; order++) { + chunks.push({ messageIndex: i, order, content: parts[order] }); + } + } + } + + if (chunks.length === 0) { + const working = original.slice(); + let currentEstimate = estimateTokens(working); + if (currentEstimate === null || currentEstimate <= maxTokens) { + return working; + } + + while (working.length > 2) { + const middleIndex = Math.floor(working.length / 2); + working.splice(middleIndex, 1); + currentEstimate = estimateTokens(working); + if (currentEstimate === null || currentEstimate <= maxTokens) { + break; + } + } + return working; + } + + function buildMessagesFromKept(kept: Set): T[] { + const byMessage = new Map(); + for (let idx = 0; idx < chunks.length; idx++) { + if (!kept.has(idx)) continue; + const c = chunks[idx]; + if (!byMessage.has(c.messageIndex)) byMessage.set(c.messageIndex, []); + byMessage.get(c.messageIndex)!.push(c.content); + } + + return original.map((m, i) => { + const clone = { ...(m as any) } as T; + if (typeof clone?.content === "string") { + const parts = byMessage.get(i) ?? []; + (clone as any).content = parts.join(""); + } + return clone; + }); + } + + function buildMessagesWithOnlyChunk(chunkIndex: number): T[] { + const target = chunks[chunkIndex]; + return original.map((m, i) => { + const clone = { ...(m as any) } as T; + if (typeof clone?.content === "string") { + (clone as any).content = + i === target.messageIndex ? target.content : ""; + } + return clone; + }); + } + + // Compute a per-chunk weight once using a simple proportional heuristic + // derived from a single full estimate call. + const baseTokens = estimateTokens([]) ?? 0; + // Total content characters across all chunks + const totalChars = chunks.reduce((acc, c) => acc + c.content.length, 0); + // Approximate total tokens if we kept everything (one estimate call on full content) + const allKept = new Set(chunks.map((_, i) => i)); + const fullEstimate = estimateTokens(buildMessagesFromKept(allKept)); + + // If estimation failed, fall back to current messages (no trimming) + if (fullEstimate === null) { + return buildMessagesFromKept(allKept); + } + + // If already within budget, return original + if (fullEstimate <= maxTokens) { + return buildMessagesFromKept(allKept); + } + + const budgetForChunks = Math.max(0, maxTokens - baseTokens); + const contentTokens = Math.max(0, fullEstimate - baseTokens); + const tokensPerChar = totalChars > 0 ? 
contentTokens / totalChars : 0; + + // Build weights for each chunk once + const weights = chunks.map((c) => { + const raw = Math.floor(tokensPerChar * c.content.length); + // Ensure non-empty chunks have at least weight 1 when there is content budget + if (contentTokens > 0 && c.content.length > 0) { + return Math.max(1, raw); + } + return Math.max(0, raw); + }); + + // Early fallback: if budget can't even cover any content tokens, keep nothing from content + if (budgetForChunks <= 0 || contentTokens <= 0 || totalChars === 0) { + const keptNone = new Set(); + const finalNone = buildMessagesFromKept(keptNone).filter((m) => { + if (typeof (m as any)?.content === "string") { + return ((m as any).content as string).length > 0; + } + return true; + }); + return finalNone as T[]; + } + + // Simple contiguous middle removal: remove a centered window until we've + // removed at least the number of tokens we need to cut. Keep the rest. + const n = weights.length; + const kept = new Set(); + const cutTokens = Math.max(0, contentTokens - budgetForChunks); + if (n === 0 || cutTokens <= 0) { + // Nothing to remove + for (let i = 0; i < n; i++) kept.add(i); + } else { + // Centered window [L, R) to remove + let L = Math.floor((n - 1) / 2); + let R = L + 1; + let removed = 0; + + // Optionally include the very center chunk for odd lengths + removed += weights[L]; + L -= 1; + + let takeRight = true; + while (removed < cutTokens && (L >= 0 || R < n)) { + if (takeRight && R < n) { + removed += weights[R]; + R += 1; + } else if (L >= 0) { + removed += weights[L]; + L -= 1; + } else if (R < n) { + removed += weights[R]; + R += 1; + } + takeRight = !takeRight; + } + + // Keep everything outside the removal window + for (let i = 0; i < Math.max(0, L + 1); i++) kept.add(i); + for (let i = Math.min(n, R); i < n; i++) kept.add(i); + } + + const finalMessages = buildMessagesFromKept(kept).filter((m) => { + if (typeof (m as any)?.content === "string") { + return ((m as any).content as string).length > 0; + } + return true; + }); + + return finalMessages as T[]; +} + +export function getTokenHeuristic(model: string | null | undefined): number { + if (!model) { + return DEFAULT_TOKEN_HEURISTIC; + } + + const normalizedModel = model.toLowerCase(); + if (normalizedModel in MODEL_TOKEN_HEURISTICS) { + return MODEL_TOKEN_HEURISTICS[normalizedModel]; + } + + for (const [prefix, heuristic] of Object.entries(MODEL_TOKEN_HEURISTICS)) { + if (normalizedModel.startsWith(prefix)) { + return heuristic; + } + } + + return DEFAULT_TOKEN_HEURISTIC; +} + +export function extractModelCandidates(modelField: unknown): string[] { + if (typeof modelField !== "string") { + return []; + } + + return modelField + .split(",") + .map((candidate) => candidate.trim()) + .filter((candidate) => candidate.length > 0); +} + +export function getPrimaryModel(modelField: unknown): string | null { + const candidates = extractModelCandidates(modelField); + return candidates[0] ?? null; +} + +export function selectFallbackModel(modelField: unknown): string | null { + const candidates = extractModelCandidates(modelField); + if (candidates.length === 0) { + return null; + } + return candidates[1] ?? 
candidates[0]; +} + +export function serializeTools(tools: unknown): string { + if (!tools) { + return ""; + } + if (typeof tools === "string") { + return tools; + } + try { + return JSON.stringify(tools); + } catch (error) { + return ""; + } +} + +export function parseRequestPayload( + body: ValidRequestBody +): ParsedRequestPayload | null { + if (!body || typeof body !== "string") { + return null; + } + + try { + const parsed = JSON.parse(body); + if (!parsed || typeof parsed !== "object") { + return null; + } + return parsed as ParsedRequestPayload; + } catch (error) { + return null; + } +} + +export function estimateTokenCount( + parsedBody: ParsedRequestPayload | null, + primaryModel: string | null +): number | null { + if (!parsedBody) { + return null; + } + try { + let contentText = ""; + if (parsedBody.messages) { + for (const message of parsedBody.messages) { + if (typeof message?.content === "string") { + contentText += message.content; + } + } + } + const toolsText = serializeTools(parsedBody.tools); + + const combinedText = [toolsText, contentText] + .filter((segment) => segment.length > 0) + .join(" "); + + const heuristic = getTokenHeuristic(primaryModel ?? undefined); + const estimated = Math.ceil( + (combinedText.length + toolsText.length) * heuristic + ); + + return Number.isFinite(estimated) ? estimated : null; + } catch (error) { + return null; + } +} + +/** + * Attempts to read the requested completion/output token limit from the parsed body. + * Supports multiple common field names used across providers. Falls back to 0. + */ +// Note: completion token extraction is done within HeliconeProxyRequest.applyTokenLimitExceptionHandler + +export function getModelTokenLimit( + provider: Provider, + model: string | null | undefined +): number | null { + if (!model) { + return null; + } + + const providerName = heliconeProviderToModelProviderName(provider); + if (!providerName) { + return null; + } + + const config = findModelProviderConfig(model, providerName); + if (!config || typeof config.contextLength !== "number") { + return null; + } + + return config.contextLength; +} + +export function findModelProviderConfig( + model: string, + providerName: ModelProviderName +): ModelProviderConfig | null { + const directConfig = lookupProviderConfig(model, providerName); + if (directConfig) { + return directConfig; + } + return searchProviderModels(model, providerName); +} + +export function lookupProviderConfig( + model: string, + providerName: ModelProviderName +): ModelProviderConfig | null { + const candidates = buildLookupCandidates(model); + for (const candidate of candidates) { + const result = registry.getModelProviderConfigByProviderModelId( + candidate, + providerName + ); + if (result.error === null && result.data) { + return result.data; + } + } + return null; +} + +export function searchProviderModels( + model: string, + providerName: ModelProviderName +): ModelProviderConfig | null { + const providerModelsResult = registry.getProviderModels(providerName); + if (providerModelsResult.error !== null || !providerModelsResult.data) { + return null; + } + + for (const canonicalModel of providerModelsResult.data.values()) { + const configsResult = registry.getModelProviderConfigs(canonicalModel); + if (configsResult.error !== null || !configsResult.data) { + continue; + } + + for (const config of configsResult.data) { + if (config.provider !== providerName) { + continue; + } + + if (modelIdentifierMatches(model, config.providerModelId)) { + return config; + } + } + } + + 
return null; +} + +export function buildLookupCandidates(model: string): string[] { + const trimmed = model.trim(); + if (!trimmed) { + return []; + } + + const candidates = new Set(); + candidates.add(trimmed); + + const lower = trimmed.toLowerCase(); + if (lower !== trimmed) { + candidates.add(lower); + } + + const delimiters = [":", "-"]; + for (const delimiter of delimiters) { + let current = trimmed; + while (current.includes(delimiter)) { + current = current.substring(0, current.lastIndexOf(delimiter)); + const normalized = current.trim(); + if (!normalized || candidates.has(normalized)) { + continue; + } + candidates.add(normalized); + candidates.add(normalized.toLowerCase()); + } + } + + return Array.from(candidates); +} + +export function modelIdentifierMatches( + requestModel: string, + providerModelId: string +): boolean { + const requestVariants = buildModelIdentifierVariants(requestModel); + const providerVariants = buildModelIdentifierVariants(providerModelId); + + for (const requestVariant of requestVariants) { + for (const providerVariant of providerVariants) { + if (requestVariant === providerVariant) { + return true; + } + + if ( + requestVariant.endsWith(`/${providerVariant}`) || + requestVariant.endsWith(`:${providerVariant}`) || + requestVariant.endsWith(`-${providerVariant}`) + ) { + return true; + } + + if ( + providerVariant.endsWith(`/${requestVariant}`) || + providerVariant.endsWith(`:${requestVariant}`) || + providerVariant.endsWith(`-${requestVariant}`) + ) { + return true; + } + } + } + + const sanitizedRequest = sanitizeModelIdentifier(requestModel); + const sanitizedProvider = sanitizeModelIdentifier(providerModelId); + + if (sanitizedRequest.length === 0 || sanitizedProvider.length === 0) { + return false; + } + + const index = sanitizedRequest.indexOf(sanitizedProvider); + if (index > 0) { + return true; + } + + return false; +} + +export function buildModelIdentifierVariants(identifier: string): string[] { + const trimmed = identifier.trim(); + if (!trimmed) { + return []; + } + + const lower = trimmed.toLowerCase(); + const variants = new Set([trimmed, lower]); + + const delimiterParts = lower.split(/[:/]/); + if (delimiterParts.length > 1) { + const lastPart = delimiterParts[delimiterParts.length - 1]; + if (lastPart) { + variants.add(lastPart); + } + } + + return Array.from(variants).filter((variant) => variant.length > 0); +} + +export function sanitizeModelIdentifier(identifier: string): string { + return identifier.toLowerCase().replace(/[^a-z0-9]/g, ""); +} + +export function resolvePrimaryModel( + parsedBody: ParsedRequestPayload | null, + headerModelOverride: unknown +): string | null { + const headerModel = getPrimaryModel(headerModelOverride); + + if (!parsedBody) { + return headerModel; + } + + const bodyModel = getPrimaryModel(parsedBody.model); + return bodyModel ?? headerModel; +} + +export function applyTruncateStrategy( + parsedBody: ParsedRequestPayload +): ValidRequestBody | undefined { + if (!parsedBody.messages) { + return; + } + + for (const message of parsedBody.messages) { + if (typeof message?.content === "string") { + message.content = truncateAndNormalizeText(message.content); + } + } + + return JSON.stringify(parsedBody); +} + +export function applyMiddleOutStrategy( + parsedBody: ParsedRequestPayload, + primaryModel: string, + tokenLimit: number +): ValidRequestBody | undefined { + if (!Array.isArray(parsedBody.messages)) { + return; + } + + const originalMessages = (parsedBody.messages ?? 
[]) as LLMMessage[]; + + const trimmedMessages = middleOutMessagesToFitLimit( + originalMessages, + tokenLimit, + (candidate) => + estimateTokenCount( + { + ...parsedBody, + messages: candidate, + }, + primaryModel + ) + ); + + const changed = + JSON.stringify(trimmedMessages) !== JSON.stringify(originalMessages); + if (!changed) { + return; + } + + const finalPayload: ParsedRequestPayload = { + ...parsedBody, + messages: trimmedMessages, + }; + + return JSON.stringify(finalPayload); +} + +export function applyFallbackStrategy( + parsedBody: ParsedRequestPayload, + primaryModel: string, + estimatedTokens: number, + tokenLimit: number +): ValidRequestBody | undefined { + const fallbackModel = selectFallbackModel(parsedBody.model); + if (!fallbackModel) { + return; + } + + if (estimatedTokens >= tokenLimit) { + parsedBody.model = fallbackModel; + + return JSON.stringify(parsedBody); + } + + parsedBody.model = primaryModel; + return JSON.stringify(parsedBody); +}
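Usage sketch for the model-candidate helpers and the truncate-style normalization defined in `tokenLimitException.ts`; the relative import path is illustrative, and the expected values in the comments follow from the implementations above.

```typescript
import {
  extractModelCandidates,
  getPrimaryModel,
  selectFallbackModel,
  truncateAndNormalizeText,
} from "./tokenLimitException"; // illustrative path

// Comma-separated model field: the first candidate is the primary, the second is the fallback.
console.log(extractModelCandidates("gpt-4o, gpt-4o-mini")); // ["gpt-4o", "gpt-4o-mini"]
console.log(getPrimaryModel("gpt-4o, gpt-4o-mini")); // "gpt-4o"
console.log(selectFallbackModel("gpt-4o, gpt-4o-mini")); // "gpt-4o-mini"
console.log(selectFallbackModel("gpt-4o")); // "gpt-4o" (no second candidate, so the first is returned)

// Truncate-style normalization: strips UUID-like ids and whitespace around punctuation.
console.log(
  truncateAndNormalizeText(
    "Order  id:123e4567-e89b-12d3-a456-426614174000 , please retry ."
  )
); // "Order,please retry."
```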
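And a sketch of the middle-out helper driven by the module's own estimator; the messages and the 1,000-token budget are made-up example values.

```typescript
import {
  estimateTokenCount,
  middleOutMessagesToFitLimit,
  type LLMMessage,
} from "./tokenLimitException"; // illustrative path

const messages: LLMMessage[] = [
  { role: "system", content: "You answer questions about the attached report." },
  { role: "user", content: "..." }, // stand-in for a very long report pasted inline
  { role: "user", content: "Summarize the key risks." },
];

// Splits string contents into chunks, removes a centered window of middle chunks
// until the estimate fits the budget, and drops messages whose content ends up empty.
const trimmed = middleOutMessagesToFitLimit(messages, 1_000, (candidate) =>
  estimateTokenCount({ messages: candidate }, "gpt-4o")
);
console.log(trimmed);
```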