diff --git a/docs/docs.json b/docs/docs.json
index 9b69ae9af8..a865ca8043 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -36,7 +36,8 @@
"gateway/prompt-integration",
"features/advanced-usage/llm-security",
"features/advanced-usage/moderations",
- "features/advanced-usage/retries"
+ "features/advanced-usage/retries",
+ "features/advanced-usage/token-limit-exception-handlers"
]
},
{
diff --git a/docs/features/advanced-usage/token-limit-exception-handlers.mdx b/docs/features/advanced-usage/token-limit-exception-handlers.mdx
new file mode 100644
index 0000000000..d54c4a5b4e
--- /dev/null
+++ b/docs/features/advanced-usage/token-limit-exception-handlers.mdx
@@ -0,0 +1,122 @@
+---
+title: "Token Limit Exception Handlers"
+sidebarTitle: "Large Context"
+description: "Automatically handle requests that exceed a model's context window using truncate, middle-out, or fallback strategies."
+"twitter:title": "Large Context - Token Limit Exception Handlers"
+---
+
+import QuestionsSection from "/snippets/questions-section.mdx";
+
+When prompts get large, requests can exceed the model's maximum context window. Helicone can automatically apply strategies to keep your request within limits or switch to a fallback model — without changing your app code.
+
+## What This Does
+
+- Estimates tokens for your request based on model and content
+- Accounts for reserved output tokens (e.g., `max_tokens`, `max_output_tokens`)
+- Applies a chosen strategy only when the estimated input exceeds the allowed context
+
+<Note>
+Helicone uses provider-aware heuristics to estimate tokens and takes a best-effort approach across different request shapes; the sketch below shows the general budgeting idea.
+</Note>
+
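+The exact estimation is internal and provider-aware, but as a rough sketch the allowed input budget works out to the model's context length minus the reserved output tokens. The function below is illustrative only, not part of Helicone's API:
+
+```typescript
+// Illustrative sketch only: not Helicone's internal implementation.
+function allowedInputBudget(
+  contextLength: number,          // the model's maximum context window
+  requestedOutputTokens?: number  // e.g. max_tokens / max_output_tokens
+): number {
+  // When no output limit is requested, a slice of the window (~10%) is held back.
+  const reserved = requestedOutputTokens ?? Math.floor(contextLength * 0.1);
+  return Math.max(0, contextLength - reserved);
+}
+
+// A strategy is applied only when estimatedInputTokens > allowedInputBudget(...).
+```
+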
+## Strategies
+
+- Truncate (`truncate`): Normalize and trim message content to reduce token count.
+- Middle-out (`middle-out`): Preserve the beginning and end of messages while trimming middle content to fit the limit.
+- Fallback (`fallback`): Switch to an alternate model when the request is too large. Provide multiple candidates in the request body `model` field as a comma-separated list (first is primary, second is fallback).
+
+<Note>
+For `fallback`, Helicone switches to the second candidate when needed and normalizes `model` to the primary when the request is under the limit (see the sketch below). If your body lacks a `model`, set `Helicone-Model-Override`.
+</Note>
+
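+Candidate parsing follows the comma-separated convention above; the helper below is a minimal sketch (its name is illustrative, not a Helicone export):
+
+```typescript
+// "gpt-4o, gpt-4o-mini" -> primary "gpt-4o", fallback "gpt-4o-mini"
+function parseModelCandidates(modelField: string) {
+  const candidates = modelField
+    .split(",")
+    .map((candidate) => candidate.trim())
+    .filter((candidate) => candidate.length > 0);
+  return {
+    primary: candidates[0] ?? null,
+    fallback: candidates[1] ?? candidates[0] ?? null,
+  };
+}
+```
+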
+## Quick Start
+
+Add the `Helicone-Token-Limit-Exception-Handler` header to enable a strategy.
+
+<CodeGroup>
+```typescript Node.js
+import { OpenAI } from "openai";
+
+const client = new OpenAI({
+ baseURL: "https://ai-gateway.helicone.ai/v1",
+ apiKey: process.env.HELICONE_API_KEY,
+});
+
+// Middle-out strategy
+await client.chat.completions.create(
+ {
+ model: "gpt-4o", // or "gpt-4o, gpt-4o-mini" for fallback
+ messages: [
+ { role: "user", content: "A very long prompt ..." }
+ ],
+ max_tokens: 256
+ },
+ {
+ headers: {
+ "Helicone-Token-Limit-Exception-Handler": "middle-out"
+ }
+ }
+);
+```
+
+```python Python
+from openai import OpenAI
+import os
+
+client = OpenAI(
+ base_url="https://ai-gateway.helicone.ai/v1",
+ api_key=os.getenv("HELICONE_API_KEY"),
+)
+
+# Fallback strategy with model candidates
+resp = client.chat.completions.create(
+ model="gpt-4o, gpt-4o-mini",
+ messages=[{"role": "user", "content": "A very long prompt ..."}],
+ max_tokens=256,
+ extra_headers={
+ "Helicone-Token-Limit-Exception-Handler": "fallback",
+ }
+)
+```
+
+```bash cURL
+curl --request POST \
+ --url https://ai-gateway.helicone.ai/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer $HELICONE_API_KEY" \
+ -H "Helicone-Token-Limit-Exception-Handler: truncate" \
+ --data '{
+ "model": "gpt-4o",
+ "messages": [{"role": "user", "content": "A very long prompt ..."}],
+ "max_tokens": 256
+ }'
+```
+</CodeGroup>
+
+## Configuration
+
+Enable and control via headers:
+
+<ParamField header="Helicone-Token-Limit-Exception-Handler" type="string">
+ One of: `truncate`, `middle-out`, `fallback`.
+</ParamField>
+
+<ParamField header="Helicone-Model-Override" type="string">
+ Optional. Used for token estimation and model selection when the request body doesn't include a `model` or you need to override it.
+</ParamField>
+
+### Fallback Model Selection
+
+- Provide candidates in the body: `model: "primary, fallback"`
+- Helicone chooses the fallback when input exceeds the allowed context
+- When under the limit, Helicone normalizes the `model` to the primary
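+
+For example, with `model: "gpt-4o, gpt-4o-mini"` and the `fallback` handler, the body forwarded to the provider ends up in one of two shapes (illustrative values):
+
+```typescript
+const messages = [{ role: "user", content: "A very long prompt ..." }];
+
+// Estimated input fits within gpt-4o's allowed context:
+// the model field is normalized to the primary.
+const underLimit = { model: "gpt-4o", messages, max_tokens: 256 };
+
+// Estimated input exceeds the allowed context:
+// the fallback model is used instead.
+const overLimit = { model: "gpt-4o-mini", messages, max_tokens: 256 };
+```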
+
+## Notes
+
+- Token estimation is heuristic and provider-aware; behavior is best-effort across request shapes.
+- Allowed context accounts for requested completion tokens (e.g., `max_tokens`).
+- Changes are applied before the provider call; your logged request reflects the applied strategy.
+
+<QuestionsSection />
+
diff --git a/docs/helicone-headers/header-directory.mdx b/docs/helicone-headers/header-directory.mdx
index 0233151151..c5f330aa6f 100644
--- a/docs/helicone-headers/header-directory.mdx
+++ b/docs/helicone-headers/header-directory.mdx
@@ -150,6 +150,17 @@ The URL to proxy the request to when using _gateway.helicone.ai_. For example, `
Whether to exclude the request from the response. Set to `true` or `false`.
+<ParamField header="Helicone-Token-Limit-Exception-Handler" type="string">
+ Control how Helicone handles requests that would exceed a model's context window. Accepted values:
+
+ - `truncate` — Best-effort normalization and trimming of message content to reduce token count.
+ - `middle-out` — Preserve the beginning and end of messages while removing middle content to fit within the limit. Uses token estimation to keep high-value context.
+ - `fallback` — Switch to an alternate model when input exceeds the context limit. Provide multiple candidates in the request body's `model` field as a comma-separated list (e.g., `"gpt-4o, gpt-4o-mini"`). Helicone picks the second model as the fallback when needed. When under the limit, Helicone normalizes the `model` field to the primary model.
+
+ <Note>
+ If your request body does not include a `model` or you need to override it for estimation, set `Helicone-Model-Override`. For fallbacks, specify multiple `model` candidates in the body; only the first two are considered.
+ </Note>
+</ParamField>
Whether to cache your responses. Set to `true` or `false`. You can customize the behavior of the cache feature by setting additional headers in your request.
diff --git a/worker/src/lib/models/HeliconeHeaders.ts b/worker/src/lib/models/HeliconeHeaders.ts
index 117fce12ab..91753312d2 100644
--- a/worker/src/lib/models/HeliconeHeaders.ts
+++ b/worker/src/lib/models/HeliconeHeaders.ts
@@ -12,6 +12,11 @@ export type HeliconeFallback = {
};
export type HeliconeBearerKeyType = "standard" | "rate-limited";
+export enum HeliconeTokenLimitExceptionHandler {
+ Truncate = "truncate",
+ MiddleOut = "middle-out",
+ Fallback = "fallback",
+}
export interface IHeliconeHeaders {
heliconeAuth: Nullable;
@@ -58,6 +63,7 @@ export interface IHeliconeHeaders {
omitResponse: boolean;
omitRequest: boolean;
};
+ tokenLimitExceptionHandler: Nullable<HeliconeTokenLimitExceptionHandler>;
sessionHeaders: {
sessionId: Nullable<string>;
path: Nullable<string>;
@@ -125,6 +131,7 @@ export class HeliconeHeaders implements IHeliconeHeaders {
promptName: Nullable<string>;
userId: Nullable<string>;
omitHeaders: { omitResponse: boolean; omitRequest: boolean };
+ tokenLimitExceptionHandler: Nullable<HeliconeTokenLimitExceptionHandler>;
sessionHeaders: {
sessionId: Nullable<string>;
path: Nullable<string>;
@@ -178,6 +185,8 @@ export class HeliconeHeaders implements IHeliconeHeaders {
};
this.promptName = heliconeHeaders.promptName;
this.omitHeaders = heliconeHeaders.omitHeaders;
+ this.tokenLimitExceptionHandler =
+ heliconeHeaders.tokenLimitExceptionHandler;
this.sessionHeaders = heliconeHeaders.sessionHeaders;
this.userId = heliconeHeaders.userId;
this.heliconeProperties = this.getHeliconeProperties(heliconeHeaders);
@@ -364,6 +373,7 @@ export class HeliconeHeaders implements IHeliconeHeaders {
omitResponse: this.headers.get("Helicone-Omit-Response") === "true",
omitRequest: this.headers.get("Helicone-Omit-Request") === "true",
},
+ tokenLimitExceptionHandler: this.getTokenLimitExceptionHandler(),
sessionHeaders: {
sessionId: this.headers.get("Helicone-Session-Id") ?? null,
path: this.headers.get("Helicone-Session-Path") ?? null,
@@ -403,6 +413,25 @@ export class HeliconeHeaders implements IHeliconeHeaders {
};
}
+ private getTokenLimitExceptionHandler(): Nullable<HeliconeTokenLimitExceptionHandler> {
+ const handler = this.headers.get("Helicone-Token-Limit-Exception-Handler");
+ if (!handler) {
+ return null;
+ }
+
+ const normalized = handler.toLowerCase();
+ switch (normalized) {
+ case HeliconeTokenLimitExceptionHandler.Truncate:
+ return HeliconeTokenLimitExceptionHandler.Truncate;
+ case HeliconeTokenLimitExceptionHandler.MiddleOut:
+ return HeliconeTokenLimitExceptionHandler.MiddleOut;
+ case HeliconeTokenLimitExceptionHandler.Fallback:
+ return HeliconeTokenLimitExceptionHandler.Fallback;
+ default:
+ return null;
+ }
+ }
+
private getRetryHeaders(): IHeliconeHeaders["retryHeaders"] {
const retryEnabled = this.headers.get("helicone-retry-enabled");
if (retryEnabled === null) {
diff --git a/worker/src/lib/models/HeliconeProxyRequest.ts b/worker/src/lib/models/HeliconeProxyRequest.ts
index a70a70e3f6..fb95478a8c 100644
--- a/worker/src/lib/models/HeliconeProxyRequest.ts
+++ b/worker/src/lib/models/HeliconeProxyRequest.ts
@@ -5,7 +5,10 @@ import { approvedDomains } from "@helicone-package/cost/providers/mappings";
import { RequestWrapper } from "../RequestWrapper";
import { buildTargetUrl } from "../clients/ProviderClient";
import { Result, ok } from "../util/results";
-import { IHeliconeHeaders } from "./HeliconeHeaders";
+import {
+ HeliconeTokenLimitExceptionHandler,
+ IHeliconeHeaders,
+} from "./HeliconeHeaders";
import { parseJSXObject } from "@helicone/prompts";
import { TemplateWithInputs } from "@helicone/prompts/dist/objectParser";
@@ -26,6 +29,16 @@ export type RetryOptions = {
export type HeliconeProperties = Record<string, string>;
type Nullable<T> = T | null;
+import {
+ applyFallbackStrategy,
+ applyMiddleOutStrategy,
+ applyTruncateStrategy,
+ estimateTokenCount,
+ getModelTokenLimit,
+ parseRequestPayload,
+ resolvePrimaryModel,
+} from "../util/tokenLimitException";
+
// This neatly formats and holds all of the state that a request can come into Helicone
export interface HeliconeProxyRequest {
provider: Provider;
@@ -162,6 +175,16 @@ export class HeliconeProxyRequestMapper {
body = await this.request.unsafeGetBodyText();
}
+ // Apply token limit exception handler here and update buffer if changed
+ const bodyWithTokenLimitExceptionHandler =
+ this.applyTokenLimitExceptionHandler(body);
+ if (typeof bodyWithTokenLimitExceptionHandler === "string") {
+ body = bodyWithTokenLimitExceptionHandler;
+ await this.request.requestBodyBuffer.tempSetBody(
+ bodyWithTokenLimitExceptionHandler
+ );
+ }
+
return {
data: {
heliconePromptTemplate: await this.getHeliconeTemplate(),
@@ -197,6 +220,90 @@ export class HeliconeProxyRequestMapper {
};
}
+ public applyTokenLimitExceptionHandler(
+ body: ValidRequestBody
+ ): ValidRequestBody | undefined {
+ const handler = this.request.heliconeHeaders.tokenLimitExceptionHandler;
+ if (!handler) {
+ return;
+ }
+
+ const parsedBody = parseRequestPayload(body);
+ if (!parsedBody) {
+ return;
+ }
+
+ const primaryModel = resolvePrimaryModel(
+ parsedBody,
+ this.request.heliconeHeaders.modelOverride
+ );
+ const estimatedTokens = estimateTokenCount(parsedBody, primaryModel);
+
+ if (!primaryModel) {
+ return;
+ }
+
+ const modelContextLimit = getModelTokenLimit(this.provider, primaryModel);
+
+ // Extract requested completion/output limit directly here (provider-agnostic best-effort)
+ const anyBody = parsedBody as any;
+ const completionCandidates: Array<unknown> = [
+ anyBody?.max_completion_tokens,
+ anyBody?.max_tokens,
+ anyBody?.max_output_tokens,
+ anyBody?.maxOutputTokens,
+ anyBody?.response?.max_tokens,
+ anyBody?.response?.max_output_tokens,
+ anyBody?.response?.maxOutputTokens,
+ anyBody?.generation_config?.max_output_tokens,
+ anyBody?.generation_config?.maxOutputTokens,
+ anyBody?.generationConfig?.max_output_tokens,
+ anyBody?.generationConfig?.maxOutputTokens,
+ ];
+ const requestedCompletionTokens = (() => {
+ for (const val of completionCandidates) {
+ if (typeof val === "number" && Number.isFinite(val) && val > 0) {
+ return Math.floor(val);
+ }
+ }
+ return 0;
+ })();
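+ // Reserve room for the model's output: use the requested completion tokens when
+ // present, otherwise hold back roughly 10% of the context window by default.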
+ const tokenLimit =
+ modelContextLimit === null
+ ? null
+ : Math.max(
+ 0,
+ modelContextLimit -
+ (requestedCompletionTokens || modelContextLimit * 0.1)
+ );
+
+ if (
+ estimatedTokens === null ||
+ tokenLimit === null ||
+ (estimatedTokens <= tokenLimit &&
+ handler !== HeliconeTokenLimitExceptionHandler.Fallback) // fallback still runs when under the limit so the comma-separated model field is normalized to the primary
+ ) {
+ return;
+ }
+
+ // TODO: Add some indicator as to what was applied so users understand why their request looks different
+ switch (handler) {
+ case HeliconeTokenLimitExceptionHandler.Truncate:
+ return applyTruncateStrategy(parsedBody);
+ case HeliconeTokenLimitExceptionHandler.MiddleOut:
+ return applyMiddleOutStrategy(parsedBody, primaryModel, tokenLimit);
+ case HeliconeTokenLimitExceptionHandler.Fallback:
+ return applyFallbackStrategy(
+ parsedBody,
+ primaryModel,
+ estimatedTokens,
+ tokenLimit
+ );
+ default:
+ return;
+ }
+ }
+
private validateApiConfiguration(api_base: string | undefined): boolean {
return (
api_base === undefined ||
diff --git a/worker/src/lib/util/tokenLimitException.ts b/worker/src/lib/util/tokenLimitException.ts
new file mode 100644
index 0000000000..f6f17ca422
--- /dev/null
+++ b/worker/src/lib/util/tokenLimitException.ts
@@ -0,0 +1,714 @@
+import { Provider } from "@helicone-package/llm-mapper/types";
+import { registry } from "@helicone-package/cost/models/registry";
+import { heliconeProviderToModelProviderName } from "@helicone-package/cost/models/provider-helpers";
+import type { ModelProviderName } from "@helicone-package/cost/models/providers";
+import type { ModelProviderConfig } from "@helicone-package/cost/models/types";
+import { ValidRequestBody } from "../../RequestBodyBuffer/IRequestBodyBuffer";
+
+export type LLMMessage = {
+ role?: string;
+ content?: unknown;
+ [key: string]: unknown;
+};
+
+export type ParsedRequestPayload = {
+ model?: string;
+ messages?: LLMMessage[];
+ tools?: unknown;
+};
+
+const DEFAULT_TOKEN_HEURISTIC = 0.25;
+
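+// Best-effort tokens-per-character ratios; content length is multiplied by the
+// matching ratio (or the default) to approximate input tokens.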
+const MODEL_TOKEN_HEURISTICS: Record<string, number> = {
+ "gpt-4o": 0.25,
+ "gpt-3.5-turbo": 0.2,
+ "gpt-4o-mini": 0.25,
+ "gpt-4o-nano": 0.15,
+ "gpt-o3": 0.25,
+};
+
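+// Normalization applied by the truncate strategy: strip tag-like markup and long
+// IDs, then collapse whitespace around punctuation and brackets.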
+const NORMALIZATION_PATTERNS: Array<[RegExp, string]> = [
+ [/<[^>]*>/g, ""],
+ [/\b(id|uuid):[a-f0-9-]{36}\b/gi, ""],
+ [/\s*,\s*/g, ","],
+ [/\s*\.\s*/g, "."],
+ [/\s*:\s*/g, ":"],
+ [/\s*;\s*/g, ";"],
+ [/\s*\(\s*/g, "("],
+ [/\s*\)\s*/g, ")"],
+ [/\s*\{\s*/g, "{"],
+ [/\s*\}\s*/g, "}"],
+ [/\s*\[\s*/g, "["],
+ [/\s*\]\s*/g, "]"],
+ [/\s*=\s*/g, "="],
+ [/\s*>\s*/g, ">"],
+ [/\s*<\s*/g, "<"],
+];
+
+export function truncateAndNormalizeText(
+ input: string | null | undefined
+): string {
+ if (!input) {
+ return "";
+ }
+
+ let normalized = input;
+
+ for (const [pattern, replacement] of NORMALIZATION_PATTERNS) {
+ normalized = normalized.replace(pattern, replacement);
+ }
+
+ normalized = normalized.replace(/\s+/g, " ").trim();
+
+ return normalized;
+}
+
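+// Trims a centered window of content so the beginning and end of the conversation
+// are preserved while the middle is removed to fit the token budget.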
+export function middleOutMessagesToFitLimit<T extends LLMMessage>(
+ messages: T[],
+ maxTokens: number,
+ estimateTokens: (candidate: T[]) => number | null
+): T[] {
+ if (!Array.isArray(messages) || messages.length === 0) {
+ return [];
+ }
+
+ if (!Number.isFinite(maxTokens) || maxTokens <= 0) {
+ return messages.slice(0, Math.min(messages.length, 1));
+ }
+
+ type Chunk = {
+ messageIndex: number;
+ order: number;
+ content: string;
+ };
+
+ const original: T[] = messages.slice();
+
+ const DEFAULT_CHUNK_SIZE = 1000;
+ const DEFAULT_CHUNK_OVERLAP = 0;
+ // Important: avoid char-level splitting (""), which explodes chunk counts
+ // and severely hurts performance for large inputs. Keep word/line separators only.
+ const DEFAULT_SEPARATORS = ["\n\n", "\n", ".", " "];
+
+ function splitTextRecursive(
+ text: string,
+ chunkSize: number = DEFAULT_CHUNK_SIZE,
+ chunkOverlap: number = DEFAULT_CHUNK_OVERLAP,
+ separators: string[] = DEFAULT_SEPARATORS
+ ): string[] {
+ if (chunkSize <= 0) return [text];
+ if (text.length <= chunkSize) return [text];
+
+ let chosenSep = separators.find((s) => s !== "" && text.includes(s));
+ if (chosenSep === undefined)
+ chosenSep = separators[separators.length - 1] ?? " ";
+
+ const splits = text.split(chosenSep);
+ const chunks: string[] = [];
+ const joiner = chosenSep;
+
+ let current: string[] = [];
+ let currentLen = 0;
+
+ for (const piece of splits) {
+ const extra = current.length > 0 && joiner ? joiner.length : 0;
+ if (currentLen + extra + piece.length > chunkSize && current.length > 0) {
+ const chunk = current.join(joiner);
+ if (chunk.length > chunkSize) {
+ const nextSeps = separators.slice(
+ Math.max(0, separators.indexOf(chosenSep) + 1)
+ );
+ const subs = splitTextRecursive(
+ chunk,
+ chunkSize,
+ chunkOverlap,
+ nextSeps
+ );
+ chunks.push(...subs);
+ } else {
+ chunks.push(chunk);
+ }
+
+ if (chunkOverlap > 0) {
+ let remaining = chunkOverlap;
+ const overlapped: string[] = [];
+ for (let i = current.length - 1; i >= 0 && remaining > 0; i--) {
+ const token = current[i];
+ const tokenLen =
+ token.length + (i > 0 && joiner ? joiner.length : 0);
+ overlapped.unshift(token);
+ remaining -= tokenLen;
+ }
+ current = overlapped;
+ currentLen = overlapped.join(joiner).length;
+ } else {
+ current = [];
+ currentLen = 0;
+ }
+ }
+
+ if (piece.length > 0) {
+ if (current.length > 0 && joiner) currentLen += joiner.length;
+ current.push(piece);
+ currentLen += piece.length;
+ }
+ }
+
+ if (current.length > 0) {
+ const chunk = current.join(joiner);
+ if (chunk.length > chunkSize) {
+ const nextSeps = separators.slice(
+ Math.max(0, separators.indexOf(chosenSep) + 1)
+ );
+ const subs = splitTextRecursive(
+ chunk,
+ chunkSize,
+ chunkOverlap,
+ nextSeps
+ );
+ chunks.push(...subs);
+ } else {
+ chunks.push(chunk);
+ }
+ }
+
+ return chunks;
+ }
+
+ const chunks: Chunk[] = [];
+ const stringMessageIndexes = new Set<number>();
+
+ for (let i = 0; i < original.length; i++) {
+ const m = original[i];
+ if (typeof m?.content === "string" && m.content.length > 0) {
+ stringMessageIndexes.add(i);
+ const parts = splitTextRecursive(m.content);
+ for (let order = 0; order < parts.length; order++) {
+ chunks.push({ messageIndex: i, order, content: parts[order] });
+ }
+ }
+ }
+
+ if (chunks.length === 0) {
+ const working = original.slice();
+ let currentEstimate = estimateTokens(working);
+ if (currentEstimate === null || currentEstimate <= maxTokens) {
+ return working;
+ }
+
+ while (working.length > 2) {
+ const middleIndex = Math.floor(working.length / 2);
+ working.splice(middleIndex, 1);
+ currentEstimate = estimateTokens(working);
+ if (currentEstimate === null || currentEstimate <= maxTokens) {
+ break;
+ }
+ }
+ return working;
+ }
+
+ function buildMessagesFromKept(kept: Set<number>): T[] {
+ const byMessage = new Map<number, string[]>();
+ for (let idx = 0; idx < chunks.length; idx++) {
+ if (!kept.has(idx)) continue;
+ const c = chunks[idx];
+ if (!byMessage.has(c.messageIndex)) byMessage.set(c.messageIndex, []);
+ byMessage.get(c.messageIndex)!.push(c.content);
+ }
+
+ return original.map((m, i) => {
+ const clone = { ...(m as any) } as T;
+ if (typeof clone?.content === "string") {
+ const parts = byMessage.get(i) ?? [];
+ (clone as any).content = parts.join("");
+ }
+ return clone;
+ });
+ }
+
+ function buildMessagesWithOnlyChunk(chunkIndex: number): T[] {
+ const target = chunks[chunkIndex];
+ return original.map((m, i) => {
+ const clone = { ...(m as any) } as T;
+ if (typeof clone?.content === "string") {
+ (clone as any).content =
+ i === target.messageIndex ? target.content : "";
+ }
+ return clone;
+ });
+ }
+
+ // Compute a per-chunk weight once using a simple proportional heuristic
+ // derived from a single full estimate call.
+ const baseTokens = estimateTokens([]) ?? 0;
+ // Total content characters across all chunks
+ const totalChars = chunks.reduce((acc, c) => acc + c.content.length, 0);
+ // Approximate total tokens if we kept everything (one estimate call on full content)
+ const allKept = new Set(chunks.map((_, i) => i));
+ const fullEstimate = estimateTokens(buildMessagesFromKept(allKept));
+
+ // If estimation failed, fall back to current messages (no trimming)
+ if (fullEstimate === null) {
+ return buildMessagesFromKept(allKept);
+ }
+
+ // If already within budget, return original
+ if (fullEstimate <= maxTokens) {
+ return buildMessagesFromKept(allKept);
+ }
+
+ const budgetForChunks = Math.max(0, maxTokens - baseTokens);
+ const contentTokens = Math.max(0, fullEstimate - baseTokens);
+ const tokensPerChar = totalChars > 0 ? contentTokens / totalChars : 0;
+
+ // Build weights for each chunk once
+ const weights = chunks.map((c) => {
+ const raw = Math.floor(tokensPerChar * c.content.length);
+ // Ensure non-empty chunks have at least weight 1 when there is content budget
+ if (contentTokens > 0 && c.content.length > 0) {
+ return Math.max(1, raw);
+ }
+ return Math.max(0, raw);
+ });
+
+ // Early fallback: if budget can't even cover any content tokens, keep nothing from content
+ if (budgetForChunks <= 0 || contentTokens <= 0 || totalChars === 0) {
+ const keptNone = new Set<number>();
+ const finalNone = buildMessagesFromKept(keptNone).filter((m) => {
+ if (typeof (m as any)?.content === "string") {
+ return ((m as any).content as string).length > 0;
+ }
+ return true;
+ });
+ return finalNone as T[];
+ }
+
+ // Simple contiguous middle removal: remove a centered window until we've
+ // removed at least the number of tokens we need to cut. Keep the rest.
+ const n = weights.length;
+ const kept = new Set<number>();
+ const cutTokens = Math.max(0, contentTokens - budgetForChunks);
+ if (n === 0 || cutTokens <= 0) {
+ // Nothing to remove
+ for (let i = 0; i < n; i++) kept.add(i);
+ } else {
+ // Centered window [L, R) to remove
+ let L = Math.floor((n - 1) / 2);
+ let R = L + 1;
+ let removed = 0;
+
+ // Optionally include the very center chunk for odd lengths
+ removed += weights[L];
+ L -= 1;
+
+ let takeRight = true;
+ while (removed < cutTokens && (L >= 0 || R < n)) {
+ if (takeRight && R < n) {
+ removed += weights[R];
+ R += 1;
+ } else if (L >= 0) {
+ removed += weights[L];
+ L -= 1;
+ } else if (R < n) {
+ removed += weights[R];
+ R += 1;
+ }
+ takeRight = !takeRight;
+ }
+
+ // Keep everything outside the removal window
+ for (let i = 0; i < Math.max(0, L + 1); i++) kept.add(i);
+ for (let i = Math.min(n, R); i < n; i++) kept.add(i);
+ }
+
+ const finalMessages = buildMessagesFromKept(kept).filter((m) => {
+ if (typeof (m as any)?.content === "string") {
+ return ((m as any).content as string).length > 0;
+ }
+ return true;
+ });
+
+ return finalMessages as T[];
+}
+
+export function getTokenHeuristic(model: string | null | undefined): number {
+ if (!model) {
+ return DEFAULT_TOKEN_HEURISTIC;
+ }
+
+ const normalizedModel = model.toLowerCase();
+ if (normalizedModel in MODEL_TOKEN_HEURISTICS) {
+ return MODEL_TOKEN_HEURISTICS[normalizedModel];
+ }
+
+ for (const [prefix, heuristic] of Object.entries(MODEL_TOKEN_HEURISTICS)) {
+ if (normalizedModel.startsWith(prefix)) {
+ return heuristic;
+ }
+ }
+
+ return DEFAULT_TOKEN_HEURISTIC;
+}
+
+export function extractModelCandidates(modelField: unknown): string[] {
+ if (typeof modelField !== "string") {
+ return [];
+ }
+
+ return modelField
+ .split(",")
+ .map((candidate) => candidate.trim())
+ .filter((candidate) => candidate.length > 0);
+}
+
+export function getPrimaryModel(modelField: unknown): string | null {
+ const candidates = extractModelCandidates(modelField);
+ return candidates[0] ?? null;
+}
+
+export function selectFallbackModel(modelField: unknown): string | null {
+ const candidates = extractModelCandidates(modelField);
+ if (candidates.length === 0) {
+ return null;
+ }
+ return candidates[1] ?? candidates[0];
+}
+
+export function serializeTools(tools: unknown): string {
+ if (!tools) {
+ return "";
+ }
+ if (typeof tools === "string") {
+ return tools;
+ }
+ try {
+ return JSON.stringify(tools);
+ } catch (error) {
+ return "";
+ }
+}
+
+export function parseRequestPayload(
+ body: ValidRequestBody
+): ParsedRequestPayload | null {
+ if (!body || typeof body !== "string") {
+ return null;
+ }
+
+ try {
+ const parsed = JSON.parse(body);
+ if (!parsed || typeof parsed !== "object") {
+ return null;
+ }
+ return parsed as ParsedRequestPayload;
+ } catch (error) {
+ return null;
+ }
+}
+
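+/**
+ * Best-effort token estimate: string message content plus serialized tools
+ * (tools are weighted twice), scaled by the per-model tokens-per-character heuristic.
+ */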
+export function estimateTokenCount(
+ parsedBody: ParsedRequestPayload | null,
+ primaryModel: string | null
+): number | null {
+ if (!parsedBody) {
+ return null;
+ }
+ try {
+ let contentText = "";
+ if (parsedBody.messages) {
+ for (const message of parsedBody.messages) {
+ if (typeof message?.content === "string") {
+ contentText += message.content;
+ }
+ }
+ }
+ const toolsText = serializeTools(parsedBody.tools);
+
+ const combinedText = [toolsText, contentText]
+ .filter((segment) => segment.length > 0)
+ .join(" ");
+
+ const heuristic = getTokenHeuristic(primaryModel ?? undefined);
+ const estimated = Math.ceil(
+ (combinedText.length + toolsText.length) * heuristic
+ );
+
+ return Number.isFinite(estimated) ? estimated : null;
+ } catch (error) {
+ return null;
+ }
+}
+
+// Note: extraction of the requested completion/output token limit (max_tokens,
+// max_output_tokens, etc.) happens in HeliconeProxyRequest.applyTokenLimitExceptionHandler.
+
+export function getModelTokenLimit(
+ provider: Provider,
+ model: string | null | undefined
+): number | null {
+ if (!model) {
+ return null;
+ }
+
+ const providerName = heliconeProviderToModelProviderName(provider);
+ if (!providerName) {
+ return null;
+ }
+
+ const config = findModelProviderConfig(model, providerName);
+ if (!config || typeof config.contextLength !== "number") {
+ return null;
+ }
+
+ return config.contextLength;
+}
+
+export function findModelProviderConfig(
+ model: string,
+ providerName: ModelProviderName
+): ModelProviderConfig | null {
+ const directConfig = lookupProviderConfig(model, providerName);
+ if (directConfig) {
+ return directConfig;
+ }
+ return searchProviderModels(model, providerName);
+}
+
+export function lookupProviderConfig(
+ model: string,
+ providerName: ModelProviderName
+): ModelProviderConfig | null {
+ const candidates = buildLookupCandidates(model);
+ for (const candidate of candidates) {
+ const result = registry.getModelProviderConfigByProviderModelId(
+ candidate,
+ providerName
+ );
+ if (result.error === null && result.data) {
+ return result.data;
+ }
+ }
+ return null;
+}
+
+export function searchProviderModels(
+ model: string,
+ providerName: ModelProviderName
+): ModelProviderConfig | null {
+ const providerModelsResult = registry.getProviderModels(providerName);
+ if (providerModelsResult.error !== null || !providerModelsResult.data) {
+ return null;
+ }
+
+ for (const canonicalModel of providerModelsResult.data.values()) {
+ const configsResult = registry.getModelProviderConfigs(canonicalModel);
+ if (configsResult.error !== null || !configsResult.data) {
+ continue;
+ }
+
+ for (const config of configsResult.data) {
+ if (config.provider !== providerName) {
+ continue;
+ }
+
+ if (modelIdentifierMatches(model, config.providerModelId)) {
+ return config;
+ }
+ }
+ }
+
+ return null;
+}
+
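+// Builds progressively shorter lookup keys by stripping ":" and "-" suffixes so
+// versioned identifiers (e.g. "gpt-4o-2024-08-06") can match a registry entry.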
+export function buildLookupCandidates(model: string): string[] {
+ const trimmed = model.trim();
+ if (!trimmed) {
+ return [];
+ }
+
+ const candidates = new Set<string>();
+ candidates.add(trimmed);
+
+ const lower = trimmed.toLowerCase();
+ if (lower !== trimmed) {
+ candidates.add(lower);
+ }
+
+ const delimiters = [":", "-"];
+ for (const delimiter of delimiters) {
+ let current = trimmed;
+ while (current.includes(delimiter)) {
+ current = current.substring(0, current.lastIndexOf(delimiter));
+ const normalized = current.trim();
+ if (!normalized || candidates.has(normalized)) {
+ continue;
+ }
+ candidates.add(normalized);
+ candidates.add(normalized.toLowerCase());
+ }
+ }
+
+ return Array.from(candidates);
+}
+
+export function modelIdentifierMatches(
+ requestModel: string,
+ providerModelId: string
+): boolean {
+ const requestVariants = buildModelIdentifierVariants(requestModel);
+ const providerVariants = buildModelIdentifierVariants(providerModelId);
+
+ for (const requestVariant of requestVariants) {
+ for (const providerVariant of providerVariants) {
+ if (requestVariant === providerVariant) {
+ return true;
+ }
+
+ if (
+ requestVariant.endsWith(`/${providerVariant}`) ||
+ requestVariant.endsWith(`:${providerVariant}`) ||
+ requestVariant.endsWith(`-${providerVariant}`)
+ ) {
+ return true;
+ }
+
+ if (
+ providerVariant.endsWith(`/${requestVariant}`) ||
+ providerVariant.endsWith(`:${requestVariant}`) ||
+ providerVariant.endsWith(`-${requestVariant}`)
+ ) {
+ return true;
+ }
+ }
+ }
+
+ const sanitizedRequest = sanitizeModelIdentifier(requestModel);
+ const sanitizedProvider = sanitizeModelIdentifier(providerModelId);
+
+ if (sanitizedRequest.length === 0 || sanitizedProvider.length === 0) {
+ return false;
+ }
+
+ const index = sanitizedRequest.indexOf(sanitizedProvider);
+ if (index > 0) {
+ return true;
+ }
+
+ return false;
+}
+
+export function buildModelIdentifierVariants(identifier: string): string[] {
+ const trimmed = identifier.trim();
+ if (!trimmed) {
+ return [];
+ }
+
+ const lower = trimmed.toLowerCase();
+ const variants = new Set([trimmed, lower]);
+
+ const delimiterParts = lower.split(/[:/]/);
+ if (delimiterParts.length > 1) {
+ const lastPart = delimiterParts[delimiterParts.length - 1];
+ if (lastPart) {
+ variants.add(lastPart);
+ }
+ }
+
+ return Array.from(variants).filter((variant) => variant.length > 0);
+}
+
+export function sanitizeModelIdentifier(identifier: string): string {
+ return identifier.toLowerCase().replace(/[^a-z0-9]/g, "");
+}
+
+export function resolvePrimaryModel(
+ parsedBody: ParsedRequestPayload | null,
+ headerModelOverride: unknown
+): string | null {
+ const headerModel = getPrimaryModel(headerModelOverride);
+
+ if (!parsedBody) {
+ return headerModel;
+ }
+
+ const bodyModel = getPrimaryModel(parsedBody.model);
+ return bodyModel ?? headerModel;
+}
+
+export function applyTruncateStrategy(
+ parsedBody: ParsedRequestPayload
+): ValidRequestBody | undefined {
+ if (!parsedBody.messages) {
+ return;
+ }
+
+ for (const message of parsedBody.messages) {
+ if (typeof message?.content === "string") {
+ message.content = truncateAndNormalizeText(message.content);
+ }
+ }
+
+ return JSON.stringify(parsedBody);
+}
+
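+// Trims the middle of string message content until the estimate fits the limit;
+// returns undefined when nothing needed to change.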
+export function applyMiddleOutStrategy(
+ parsedBody: ParsedRequestPayload,
+ primaryModel: string,
+ tokenLimit: number
+): ValidRequestBody | undefined {
+ if (!Array.isArray(parsedBody.messages)) {
+ return;
+ }
+
+ const originalMessages = (parsedBody.messages ?? []) as LLMMessage[];
+
+ const trimmedMessages = middleOutMessagesToFitLimit(
+ originalMessages,
+ tokenLimit,
+ (candidate) =>
+ estimateTokenCount(
+ {
+ ...parsedBody,
+ messages: candidate,
+ },
+ primaryModel
+ )
+ );
+
+ const changed =
+ JSON.stringify(trimmedMessages) !== JSON.stringify(originalMessages);
+ if (!changed) {
+ return;
+ }
+
+ const finalPayload: ParsedRequestPayload = {
+ ...parsedBody,
+ messages: trimmedMessages,
+ };
+
+ return JSON.stringify(finalPayload);
+}
+
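+// Switches to the second model candidate when the estimate meets or exceeds the
+// limit; otherwise normalizes the model field to the primary candidate.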
+export function applyFallbackStrategy(
+ parsedBody: ParsedRequestPayload,
+ primaryModel: string,
+ estimatedTokens: number,
+ tokenLimit: number
+): ValidRequestBody | undefined {
+ const fallbackModel = selectFallbackModel(parsedBody.model);
+ if (!fallbackModel) {
+ return;
+ }
+
+ if (estimatedTokens >= tokenLimit) {
+ parsedBody.model = fallbackModel;
+
+ return JSON.stringify(parsedBody);
+ }
+
+ parsedBody.model = primaryModel;
+ return JSON.stringify(parsedBody);
+}