worker/src/lib/models/HeliconeHeaders.ts (29 additions, 0 deletions)
@@ -12,6 +12,11 @@ export type HeliconeFallback = {
};

export type HeliconeBearerKeyType = "standard" | "rate-limited";
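// Strategy for handling requests whose estimated prompt exceeds the model's token
// limit; parsed from the `Helicone-Token-Limit-Exception-Handler` header (see below).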
export enum HeliconeTokenLimitExceptionHandler {
Truncate = "truncate",
MiddleOut = "middle-out",
Fallback = "fallback",
}

export interface IHeliconeHeaders {
heliconeAuth: Nullable<string>;
@@ -58,6 +63,7 @@ export interface IHeliconeHeaders {
omitResponse: boolean;
omitRequest: boolean;
};
tokenLimitExceptionHandler: Nullable<HeliconeTokenLimitExceptionHandler>;
sessionHeaders: {
sessionId: Nullable<string>;
path: Nullable<string>;
@@ -125,6 +131,7 @@ export class HeliconeHeaders implements IHeliconeHeaders {
promptName: Nullable<string>;
userId: Nullable<string>;
omitHeaders: { omitResponse: boolean; omitRequest: boolean };
tokenLimitExceptionHandler: Nullable<HeliconeTokenLimitExceptionHandler>;
sessionHeaders: {
sessionId: Nullable<string>;
path: Nullable<string>;
@@ -178,6 +185,8 @@ export class HeliconeHeaders implements IHeliconeHeaders {
};
this.promptName = heliconeHeaders.promptName;
this.omitHeaders = heliconeHeaders.omitHeaders;
this.tokenLimitExceptionHandler =
heliconeHeaders.tokenLimitExceptionHandler;
this.sessionHeaders = heliconeHeaders.sessionHeaders;
this.userId = heliconeHeaders.userId;
this.heliconeProperties = this.getHeliconeProperties(heliconeHeaders);
@@ -364,6 +373,7 @@ export class HeliconeHeaders implements IHeliconeHeaders {
omitResponse: this.headers.get("Helicone-Omit-Response") === "true",
omitRequest: this.headers.get("Helicone-Omit-Request") === "true",
},
tokenLimitExceptionHandler: this.getTokenLimitExceptionHandler(),
sessionHeaders: {
sessionId: this.headers.get("Helicone-Session-Id") ?? null,
path: this.headers.get("Helicone-Session-Path") ?? null,
@@ -403,6 +413,25 @@ export class HeliconeHeaders implements IHeliconeHeaders {
};
}

private getTokenLimitExceptionHandler(): Nullable<HeliconeTokenLimitExceptionHandler> {
const handler = this.headers.get("Helicone-Token-Limit-Exception-Handler");
if (!handler) {
return null;
}

const normalized = handler.toLowerCase();
switch (normalized) {
case HeliconeTokenLimitExceptionHandler.Truncate:
return HeliconeTokenLimitExceptionHandler.Truncate;
case HeliconeTokenLimitExceptionHandler.MiddleOut:
return HeliconeTokenLimitExceptionHandler.MiddleOut;
case HeliconeTokenLimitExceptionHandler.Fallback:
return HeliconeTokenLimitExceptionHandler.Fallback;
default:
return null;
}
}
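// Example: a request sent with `Helicone-Token-Limit-Exception-Handler: Middle-Out`
// is lowercased to "middle-out" and resolves to HeliconeTokenLimitExceptionHandler.MiddleOut;
// unrecognized values fall through to null and no handler is applied downstream.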

private getRetryHeaders(): IHeliconeHeaders["retryHeaders"] {
const retryEnabled = this.headers.get("helicone-retry-enabled");
if (retryEnabled === null) {
worker/src/lib/models/HeliconeProxyRequest.ts (108 additions, 1 deletion)
@@ -5,7 +5,10 @@ import { approvedDomains } from "@helicone-package/cost/providers/mappings";
import { RequestWrapper } from "../RequestWrapper";
import { buildTargetUrl } from "../clients/ProviderClient";
import { Result, ok } from "../util/results";
import { IHeliconeHeaders } from "./HeliconeHeaders";
import {
HeliconeTokenLimitExceptionHandler,
IHeliconeHeaders,
} from "./HeliconeHeaders";

import { parseJSXObject } from "@helicone/prompts";
import { TemplateWithInputs } from "@helicone/prompts/dist/objectParser";
@@ -26,6 +29,16 @@ export type RetryOptions = {
export type HeliconeProperties = Record<string, string>;
type Nullable<T> = T | null;

import {
applyFallbackStrategy,
applyMiddleOutStrategy,
applyTruncateStrategy,
estimateTokenCount,
getModelTokenLimit,
parseRequestPayload,
resolvePrimaryModel,
} from "../util/tokenLimitException";

// This neatly formats and holds all of the state that a request can come into Helicone
export interface HeliconeProxyRequest {
provider: Provider;
@@ -162,6 +175,16 @@ export class HeliconeProxyRequestMapper {
body = await this.request.unsafeGetBodyText();
}

// Apply the token limit exception handler and update the request body buffer if the body changed
const bodyWithTokenLimitExceptionHandler =
this.applyTokenLimitExceptionHandler(body);
if (typeof bodyWithTokenLimitExceptionHandler === "string") {
body = bodyWithTokenLimitExceptionHandler;
await this.request.requestBodyBuffer.tempSetBody(
bodyWithTokenLimitExceptionHandler
);
}
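// Only string results are applied; if the handler returns undefined (no handler
// header, unparseable body, or the request is under the limit) the original body
// and buffer are left untouched.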

return {
data: {
heliconePromptTemplate: await this.getHeliconeTemplate(),
@@ -197,6 +220,90 @@ export class HeliconeProxyRequestMapper {
};
}

public applyTokenLimitExceptionHandler(
body: ValidRequestBody
): ValidRequestBody | undefined {
const handler = this.request.heliconeHeaders.tokenLimitExceptionHandler;
if (!handler) {
return;
}

const parsedBody = parseRequestPayload(body);
if (!parsedBody) {
return;
}

const primaryModel = resolvePrimaryModel(
parsedBody,
this.request.heliconeHeaders.modelOverride
);
const estimatedTokens = estimateTokenCount(parsedBody, primaryModel);

if (!primaryModel) {
return;
}

const modelContextLimit = getModelTokenLimit(this.provider, primaryModel);

// Extract requested completion/output limit directly here (provider-agnostic best-effort)
const anyBody = parsedBody as any;
const completionCandidates: Array<unknown> = [
anyBody?.max_completion_tokens,
anyBody?.max_tokens,
anyBody?.max_output_tokens,
anyBody?.maxOutputTokens,
anyBody?.response?.max_tokens,
anyBody?.response?.max_output_tokens,
anyBody?.response?.maxOutputTokens,
anyBody?.generation_config?.max_output_tokens,
anyBody?.generation_config?.maxOutputTokens,
anyBody?.generationConfig?.max_output_tokens,
anyBody?.generationConfig?.maxOutputTokens,
];
const requestedCompletionTokens = (() => {
for (const val of completionCandidates) {
if (typeof val === "number" && Number.isFinite(val) && val > 0) {
return Math.floor(val);
}
}
return 0;
})();
const tokenLimit =
modelContextLimit === null
? null
: Math.max(
0,
modelContextLimit -
(requestedCompletionTokens || modelContextLimit * 0.1)
);
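// Illustrative arithmetic (hypothetical numbers): for a 128,000-token context model
// with max_tokens: 4096 in the request, tokenLimit = 128,000 - 4,096 = 123,904;
// with no completion limit set, 10% of the context (12,800) is reserved instead,
// leaving 115,200 tokens for the prompt.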

if (
estimatedTokens === null ||
tokenLimit === null ||
(estimatedTokens <= tokenLimit &&
handler !== HeliconeTokenLimitExceptionHandler.Fallback) // fallback must still run even under the limit, to sort out the extra model passed in the request
) {
return;
}

// TODO: Add some indicator as to what was applied so users understand why their request looks different
switch (handler) {
case HeliconeTokenLimitExceptionHandler.Truncate:
return applyTruncateStrategy(parsedBody);
case HeliconeTokenLimitExceptionHandler.MiddleOut:
return applyMiddleOutStrategy(parsedBody, primaryModel, tokenLimit);
case HeliconeTokenLimitExceptionHandler.Fallback:
return applyFallbackStrategy(
parsedBody,
primaryModel,
estimatedTokens,
tokenLimit
);
default:
return;
}
}

private validateApiConfiguration(api_base: string | undefined): boolean {
return (
api_base === undefined ||
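For reference, a minimal client-side sketch of opting into one of these strategies through the proxy. Only the Helicone-Token-Limit-Exception-Handler header comes from this change; the base URL, model name, keys, and payload shape are illustrative assumptions.

// Hypothetical client-side call (not part of this diff); keys, URL, and model are placeholders.
const longConversation = [{ role: "user", content: "..." }]; // imagine many prior messages here
const response = await fetch("https://oai.helicone.ai/v1/chat/completions", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
    "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
    // Opt into the middle-out strategy added by this change
    "Helicone-Token-Limit-Exception-Handler": "middle-out",
  },
  body: JSON.stringify({
    model: "gpt-4o-mini",
    messages: longConversation,
  }),
});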