worker/src/lib/models/HeliconeHeaders.ts (29 additions, 0 deletions)
@@ -12,6 +12,11 @@ export type HeliconeFallback = {
};

export type HeliconeBearerKeyType = "standard" | "rate-limited";
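// Strategy for handling requests whose estimated prompt exceeds the model's token
// limit; parsed from the `Helicone-Token-Limit-Exception-Handler` header (see below).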
export enum HeliconeTokenLimitExceptionHandler {
Truncate = "truncate",
MiddleOut = "middle-out",
Fallback = "fallback",
}

export interface IHeliconeHeaders {
heliconeAuth: Nullable<string>;
@@ -58,6 +63,7 @@ export interface IHeliconeHeaders {
omitResponse: boolean;
omitRequest: boolean;
};
tokenLimitExceptionHandler: Nullable<HeliconeTokenLimitExceptionHandler>;
sessionHeaders: {
sessionId: Nullable<string>;
path: Nullable<string>;
@@ -125,6 +131,7 @@ export class HeliconeHeaders implements IHeliconeHeaders {
promptName: Nullable<string>;
userId: Nullable<string>;
omitHeaders: { omitResponse: boolean; omitRequest: boolean };
tokenLimitExceptionHandler: Nullable<HeliconeTokenLimitExceptionHandler>;
sessionHeaders: {
sessionId: Nullable<string>;
path: Nullable<string>;
@@ -178,6 +185,8 @@ export class HeliconeHeaders implements IHeliconeHeaders {
};
this.promptName = heliconeHeaders.promptName;
this.omitHeaders = heliconeHeaders.omitHeaders;
this.tokenLimitExceptionHandler =
heliconeHeaders.tokenLimitExceptionHandler;
this.sessionHeaders = heliconeHeaders.sessionHeaders;
this.userId = heliconeHeaders.userId;
this.heliconeProperties = this.getHeliconeProperties(heliconeHeaders);
@@ -364,6 +373,7 @@ export class HeliconeHeaders implements IHeliconeHeaders {
omitResponse: this.headers.get("Helicone-Omit-Response") === "true",
omitRequest: this.headers.get("Helicone-Omit-Request") === "true",
},
tokenLimitExceptionHandler: this.getTokenLimitExceptionHandler(),
sessionHeaders: {
sessionId: this.headers.get("Helicone-Session-Id") ?? null,
path: this.headers.get("Helicone-Session-Path") ?? null,
@@ -403,6 +413,25 @@ export class HeliconeHeaders implements IHeliconeHeaders {
};
}

private getTokenLimitExceptionHandler(): Nullable<HeliconeTokenLimitExceptionHandler> {
const handler = this.headers.get("Helicone-Token-Limit-Exception-Handler");
if (!handler) {
return null;
}

const normalized = handler.toLowerCase();
switch (normalized) {
case HeliconeTokenLimitExceptionHandler.Truncate:
return HeliconeTokenLimitExceptionHandler.Truncate;
case HeliconeTokenLimitExceptionHandler.MiddleOut:
return HeliconeTokenLimitExceptionHandler.MiddleOut;
case HeliconeTokenLimitExceptionHandler.Fallback:
return HeliconeTokenLimitExceptionHandler.Fallback;
default:
return null;
}
}
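// Example: a request sent with `Helicone-Token-Limit-Exception-Handler: Middle-Out`
// is lowercased to "middle-out" and resolves to HeliconeTokenLimitExceptionHandler.MiddleOut;
// unrecognized values fall through to null and no handler is applied downstream.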

private getRetryHeaders(): IHeliconeHeaders["retryHeaders"] {
const retryEnabled = this.headers.get("helicone-retry-enabled");
if (retryEnabled === null) {
worker/src/lib/models/HeliconeProxyRequest.ts (108 additions, 1 deletion)
@@ -5,7 +5,10 @@ import { approvedDomains } from "@helicone-package/cost/providers/mappings";
import { RequestWrapper } from "../RequestWrapper";
import { buildTargetUrl } from "../clients/ProviderClient";
import { Result, ok } from "../util/results";
import { IHeliconeHeaders } from "./HeliconeHeaders";
import {
HeliconeTokenLimitExceptionHandler,
IHeliconeHeaders,
} from "./HeliconeHeaders";

import { parseJSXObject } from "@helicone/prompts";
import { TemplateWithInputs } from "@helicone/prompts/dist/objectParser";
@@ -26,6 +29,16 @@ export type RetryOptions = {
export type HeliconeProperties = Record<string, string>;
type Nullable<T> = T | null;

import {
applyFallbackStrategy,
applyMiddleOutStrategy,
applyTruncateStrategy,
estimateTokenCount,
getModelTokenLimit,
parseRequestPayload,
resolvePrimaryModel,
} from "../util/tokenLimitException";

// This neatly formats and holds all of the state that a request can come into Helicone
export interface HeliconeProxyRequest {
provider: Provider;
@@ -162,6 +175,16 @@ export class HeliconeProxyRequestMapper {
body = await this.request.unsafeGetBodyText();
}

// Apply the token limit exception handler and update the request body buffer if the body changed
const bodyWithTokenLimitExceptionHandler =
this.applyTokenLimitExceptionHandler(body);
if (typeof bodyWithTokenLimitExceptionHandler === "string") {
body = bodyWithTokenLimitExceptionHandler;
await this.request.requestBodyBuffer.tempSetBody(
bodyWithTokenLimitExceptionHandler
);
}
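// Only string results are applied; if the handler returns undefined (no handler
// header, unparseable body, or the request is under the limit) the original body
// and buffer are left untouched.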

return {
data: {
heliconePromptTemplate: await this.getHeliconeTemplate(),
@@ -197,6 +220,90 @@ export class HeliconeProxyRequestMapper {
};
}

public applyTokenLimitExceptionHandler(
body: ValidRequestBody
): ValidRequestBody | undefined {
const handler = this.request.heliconeHeaders.tokenLimitExceptionHandler;
if (!handler) {
return;
}

const parsedBody = parseRequestPayload(body);
if (!parsedBody) {
return;
}

const primaryModel = resolvePrimaryModel(
parsedBody,
this.request.heliconeHeaders.modelOverride
);
const estimatedTokens = estimateTokenCount(parsedBody, primaryModel);

if (!primaryModel) {
return;
}

const modelContextLimit = getModelTokenLimit(this.provider, primaryModel);

// Extract requested completion/output limit directly here (provider-agnostic best-effort)
const anyBody = parsedBody as any;
const completionCandidates: Array<unknown> = [
anyBody?.max_completion_tokens,
anyBody?.max_tokens,
anyBody?.max_output_tokens,
anyBody?.maxOutputTokens,
anyBody?.response?.max_tokens,
anyBody?.response?.max_output_tokens,
anyBody?.response?.maxOutputTokens,
anyBody?.generation_config?.max_output_tokens,
anyBody?.generation_config?.maxOutputTokens,
anyBody?.generationConfig?.max_output_tokens,
anyBody?.generationConfig?.maxOutputTokens,
];
const requestedCompletionTokens = (() => {
for (const val of completionCandidates) {
if (typeof val === "number" && Number.isFinite(val) && val > 0) {
return Math.floor(val);
}
}
return 0;
})();
const tokenLimit =
modelContextLimit === null
? null
: Math.max(
0,
modelContextLimit -
(requestedCompletionTokens || modelContextLimit * 0.1)
);
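// Illustrative arithmetic (hypothetical numbers): for a 128,000-token context model
// with max_tokens: 4096 in the request, tokenLimit = 128,000 - 4,096 = 123,904;
// with no completion limit set, 10% of the context (12,800) is reserved instead,
// leaving 115,200 tokens for the prompt.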

if (
estimatedTokens === null ||
tokenLimit === null ||
(estimatedTokens <= tokenLimit &&
handler !== HeliconeTokenLimitExceptionHandler.Fallback) // fallback must still run even under the limit, to sort out the extra model passed in the request
) {
return;
}

// TODO: Add some indicator as to what was applied so users understand why their request looks different
switch (handler) {
case HeliconeTokenLimitExceptionHandler.Truncate:
return applyTruncateStrategy(parsedBody);
case HeliconeTokenLimitExceptionHandler.MiddleOut:
return applyMiddleOutStrategy(parsedBody, primaryModel, tokenLimit);
case HeliconeTokenLimitExceptionHandler.Fallback:
return applyFallbackStrategy(
parsedBody,
primaryModel,
estimatedTokens,
tokenLimit
);
default:
return;
}
}

private validateApiConfiguration(api_base: string | undefined): boolean {
return (
api_base === undefined ||
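For reference, a minimal client-side sketch of opting into one of these strategies through the proxy. Only the Helicone-Token-Limit-Exception-Handler header comes from this change; the base URL, model name, keys, and payload shape are illustrative assumptions.

// Hypothetical client-side call (not part of this diff); keys, URL, and model are placeholders.
const longConversation = [{ role: "user", content: "..." }]; // imagine many prior messages here
const response = await fetch("https://oai.helicone.ai/v1/chat/completions", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
    "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
    // Opt into the middle-out strategy added by this change
    "Helicone-Token-Limit-Exception-Handler": "middle-out",
  },
  body: JSON.stringify({
    model: "gpt-4o-mini",
    messages: longConversation,
  }),
});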