feat: improve OpenAI model APIs and examples to better support audio,…

… images, and tool calling (#707)
hypermodeinc · Jan 22, 2025 · 67bd5cb · 67bd5cb
1 parent 8dca6e5
commit 67bd5cb
Show file tree

Hide file tree

Showing 27 changed files with 3,294 additions and 1,129 deletions.
diff --git a/.trunk/configs/cspell.json b/.trunk/configs/cspell.json
@@ -134,6 +134,7 @@
     "omitif",
     "omitnull",
     "openai",
+    "openspeech",
     "operationreport",
     "PEMS",
     "pgconn",
@@ -175,6 +176,7 @@
     "textgeneration",
     "tidwall",
     "tinygo",
+    "toolcalling",
     "tseslint",
     "tsrv",
     "typedarray",

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@
 - chore: remove localHypermodeModels list and handle 404s properly instead in local dev
   [#703](https://github.com/hypermodeinc/modus/pull/703)
 - fix: remove fallback to default time zone [#706](https://github.com/hypermodeinc/modus/pull/706)
+- feat: improve OpenAI model APIs and examples to better support audio, images, and tool calling
+  [#707](https://github.com/hypermodeinc/modus/pull/707)
 
 ## 2025-01-09 - CLI 0.16.6
 

diff --git a/sdk/assemblyscript/examples/textgeneration/assembly/index.ts b/sdk/assemblyscript/examples/textgeneration/assembly/index.ts
@@ -4,109 +4,10 @@
  * See the LICENSE file that accompanied this code for further details.
  */
 
-import { JSON } from "json-as";
-import { models } from "@hypermode/modus-sdk-as";
-import { Product, sampleProductJson } from "./product";
+// The examples have been split into separate files for clarity.
+// See each file for more details about the specific example.
 
-import {
-  OpenAIChatModel,
-  ResponseFormat,
-  SystemMessage,
-  UserMessage,
-} from "@hypermode/modus-sdk-as/models/openai/chat";
-
-// In this example, we will generate text using the OpenAI Chat model.
-// See https://platform.openai.com/docs/api-reference/chat/create for more details
-// about the options available on the model, which you can set on the input object.
-
-// This model name should match the one defined in the modus.json manifest file.
-const modelName: string = "text-generator";
-
-// This function generates some text based on the instruction and prompt provided.
-export function generateText(instruction: string, prompt: string): string {
-  // The imported ChatModel interface follows the OpenAI Chat completion model input format.
-  const model = models.getModel<OpenAIChatModel>(modelName);
-  const input = model.createInput([
-    new SystemMessage(instruction),
-    new UserMessage(prompt),
-  ]);
-
-  // This is one of many optional parameters available for the OpenAI Chat model.
-  input.temperature = 0.7;
-
-  // Here we invoke the model with the input we created.
-  const output = model.invoke(input);
-
-  // The output is also specific to the ChatModel interface.
-  // Here we return the trimmed content of the first choice.
-  return output.choices[0].message.content.trim();
-}
-
-// This function generates a single product.
-export function generateProduct(category: string): Product {
-  // We can get creative with the instruction and prompt to guide the model
-  // in generating the desired output.  Here we provide a sample JSON of the
-  // object we want the model to generate.
-  const instruction = `Generate a product for the category provided.
-Only respond with valid JSON object in this format:
-${sampleProductJson}`;
-  const prompt = `The category is "${category}".`;
-
-  // Set up the input for the model, creating messages for the instruction and prompt.
-  const model = models.getModel<OpenAIChatModel>(modelName);
-  const input = model.createInput([
-    new SystemMessage(instruction),
-    new UserMessage(prompt),
-  ]);
-
-  // Let's increase the temperature to get more creative responses.
-  // Be careful though, if the temperature is too high, the model may generate invalid JSON.
-  input.temperature = 1.2;
-
-  // This model also has a response format parameter that can be set to JSON,
-  // Which, along with the instruction, can help guide the model in generating valid JSON output.
-  input.responseFormat = ResponseFormat.Json;
-
-  // Here we invoke the model with the input we created.
-  const output = model.invoke(input);
-
-  // The output should contain the JSON string we asked for.
-  const json = output.choices[0].message.content.trim();
-  const product = JSON.parse<Product>(json);
-  return product;
-}
-
-// This function generates multiple products.
-export function generateProducts(category: string, quantity: i32): Product[] {
-  // Similar to the previous example above, we can tailor the instruction and prompt
-  // to guide the model in generating the desired output.  Note that understanding the behavior
-  // of the model is important to get the desired results.  In this case, we need the model
-  // to return an _object_ containing an array, not an array of objects directly.  That's because
-  // the model will not reliably generate an array of objects directly.
-  const instruction = `Generate ${quantity} products for the category provided.
-Only respond with valid JSON object containing a valid JSON array named 'list', in this format:
-{"list":[${sampleProductJson}]}`;
-  const prompt = `The category is "${category}".`;
-
-  // Set up the input for the model, creating messages for the instruction and prompt.
-  const model = models.getModel<OpenAIChatModel>(modelName);
-  const input = model.createInput([
-    new SystemMessage(instruction),
-    new UserMessage(prompt),
-  ]);
-
-  // Adjust the model inputs, just like in the previous example.
-  // Be careful, if the temperature is too high, the model may generate invalid JSON.
-  input.temperature = 1.2;
-  input.responseFormat = ResponseFormat.Json;
-
-  // Here we invoke the model with the input we created.
-  const output = model.invoke(input);
-
-  // The output should contain the JSON string we asked for.
-  const json = output.choices[0].message.content.trim();
-
-  // We can parse that JSON to a compatible object, to get the data we're looking for.
-  const results = JSON.parse<Map<string, Product[]>>(json);
-  return results.get("list");
-}
+export * from "./simple";
+export * from "./products";
+export * from "./media";
+export * from "./toolcalling";
diff --git a/sdk/assemblyscript/examples/textgeneration/assembly/media.ts b/sdk/assemblyscript/examples/textgeneration/assembly/media.ts
@@ -0,0 +1,181 @@
+/*
+ * This example is part of the Modus project, licensed under the Apache License 2.0.
+ * You may modify and use this example in accordance with the license.
+ * See the LICENSE file that accompanied this code for further details.
+ */
+
+import { models, http } from "@hypermode/modus-sdk-as";
+
+import {
+  OpenAIChatModel,
+  DeveloperMessage,
+  SystemMessage,
+  UserMessage,
+  TextContentPart,
+  AudioContentPart,
+  ImageContentPart,
+  Image,
+  Audio,
+} from "@hypermode/modus-sdk-as/models/openai/chat";
+
+// These examples demonstrate how to use audio or image data with OpenAI chat models.
+// Currently, audio can be used for input or output, but images can be used only for input.
+
+/**
+ * This type is used in these examples to represent images or audio.
+ */
+class Media {
+  // The content type of the media.
+  contentType!: string;
+
+  // The binary data of the media.
+  // This value will be base64-encoded when used in an API response.
+  data!: Uint8Array;
+
+  // A text description or transcription of the media.
+  text!: string;
+}
+
+/**
+ * This function generates an audio response based on the instruction and prompt provided.
+ */
+export function generateAudio(instruction: string, prompt: string): Media {
+  // Note, this is similar to the generateText example, but with audio output requested.
+
+  // We'll generate the audio using an audio-enabled OpenAI chat model.
+  const model = models.getModel<OpenAIChatModel>("audio-model");
+
+  const input = model.createInput([
+    new SystemMessage(instruction),
+    new UserMessage(prompt),
+  ]);
+
+  input.temperature = 0.7;
+
+  // Request audio output from the model.
+  // Note, this is a convenience method that requests audio modality and sets the voice and format.
+  // You can also set these values manually on the input object, if you prefer.
+  input.requestAudioOutput("ash", "wav");
+
+  const output = model.invoke(input);
+
+  // Return the audio and its transcription.
+  // Note that the message's content field will be empty for audio responses.
+  // Instead, the text will be in the message.audio.transcript field.
+  const audio = output.choices[0].message.audio!;
+
+  const media = <Media>{
+    contentType: "audio/wav",
+    data: audio.data,
+    text: audio.transcript.trim(),
+  };
+
+  return media;
+}
+
+/**
+ * This function generates text that describes the image at the provided url.
+ * In this example the image url is passed to the model, and the model retrieves the image.
+ */
+export function describeImage(url: string): string {
+  // Note that because the model retrieves the image, any URL can be used.
+  // However, this means that there is a risk of sending data to an unauthorized host, if the URL is not hardcoded or sanitized.
+  // See the describeRandomImage function below for a safer approach.
+
+  const model = models.getModel<OpenAIChatModel>("text-generator");
+
+  const input = model.createInput([
+    UserMessage.fromParts([
+      new TextContentPart("Describe this image."),
+      new ImageContentPart(Image.fromURL(url)),
+    ]),
+  ]);
+
+  const output = model.invoke(input);
+
+  return output.choices[0].message.content.trim();
+}
+
+/**
+ * This function fetches a random image, and then generates text that describes it.
+ * In this example the image is retrieved by the function before passing it as data to the model.
+ */
+export function describeRandomImage(): Media {
+  // Because this approach fetches the image directly, it is safer than the describeImage function above.
+  // The host URL is allow-listed in the modus.json file, so we can trust the image source.
+
+  // Fetch a random image from the Picsum API.  We'll just hardcode the size to make the demo simple to call.
+  const response = http.fetch("https://picsum.photos/640/480");
+  const data = Uint8Array.wrap(response.body);
+  const contentType = response.headers.get("Content-Type")!;
+
+  // Describe the image using the OpenAI chat model.
+  const model = models.getModel<OpenAIChatModel>("text-generator");
+
+  const input = model.createInput([
+    UserMessage.fromParts([
+      new TextContentPart("Describe this image."),
+      new ImageContentPart(Image.fromData(data, contentType)),
+    ]),
+  ]);
+
+  input.temperature = 0.7;
+
+  const output = model.invoke(input);
+
+  // Return the image and its generated description.
+  const text = output.choices[0].message.content.trim();
+  const media = <Media>{
+    contentType,
+    data,
+    text,
+  };
+
+  return media;
+}
+
+/**
+ * This function fetches a random "Harvard Sentences" speech file from OpenSpeech, and then generates a transcript from it.
+ * The sentences are from https://www.cs.columbia.edu/~hgs/audio/harvard.html
+ */
+export function transcribeRandomSpeech(): Media {
+  // Pick a random file number from the list of files available here:
+  // https://www.voiptroubleshooter.com/open_speech/american.html
+  const numbers: i32[] = [
+    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 30, 31, 32, 34, 35, 36, 37, 38, 39,
+    40, 57, 58, 59, 60, 61,
+  ];
+  const num = numbers[<i32>(Math.random() * numbers.length)];
+
+  // Fetch the speech file corresponding to the number.
+  const url = `https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_00${num}_8k.wav`;
+  const response = http.fetch(url);
+  const data = Uint8Array.wrap(response.body);
+
+  // Transcribe the audio using an audio-enabled OpenAI chat model.
+  const model = models.getModel<OpenAIChatModel>("audio-model");
+
+  const input = model.createInput([
+    new DeveloperMessage(
+      "Do not include any newlines or surrounding quotation marks in the response. Omit any explanation beyond the request.",
+    ),
+    UserMessage.fromParts([
+      new TextContentPart(
+        "Provide an exact transcription of the contents of this audio file.",
+      ),
+      new AudioContentPart(Audio.fromData(data, "wav")),
+    ]),
+  ]);
+
+  const output = model.invoke(input);
+
+  // Return the audio file and its transcript.
+  const text = output.choices[0].message.content.trim();
+  const media = <Media>{
+    contentType: "audio/wav",
+    data,
+    text,
+  };
+
+  return media;
+}
diff --git a/sdk/assemblyscript/examples/textgeneration/assembly/product.ts b/sdk/assemblyscript/examples/textgeneration/assembly/product.ts