Commit bb824da

Add Together and AzureOpenAI examples (sgl-project#184)

merrymercy authored Feb 12, 2024
1 parent 9312132 commit bb824da

Showing 8 changed files with 262 additions and 15 deletions.
2 changes: 1 addition & 1 deletion examples/quick_start/anthropic_example_chat.py
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
76 changes: 76 additions & 0 deletions examples/quick_start/azure_openai_example_chat.py
@@ -0,0 +1,76 @@
"""
Usage:
export AZURE_OPENAI_API_KEY=sk-******
python3 openai_example_chat.py
"""
import sglang as sgl
import os


@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)

for m in state.messages():
print(m["role"], ":", m["content"])

print("\n-- answer_1 --\n", state["answer_1"])


def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)

for out in state.text_iter():
print(out, end="", flush=True)
print()


def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},

{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])

for s in states:
print(s.messages())


if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="azure-gpt-4",
api_version="2023-07-01-preview",
azure_endpoint="https://oai-arena-sweden.openai.azure.com/",
api_key=os.environ["AZURE_OPENAI_API_KEY"],
is_azure=True,
)
sgl.set_default_backend(backend)

# Run a single request
print("\n========== single ==========\n")
single()

# Stream output
print("\n========== stream ==========\n")
stream()

# Run a batch of requests
print("\n========== batch ==========\n")
batch()
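Note: a minimal sketch of the same backend setup that leans on the Azure
client's environment-variable defaults instead of explicit kwargs. The
variable names (AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, OPENAI_API_VERSION)
are what recent openai-python releases read; treat them as an assumption and
check your installed version.

    import sglang as sgl

    # Credentials, endpoint, and API version come from the environment.
    backend = sgl.OpenAI(
        model_name="azure-gpt-4",  # your Azure deployment name
        is_azure=True,
    )
    sgl.set_default_backend(backend)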
2 changes: 1 addition & 1 deletion examples/quick_start/gemini_example_chat.py
@@ -23,7 +23,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
2 changes: 1 addition & 1 deletion examples/quick_start/openai_example_chat.py
@@ -24,7 +24,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
2 changes: 1 addition & 1 deletion examples/quick_start/srt_example_chat.py
@@ -22,7 +22,7 @@ def single():
     for m in state.messages():
         print(m["role"], ":", m["content"])
 
-    print("answer_1", state["answer_1"])
+    print("\n-- answer_1 --\n", state["answer_1"])
 
 
 def stream():
74 changes: 74 additions & 0 deletions examples/quick_start/together_example_chat.py
@@ -0,0 +1,74 @@
"""
Usage:
export TOGETHER_API_KEY=sk-******
python3 together_example_chat.py
"""
import sglang as sgl
import os


@sgl.function
def multi_turn_question(s, question_1, question_2):
s += sgl.system("You are a helpful assistant.")
s += sgl.user(question_1)
s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
s += sgl.user(question_2)
s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))


def single():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
)

for m in state.messages():
print(m["role"], ":", m["content"])

print("\n-- answer_1 --\n", state["answer_1"])


def stream():
state = multi_turn_question.run(
question_1="What is the capital of the United States?",
question_2="List two local attractions.",
stream=True
)

for out in state.text_iter():
print(out, end="", flush=True)
print()


def batch():
states = multi_turn_question.run_batch([
{"question_1": "What is the capital of the United States?",
"question_2": "List two local attractions."},

{"question_1": "What is the capital of France?",
"question_2": "What is the population of this city?"},
])

for s in states:
print(s.messages())


if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
base_url="https://api.together.xyz/v1",
api_key=os.environ.get("TOGETHER_API_KEY"),
)
sgl.set_default_backend(backend)

# Run a single request
print("\n========== single ==========\n")
single()

# Stream output
print("\n========== stream ==========\n")
stream()

# Run a batch of requests
print("\n========== batch ==========\n")
batch()
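Note: nothing above is Mixtral-specific. Any chat model served through
Together's OpenAI-compatible endpoint should drop in the same way; the model
name below is an assumption, so check Together's current model catalog.

    backend = sgl.OpenAI(
        model_name="meta-llama/Llama-2-70b-chat-hf",  # hypothetical substitute
        base_url="https://api.together.xyz/v1",
        api_key=os.environ.get("TOGETHER_API_KEY"),
    )
    sgl.set_default_backend(backend)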
74 changes: 74 additions & 0 deletions examples/quick_start/together_example_complete.py
@@ -0,0 +1,74 @@
"""
Usage:
export TOGETHER_API_KEY=sk-******
python3 together_example_complete.py
"""

import sglang as sgl
import os


@sgl.function
def few_shot_qa(s, question):
s += (
"""The following are questions with answers.
Q: What is the capital of France?
A: Paris
Q: What is the capital of Germany?
A: Berlin
Q: What is the capital of Italy?
A: Rome
""")
s += "Q: " + question + "\n"
s += "A:" + sgl.gen("answer", stop="\n", temperature=0)


def single():
state = few_shot_qa.run(question="What is the capital of the United States?")
answer = state["answer"].strip().lower()

assert "washington" in answer, f"answer: {state['answer']}"

print(state.text())


def stream():
state = few_shot_qa.run(
question="What is the capital of the United States?",
stream=True)

for out in state.text_iter("answer"):
print(out, end="", flush=True)
print()


def batch():
states = few_shot_qa.run_batch([
{"question": "What is the capital of the United States?"},
{"question": "What is the capital of China?"},
])

for s in states:
print(s["answer"])


if __name__ == "__main__":
backend = sgl.OpenAI(
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
is_chat_model=False,
base_url="https://api.together.xyz/v1",
api_key=os.environ.get("TOGETHER_API_KEY"),
)
sgl.set_default_backend(backend)

# Run a single request
print("\n========== single ==========\n")
single()

# Stream output
print("\n========== stream ==========\n")
stream()

# Run a batch of requests
print("\n========== batch ==========\n")
batch()
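Note: is_chat_model=False makes the backend send the raw few-shot text to the
completions endpoint rather than packaging it as chat messages, which is why
this example builds its own Q/A prompt. If the automatic template lookup by
model path ever picks an unwanted template, it can be pinned explicitly; a
sketch, where using the "default" template here is an assumption to verify:

    from sglang.lang.chat_template import get_chat_template

    backend = sgl.OpenAI(
        model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
        is_chat_model=False,
        chat_template=get_chat_template("default"),
        base_url="https://api.together.xyz/v1",
        api_key=os.environ.get("TOGETHER_API_KEY"),
    )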
45 changes: 34 additions & 11 deletions python/sglang/backend/openai.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 from sglang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template
+from sglang.lang.chat_template import get_chat_template_by_model_path, ChatTemplate
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
@@ -41,23 +41,39 @@ def create_logit_bias_int(tokenizer):
 
 
 class OpenAI(BaseBackend):
-    def __init__(self, model_name, *args, **kwargs):
+    def __init__(self, model_name: str,
+                 is_chat_model: Optional[bool] = None,
+                 chat_template: Optional[ChatTemplate] = None,
+                 is_azure: bool = False,
+                 *args, **kwargs):
         super().__init__()
 
         if isinstance(openai, Exception):
             raise openai
 
-        self.client = openai.OpenAI(*args, **kwargs)
+        if is_azure:
+            self.client = openai.AzureOpenAI(*args, **kwargs)
+        else:
+            self.client = openai.OpenAI(*args, **kwargs)
 
         self.model_name = model_name
-        self.tokenizer = tiktoken.encoding_for_model(model_name)
+        try:
+            self.tokenizer = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            self.tokenizer = tiktoken.get_encoding("cl100k_base")
         self.logit_bias_int = create_logit_bias_int(self.tokenizer)
 
-        if model_name in INSTRUCT_MODEL_NAMES:
-            self.is_chat_model = False
+        self.chat_template = chat_template or get_chat_template_by_model_path(model_name)
+
+        if is_chat_model is not None:
+            self.is_chat_model = is_chat_model
         else:
-            self.is_chat_model = True
+            if model_name in INSTRUCT_MODEL_NAMES:
+                self.is_chat_model = False
+            else:
+                self.is_chat_model = True
 
-        self.chat_template = get_chat_template("default")
+        self.chat_begin_str = self.chat_template.role_prefix_and_suffix["assistant"][0]
 
     def get_chat_template(self):
         return self.chat_template
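Note: chat_begin_str is the assistant role prefix taken from the active chat
template, so the "sgl.gen must follow sgl.assistant" check below now works for
any template instead of only the hard-coded "ASSISTANT:" string. A sketch of
the lookup, with `backend` a hypothetical instance name (the exact prefix
string depends on the template):

    prefix, _suffix = backend.chat_template.role_prefix_and_suffix["assistant"]
    # generate() refuses to run unless the buffered text ends with `prefix`.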
@@ -69,7 +85,7 @@ def generate(
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if not s.text_.endswith("ASSISTANT:"):
+                if not s.text_.endswith(self.chat_begin_str):
                     raise RuntimeError(
                         "This use case is not supported. "
                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -122,7 +138,11 @@ def generate_stream(
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                assert s.text_.endswith("ASSISTANT:")
+                if not s.text_.endswith(self.chat_begin_str):
+                    raise RuntimeError(
+                        "This use case is not supported. "
+                        "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
+                    )
                 prompt = s.messages_
             else:
                 prompt = s.text_
@@ -241,7 +261,10 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
             messages=prompt, stream=True, **kwargs
         )
         for ret in generator:
-            content = ret.choices[0].delta.content
+            try:
+                content = ret.choices[0].delta.content
+            except IndexError:
+                content = None
             yield content or "", {}
     else:
         generator = client.completions.create(
