Merge pull request #12 from jepler/llama_cpp
Add llama.cpp support
jepler authored Sep 24, 2023
2 parents 02de0b3 + 26912b1, commit 1b700aa
Showing 8 changed files with 200 additions and 75 deletions.
README.md: 15 changes (11 additions & 4 deletions)
@@ -25,11 +25,11 @@ Put your OpenAI API key in the platform configuration directory for chap, e.g.,
 
 ## commandline usage
 
-* chap ask "What advice would you give a 20th century human visiting the 21st century for the first time?"
+* `chap ask "What advice would you give a 20th century human visiting the 21st century for the first time?"`
 
-* chap render --last
+* `chap render --last`
 
-* chap import chatgpt-style-chatlog.json
+* `chap import chatgpt-style-chatlog.json` (for files from pionxzh/chatgpt-exporter)
 
 ## interactive terminal usage
 * chap tui
@@ -49,9 +49,16 @@ You can set the "system message" with the `-S` flag.
 
 You can select the text generating backend with the `-b` flag:
 * openai\_chatgpt: the default, paid API, best quality results
-* textgen: Works with https://github.com/oobabooga/text-generation-webui and can run locally with various models, basic and low quality. Needs the server URL in *$configuration_directory/textgen\_url*.
+* llama_cpp: Works with [llama.cpp's http server](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md) and can run locally with various models. Set the server URL with `-B url:...`.
+* textgen: Works with https://github.com/oobabooga/text-generation-webui and can run locally with various models. Needs the server URL in *$configuration_directory/textgen\_url*.
 * lorem: local non-AI lorem generator for testing
 
+## Environment variables
+
+The backend can be set with `CHAP_BACKEND`.
+Backend settings can be set with `CHAP_<backend_name>_<parameter_name>`, with `backend_name` and `parameter_name` all in caps.
+For instance, `CHAP_LLAMA_CPP_URL=http://server.local:8080/completion` changes the default server URL for the llama_cpp back-end.
+
 ## Importing from ChatGPT
 
 The userscript https://github.com/pionxzh/chatgpt-exporter can export chat logs from chat.openai.com in a json format.
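
The environment-variable convention added above maps directly onto backend parameter names. A minimal sketch of that mapping (the helper below is hypothetical, not chap's actual code, and chap's real mechanism is not part of this diff):

import os

def parameter_from_env(backend_name: str, parameter_name: str, default: str) -> str:
    # CHAP_<BACKEND>_<PARAMETER>, all in caps, e.g. CHAP_LLAMA_CPP_URL
    # for the llama_cpp backend's "url" parameter.
    var = f"CHAP_{backend_name.upper()}_{parameter_name.upper()}"
    return os.environ.get(var, default)

# parameter_from_env("llama_cpp", "url", "http://localhost:8080/completion")
# returns CHAP_LLAMA_CPP_URL's value when it is set, the default otherwise.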
src/chap/backends/llama_cpp.py: 95 additions & 0 deletions (new file)
@@ -0,0 +1,95 @@
# SPDX-FileCopyrightText: 2023 Jeff Epler <[email protected]>
#
# SPDX-License-Identifier: MIT

import asyncio
import json
from dataclasses import dataclass

import httpx

from ..session import Assistant, User


class LlamaCpp:
    @dataclass
    class Parameters:
        url: str = "http://localhost:8080/completion"
        """The URL of a llama.cpp server's completion endpoint."""

    def __init__(self):
        self.parameters = self.Parameters()

    system_message = """\
A dialog, where USER interacts with AI. AI is helpful, kind, obedient, honest, and knows its own limits.
"""

    def make_full_query(self, messages, max_query_size):
        # Keep the first (system) message plus the most recent
        # max_query_size messages.
        del messages[1:-max_query_size]
        rows = []
        for m in messages:
            content = (m.content or "").strip()
            if not content:
                continue
            if m.role == "system":
                rows.append(f"ASSISTANT'S RULE: {content}\n")
            elif m.role == "assistant":
                rows.append(f"ASSISTANT: {content}\n")
            elif m.role == "user":
                rows.append(f"USER: {content}")
        rows.append("ASSISTANT: ")
        full_query = ("\n".join(rows)).rstrip()
        return full_query
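
    # Illustrative only: for the default system message, an earlier
    # exchange ("Hello" / "Hi!"), and a new query "What is 2+2?",
    # make_full_query() assembles a prompt like (rule wrapped here to fit):
    #
    #   ASSISTANT'S RULE: A dialog, where USER interacts with AI. AI is
    #   helpful, kind, obedient, honest, and knows its own limits.
    #
    #   USER: Hello
    #   ASSISTANT: Hi!
    #
    #   USER: What is 2+2?
    #   ASSISTANT: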

    async def aask(
        self, session, query, *, max_query_size=5, timeout=60
    ):  # pylint: disable=unused-argument,too-many-locals,too-many-branches
        params = {
            "prompt": self.make_full_query(
                session.session + [User(query)], max_query_size
            ),
            "stream": True,
        }
        new_content = []
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                async with client.stream(
                    "POST",
                    self.parameters.url,
                    json=params,
                ) as response:
                    if response.status_code == 200:
                        async for line in response.aiter_lines():
                            if line.startswith("data:"):
                                data = line.removeprefix("data:").strip()
                                j = json.loads(data)
                                # Guard against a missing "content" field.
                                content = j.get("content") or ""
                                if not new_content:
                                    content = content.lstrip()
                                if content:
                                    new_content.append(content)
                                    yield content
                                if j.get("stop"):
                                    break
                    else:
                        content = f"\nFailed with {response=!r}"
                        new_content.append(content)
                        yield content

        except httpx.HTTPError as e:
            content = f"\nException: {e!r}"
            new_content.append(content)
            yield content

        session.session.extend([User(query), Assistant("".join(new_content))])
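
    # The llama.cpp server streams Server-Sent-Events-style lines, one JSON
    # payload per "data:" line. Illustrative examples of what the loop above
    # parses (field layout assumed from the .get("content") / .get("stop")
    # calls, not from the llama.cpp documentation):
    #   data: {"content": "Hello", "stop": false}
    #   data: {"content": "", "stop": true}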

    def ask(self, session, query, *, max_query_size=5, timeout=60):
        async def drain():
            # aask() is an async generator, which asyncio.run() cannot
            # execute directly; consume it so the session gets updated.
            async for _ in self.aask(
                session, query, max_query_size=max_query_size, timeout=timeout
            ):
                pass

        asyncio.run(drain())
        return session.session[-1].content


def factory():
    """Uses the llama.cpp completion web API"""
    return LlamaCpp()
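
A minimal sketch of driving this backend directly. The stand-in session below assumes only what aask() itself uses, a .session list of objects with .role and .content attributes, and a llama.cpp server listening on the default URL; both are assumptions, not chap's documented API:

import asyncio
from dataclasses import dataclass, field

from chap.backends.llama_cpp import LlamaCpp


@dataclass
class StubMessage:
    role: str
    content: str


@dataclass
class StubSession:
    # aask() only reads and extends this list.
    session: list = field(default_factory=list)


async def demo():
    backend = LlamaCpp()
    sess = StubSession([StubMessage("system", LlamaCpp.system_message)])
    async for token in backend.aask(sess, "Say hello in five words."):
        print(token, end="", flush=True)
    print()


asyncio.run(demo())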
src/chap/backends/lorem.py: 1 addition & 0 deletions

@@ -56,4 +56,5 @@ def ask(
 
 
 def factory():
+    """That just prints 'lorem' text. Useful for testing."""
     return Lorem()
src/chap/backends/openai_chatgpt.py: 1 addition & 0 deletions

@@ -175,4 +175,5 @@ def get_key(cls):
 
 
 def factory():
+    """Uses the OpenAI chat completion API"""
     return ChatGPT()
src/chap/backends/textgen.py: 1 addition & 0 deletions

@@ -134,4 +134,5 @@ def ask(self, session, query, *, max_query_size=5, timeout=60):
 
 
 def factory():
+    """Uses the textgen completion API"""
     return Textgen()
src/chap/commands/cat.py: 2 additions & 3 deletions

@@ -4,11 +4,10 @@
 
 import click
 
-from ..core import uses_existing_session
+from ..core import command_uses_existing_session
 
 
-@click.command
-@uses_existing_session
+@command_uses_existing_session
 @click.option("--no-system", is_flag=True)
 def main(obj, no_system):
     """Print session in plaintext"""
src/chap/commands/render.py: 2 additions & 3 deletions

@@ -7,7 +7,7 @@
 from markdown_it import MarkdownIt
 from rich.markdown import Markdown
 
-from ..core import uses_existing_session
+from ..core import command_uses_existing_session
 
 
 def to_markdown(message):
@@ -25,8 +25,7 @@ def to_markdown(message):
     return m
 
 
-@click.command
-@uses_existing_session
+@command_uses_existing_session
 @click.option("--no-system", is_flag=True)
 def main(obj, no_system):
     """Print session with formatting"""
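
The rename in these two commands suggests that command_uses_existing_session folds the former @click.command / @uses_existing_session stack into a single decorator; the deletion counts bear this out, since both old decorator lines disappear in favor of one. A hedged sketch of how such a consolidated decorator could be written (chap's real implementation in ..core is not part of this diff, and load_last_session is a made-up stand-in):

import functools

import click


def load_last_session():
    # Hypothetical stand-in for chap's session loading.
    return {"messages": []}


def uses_existing_session(f):
    # Stand-in for the original decorator: load the session and pass it
    # to the command as its first argument.
    @functools.wraps(f)
    def wrapper(**kwargs):
        return f(load_last_session(), **kwargs)

    return wrapper


def command_uses_existing_session(f):
    # One decorator equivalent to stacking @click.command on top of
    # @uses_existing_session. functools.wraps copies the function's
    # __dict__ (including click's __click_params__), so @click.option
    # decorators applied below it still register with the command.
    return click.command(uses_existing_session(f))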