Commit
Merge pull request #93 from intelligentnode/92-integrate-llamacpp
92 integrate llamacpp
intelligentnode authored Feb 19, 2025
2 parents b0ac6c0 + 6644908 commit 6d0c444
Showing 10 changed files with 1,116 additions and 44 deletions.
1 change: 1 addition & 0 deletions PIPREADME.md
@@ -18,6 +18,7 @@ pip install intelli
```

# Latest changes
- Support llama.cpp and GGUF models for fast local inference.
- Add DeepSeek and Llama 3 integration [doc](https://docs.intellinode.ai/docs/python/chatbot/nvidia-chat).
- Add offline speech-to-text with Whisper [doc](https://docs.intellinode.ai/docs/python/offline-chatbot/whisper).
- Add Anthropic Claude 3.5 as a chatbot provider.
3 changes: 3 additions & 0 deletions README.md
@@ -129,6 +129,9 @@ results = wrapper.generate_images(image_input)
## Keras Agent
Load Gemma or Mistral models offline using the Keras agent, [check the docs](https://docs.intellinode.ai/docs/python/flows/kagent).

## GGUF Models
llama.cpp provides an efficient way to run language models locally, with support for models in the **GGUF** format, [check the docs](https://docs.intellinode.ai/docs/python/offline-chatbot/llamacpp).
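For illustration, a minimal sketch of chatting with a local GGUF model through the `llamacpp` provider; the model path, context size, and the `ChatModelInput` constructor arguments are placeholders and assumptions rather than recommendations:

```python
from intelli.function.chatbot import Chatbot, ChatProvider
from intelli.model.input.chatbot_input import ChatModelInput

# Placeholder path to a GGUF model file downloaded beforehand.
options = {
    "model_path": "./models/example-model.Q4_K_M.gguf",
    "model_params": {"n_ctx": 2048},
}

# No API key is required for local inference.
bot = Chatbot(provider=ChatProvider.LLAMACPP.value, options=options)

# Constructor arguments below are assumptions about ChatModelInput.
chat_input = ChatModelInput("You are a helpful assistant.", model=None, max_tokens=120)
chat_input.add_user_message("Summarize what the GGUF format is in one sentence.")

# For the llamacpp provider, chat() returns a single-element list of generated text.
response = bot.chat(chat_input)
print(response[0])
```

The same `Chatbot` instance also works with `stream()` for chunk-by-chunk output.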

# Repository Setup
1. Install the requirements.
```shell
6 changes: 6 additions & 0 deletions instructions/run_integration_text.sh
@@ -20,6 +20,9 @@ python3 -m unittest intelli.test.integration.test_googleai_wrapper
# anthropic
python3 -m unittest intelli.test.integration.test_anthropic_wrapper

# wrapper with llama.cpp
pytest -s intelli/test/integration/test_llama_cpp_wrapper.py

## controllers
# embedding
python3 -m unittest intelli.test.integration.test_remote_embed_model
@@ -43,6 +46,9 @@ python3 -m unittest intelli.test.integration.test_azure_chatbot
# chatbot with data
python3 -m unittest intelli.test.integration.test_chatbot_with_data

# chatbot with llama.cpp
pytest -s intelli/test/integration/test_chatbot_cpp.py

## flows
# basic flow
python3 -m unittest intelli.test.integration.test_flow_sequence
150 changes: 107 additions & 43 deletions intelli/function/chatbot.py
@@ -9,22 +9,26 @@
from intelli.wrappers.anthropic_wrapper import AnthropicWrapper
from intelli.wrappers.keras_wrapper import KerasWrapper
from intelli.wrappers.nvidia_wrapper import NvidiaWrapper
from intelli.wrappers.llama_cpp_wrapper import IntelliLlamaCPPWrapper
from enum import Enum


class ChatProvider(Enum):
OPENAI = "openai"
GEMINI = "gemini"
MISTRAL = "mistral"
ANTHROPIC = "anthropic"
KERAS = "keras"
NVIDIA = "nvidia"
LLAMACPP = "llamacpp"


class Chatbot:

def __init__(self, api_key=None, provider=None, options=None):
if options is None:
options = {}

self.api_key = api_key
self.provider = self._get_provider(provider)
self.options = options
@@ -36,11 +40,14 @@ def __init__(self, api_key=None, provider=None, options=None):
print("Please obtain NVIDIA API Key from https://build.nvidia.com/")

def add_rag(self, options):
self.extended_search = IntellicloudWrapper(options['one_key'],
options.get('api_base', None)) if 'one_key' in options else None

self.extended_search = (
IntellicloudWrapper(options["one_key"], options.get("api_base", None))
if "one_key" in options
else None
)

def _get_provider(self, provider):

if isinstance(provider, str):
provider = provider.lower()
if provider not in (p.value for p in ChatProvider):
@@ -50,10 +57,10 @@ def _get_provider(self, provider):
return provider.value
else:
raise ValueError(f"Unsupported provider: {provider}")

def _initialize_provider(self):
if self.provider == ChatProvider.OPENAI.value:
proxy_helper = self.options.get('proxy_helper', None)
proxy_helper = self.options.get("proxy_helper", None)
return OpenAIWrapper(self.api_key, proxy_helper=proxy_helper)
elif self.provider == ChatProvider.MISTRAL.value:
return MistralAIWrapper(self.api_key)
@@ -62,7 +69,9 @@ def _initialize_provider(self):
elif self.provider == ChatProvider.ANTHROPIC.value:
return AnthropicWrapper(self.api_key)
elif self.provider == ChatProvider.KERAS.value:
return KerasWrapper(self.options['model_name'], self.options.get('model_params', {}))
return KerasWrapper(
self.options["model_name"], self.options.get("model_params", {})
)
elif self.provider == ChatProvider.NVIDIA.value:
nvidia_options = self.options.get("nvidiaOptions", {})
base_url = self.options.get("baseUrl", {})
@@ -72,6 +81,13 @@ def _initialize_provider(self):
return NvidiaWrapper(self.api_key, base_url=base_url)
else:
return NvidiaWrapper(self.api_key)
elif self.provider == ChatProvider.LLAMACPP.value:
# assume options has "model_path" and optionally "model_params"
model_path = self.options.get("model_path")
model_params = self.options.get("model_params", {"n_ctx": 512})
return IntelliLlamaCPPWrapper(
model_path=model_path, model_params=model_params
)
else:
raise ValueError(f"Unsupported provider: {self.provider}")
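A short sketch of the options contract the `llamacpp` branch above reads; only `model_path` and `model_params` are consumed here, and `n_ctx` falls back to 512 when `model_params` is not supplied (the path below is a placeholder):

```python
from intelli.function.chatbot import Chatbot

options = {
    "model_path": "./models/example-model.Q4_K_M.gguf",  # placeholder GGUF path
    "model_params": {"n_ctx": 1024},                      # omit to get the {"n_ctx": 512} default
}

# api_key is unused for the local llamacpp provider.
bot = Chatbot(api_key=None, provider="llamacpp", options=options)
```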

@@ -86,14 +102,26 @@ def chat(self, chat_input):
get_input_method = f"get_{self.provider}_input"
chat_method = getattr(self, f"_chat_{self.provider}", None)
if not chat_method:
raise NotImplementedError(f"{self.provider.capitalize()} chat is not implemented.")
raise NotImplementedError(
f"{self.provider.capitalize()} chat is not implemented."
)

params = getattr(chat_input, get_input_method)()
result = chat_method(params)
return {'result': result, 'references': references} if chat_input.attach_reference else result
return (
{"result": result, "references": references}
if chat_input.attach_reference
else result
)

def _chat_llamacpp(self, params):
# assume the wrapper returns a dict with key "choices" containing a list of text responses.
response = self.wrapper.generate_text(params)
# extract the text.
return [response["choices"][0]["text"]]
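For reference, a sketch of the completion payload shape `_chat_llamacpp` assumes and the extraction it performs (the text value is illustrative):

```python
# Illustrative llama.cpp-style completion payload, as _chat_llamacpp expects it.
response = {
    "choices": [
        {"text": "GGUF is a binary format for packaging quantized model weights."}
    ]
}

# The method wraps the first choice's text in a single-element list.
result = [response["choices"][0]["text"]]
assert result[0].startswith("GGUF")
```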

def _chat_keras(self, params):
response = self.wrapper.generate(params['prompt'], params['max_length'])
response = self.wrapper.generate(params["prompt"], params["max_length"])
return [response]

def _chat_openai(self, params):
@@ -102,7 +130,7 @@ def _chat_openai(self, params):

def _chat_mistral(self, params):
response = self.wrapper.generate_text(params)
return [choice['message']['content'] for choice in response.get('choices', [])]
return [choice["message"]["content"] for choice in response.get("choices", [])]

def _chat_gemini(self, params):
response = self.wrapper.generate_content(params)
@@ -117,8 +145,8 @@ def _chat_gemini(self, params):
def _chat_anthropic(self, params):
response = self.wrapper.generate_text(params)

return [message['text'] for message in response['content']]
return [message["text"] for message in response["content"]]

def _chat_nvidia(self, params):
result = self.wrapper.generate_text(params)
choices = result.get("choices", [])
@@ -132,7 +160,9 @@ def stream(self, chat_input):
streaming_method = getattr(self, f"_stream_{self.provider}", None)

if not streaming_method:
raise NotImplementedError(f"Streaming is not implemented for {self.provider}.")
raise NotImplementedError(
f"Streaming is not implemented for {self.provider}."
)

if self.extended_search:
_ = self._augment_with_semantic_search(chat_input)
@@ -142,38 +172,55 @@
for content in streaming_method(params):
yield content

def _stream_llamacpp(self, params):
params["stream"] = True
# Stream text chunks from the llama-cpp wrapper.
for chunk in self.wrapper.generate_text_stream(params):
yield chunk
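A usage sketch for this streaming path, assuming the same placeholder GGUF path and the same assumed `ChatModelInput` constructor arguments as the earlier examples:

```python
from intelli.function.chatbot import Chatbot
from intelli.model.input.chatbot_input import ChatModelInput

# Placeholder model path; no API key needed for local inference.
bot = Chatbot(provider="llamacpp", options={"model_path": "./models/example-model.Q4_K_M.gguf"})

chat_input = ChatModelInput("You are a helpful assistant.", model=None, max_tokens=120)
chat_input.add_user_message("Explain GGUF quantization in one sentence.")

# stream() yields text chunks as the llama.cpp wrapper produces them.
for chunk in bot.stream(chat_input):
    print(chunk, end="", flush=True)
```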

def _stream_openai(self, params):
"""
Private helper method to stream text from OpenAI and parse each content chunk.
"""
params['stream'] = True
params["stream"] = True
for response in self.wrapper.generate_chat_text(params):
if response.strip() and response.startswith('data: ') and response != 'data: [DONE]':
json_content = response[len('data: '):].strip()
if (
response.strip()
and response.startswith("data: ")
and response != "data: [DONE]"
):
json_content = response[len("data: ") :].strip()

try:
data_chunk = json.loads(json_content)
content = data_chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
content = (
data_chunk.get("choices", [{}])[0]
.get("delta", {})
.get("content", "")
)
if content:
yield content
except json.JSONDecodeError as e:
print("Error decoding JSON:", e)

def _stream_anthropic(self, params):
"""Stream text from Anthropic and directly yield text content."""
params['stream'] = True
params["stream"] = True

for line in self.wrapper.stream_text(params):
# process lines starting with 'data:'
if line.startswith("data:"):
try:

json_payload = line[len("data:"):]
json_payload = line[len("data:") :]
line_data = json.loads(json_payload)

if 'type' in line_data and line_data['type'] == 'content_block_delta' and 'text' in line_data[
'delta']:
yield line_data['delta']['text']
if (
"type" in line_data
and line_data["type"] == "content_block_delta"
and "text" in line_data["delta"]
):
yield line_data["delta"]["text"]
except json.JSONDecodeError as e:
print("Error decoding JSON from stream:", e)

@@ -182,52 +229,69 @@ def _stream_nvidia(self, params):
stream = self.wrapper.generate_text_stream(params)
for line in stream:
if line.strip() and line.startswith("data: ") and line != "data: [DONE]":
json_content = line[len("data: "):].strip()
json_content = line[len("data: ") :].strip()
try:
data_chunk = json.loads(json_content)
content = data_chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
content = (
data_chunk.get("choices", [{}])[0]
.get("delta", {})
.get("content", "")
)
if content:
yield content
except json.JSONDecodeError as e:
print("Error decoding JSON:", e)

# helpers
def _parse_openai_responses(self, results):
responses = []
for choice in results.get('choices', []):
response = choice.get('message', {}).get('content', '')
if choice.get('finish_reason') == 'function_call' and 'function_call' in choice.get('message', {}):
response['function_call'] = choice['message']['function_call']
for choice in results.get("choices", []):
response = choice.get("message", {}).get("content", "")
if choice.get(
"finish_reason"
) == "function_call" and "function_call" in choice.get("message", {}):
response["function_call"] = choice["message"]["function_call"]
responses.append(response)
return responses

def _augment_with_semantic_search(self, chat_input):
last_user_message = chat_input.messages[-1].content if chat_input.messages else ""
last_user_message = (
chat_input.messages[-1].content if chat_input.messages else ""
)
references = []
if last_user_message:
# Perform the semantic search based on the last user message.
filters = {'document_name': chat_input.doc_name} if chat_input.doc_name else None
search_results = self.extended_search.semantic_search(last_user_message, chat_input.search_k,
filters=filters)
filters = (
{"document_name": chat_input.doc_name} if chat_input.doc_name else None
)
search_results = self.extended_search.semantic_search(
last_user_message, chat_input.search_k, filters=filters
)

# Accumulate document names from the search results for references.
references = {}
for doc in search_results:
doc_name = doc['document_name']
doc_name = doc["document_name"]
if doc_name not in references:
references[doc_name] = {'pages': []}
references[doc_name] = {"pages": []}
# Assuming each 'doc' can contain multiple 'pages' or similar structures, adjust as necessary.
references[doc_name]['pages'].extend(doc.get('pages', []))
references[doc_name]["pages"].extend(doc.get("pages", []))

# Generate context data based on the semantic search results.
context_data = '\n'.join(
data['text'] for doc in search_results for data in doc['data'] if 'text' in data
context_data = "\n".join(
data["text"]
for doc in search_results
for data in doc["data"]
if "text" in data
).strip()

# Load the static prompt template for an augmented chatbot response.
augmented_message_template = self.system_helper.load_static_prompt("augmented_chatbot")
augmented_message = augmented_message_template.replace("${semantic_search}", context_data).replace(
"${user_query}", last_user_message)
augmented_message_template = self.system_helper.load_static_prompt(
"augmented_chatbot"
)
augmented_message = augmented_message_template.replace(
"${semantic_search}", context_data
).replace("${user_query}", last_user_message)

# Replace the content of the last user message with the augmented message in the ChatModelInput.
chat_input.messages[-1].content = augmented_message
31 changes: 31 additions & 0 deletions intelli/model/input/chatbot_input.py
@@ -143,3 +143,34 @@ def get_nvidia_input(self):
}
return params

def get_llamacpp_input(self):
"""
Create an input prompt for llama.cpp.
This method concatenates the conversation messages into a plain text prompt.
It prefixes system, user, and assistant messages, and ends with an 'Assistant:' prompt.
Returns:
A dictionary with keys:
- prompt: the final text prompt,
- max_tokens: maximum tokens to generate,
- temperature: sampling temperature,
plus any additional options.
"""
prompt = ""
for msg in self.messages:
if msg.role == 'system':
prompt += f"System: {msg.content}\n"
elif msg.role == 'user':
prompt += f"User: {msg.content}\n"
elif msg.role == 'assistant':
prompt += f"Assistant: {msg.content}\n"
if not prompt.endswith("Assistant: "):
prompt += "Assistant: "
params = {
"prompt": prompt,
"max_tokens": self.max_tokens or 180,
"temperature": self.temperature,
**self.options
}
return params
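A sketch of the prompt this method builds for a short conversation, assuming the `ChatModelInput` constructor registers the system text as a message with role `system` and that `add_user_message` exists; the commented output mirrors the concatenation logic above:

```python
from intelli.model.input.chatbot_input import ChatModelInput

# Constructor arguments are assumptions about ChatModelInput.
chat_input = ChatModelInput("You are a helpful assistant.", model=None, max_tokens=64)
chat_input.add_user_message("What is GGUF?")

params = chat_input.get_llamacpp_input()
# params["prompt"] is expected to read:
#   System: You are a helpful assistant.
#   User: What is GGUF?
#   Assistant: 
# params also carries max_tokens (64 here; 180 when unset) and temperature.
print(params["prompt"])
```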
