Commit
Merge pull request #93 from intelligentnode/92-integrate-llamacpp
92 integrate llamacpp
intelligentnode authored Feb 19, 2025
2 parents b0ac6c0 + 6644908 commit 6d0c444
Showing 10 changed files with 1,116 additions and 44 deletions.
1 change: 1 addition & 0 deletions PIPREADME.md
@@ -18,6 +18,7 @@ pip install intelli
```

# Latest changes
- Support llama.cpp and GGUF models for fast local inference.
- Add DeepSeek and Llama 3 integration [doc](https://docs.intellinode.ai/docs/python/chatbot/nvidia-chat).
- Add offline speech-to-text with Whisper [doc](https://docs.intellinode.ai/docs/python/offline-chatbot/whisper).
- Add Anthropic Claude 3.5 as a chatbot provider.
3 changes: 3 additions & 0 deletions README.md
@@ -129,6 +129,9 @@ results = wrapper.generate_images(image_input)
## Keras Agent
Load Gemma or Mistral models offline using the Keras agent, [check the docs](https://docs.intellinode.ai/docs/python/flows/kagent).

## GGUF Models
llama.cpp provides an efficient way to run language models locally, with support for models in the **GGUF** format, [check the docs](https://docs.intellinode.ai/docs/python/offline-chatbot/llamacpp).
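For illustration, a minimal sketch of chatting with a local GGUF model through the `llamacpp` provider; the model path, context size, and the `ChatModelInput` constructor arguments are placeholders and assumptions rather than recommendations:

```python
from intelli.function.chatbot import Chatbot, ChatProvider
from intelli.model.input.chatbot_input import ChatModelInput

# Placeholder path to a GGUF model file downloaded beforehand.
options = {
    "model_path": "./models/example-model.Q4_K_M.gguf",
    "model_params": {"n_ctx": 2048},
}

# No API key is required for local inference.
bot = Chatbot(provider=ChatProvider.LLAMACPP.value, options=options)

# Constructor arguments below are assumptions about ChatModelInput.
chat_input = ChatModelInput("You are a helpful assistant.", model=None, max_tokens=120)
chat_input.add_user_message("Summarize what the GGUF format is in one sentence.")

# For the llamacpp provider, chat() returns a single-element list of generated text.
response = bot.chat(chat_input)
print(response[0])
```

The same `Chatbot` instance also works with `stream()` for chunk-by-chunk output.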

# Repository Setup
1. Install the requirements.
```shell
6 changes: 6 additions & 0 deletions instructions/run_integration_text.sh
@@ -20,6 +20,9 @@ python3 -m unittest intelli.test.integration.test_googleai_wrapper
# anthropic
python3 -m unittest intelli.test.integration.test_anthropic_wrapper

# wrapper with llama.cpp
pytest -s intelli/test/integration/test_llama_cpp_wrapper.py

## controllers
# embedding
python3 -m unittest intelli.test.integration.test_remote_embed_model
@@ -43,6 +46,9 @@ python3 -m unittest intelli.test.integration.test_azure_chatbot
# chatbot with data
python3 -m unittest intelli.test.integration.test_chatbot_with_data

# chatbot with llama.cpp
pytest -s intelli/test/integration/test_chatbot_cpp.py

## flows
# basic flow
python3 -m unittest intelli.test.integration.test_flow_sequence
150 changes: 107 additions & 43 deletions intelli/function/chatbot.py
@@ -9,22 +9,26 @@
from intelli.wrappers.anthropic_wrapper import AnthropicWrapper
from intelli.wrappers.keras_wrapper import KerasWrapper
from intelli.wrappers.nvidia_wrapper import NvidiaWrapper
from intelli.wrappers.llama_cpp_wrapper import IntelliLlamaCPPWrapper
from enum import Enum


class ChatProvider(Enum):
OPENAI = "openai"
GEMINI = "gemini"
MISTRAL = "mistral"
ANTHROPIC = "anthropic"
KERAS = "keras"
NVIDIA = "nvidia"
LLAMACPP = "llamacpp"


class Chatbot:

def __init__(self, api_key=None, provider=None, options=None):
if options is None:
options = {}

self.api_key = api_key
self.provider = self._get_provider(provider)
self.options = options
@@ -36,11 +40,14 @@ def __init__(self, api_key=None, provider=None, options=None):
print("Please obtain NVIDIA API Key from https://build.nvidia.com/")

def add_rag(self, options):
self.extended_search = IntellicloudWrapper(options['one_key'],
options.get('api_base', None)) if 'one_key' in options else None

self.extended_search = (
IntellicloudWrapper(options["one_key"], options.get("api_base", None))
if "one_key" in options
else None
)

def _get_provider(self, provider):

if isinstance(provider, str):
provider = provider.lower()
if provider not in (p.value for p in ChatProvider):
@@ -50,10 +57,10 @@ def _get_provider(self, provider):
return provider.value
else:
raise ValueError(f"Unsupported provider: {provider}")

def _initialize_provider(self):
if self.provider == ChatProvider.OPENAI.value:
proxy_helper = self.options.get('proxy_helper', None)
proxy_helper = self.options.get("proxy_helper", None)
return OpenAIWrapper(self.api_key, proxy_helper=proxy_helper)
elif self.provider == ChatProvider.MISTRAL.value:
return MistralAIWrapper(self.api_key)
@@ -62,7 +69,9 @@ def _initialize_provider(self):
elif self.provider == ChatProvider.ANTHROPIC.value:
return AnthropicWrapper(self.api_key)
elif self.provider == ChatProvider.KERAS.value:
return KerasWrapper(self.options['model_name'], self.options.get('model_params', {}))
return KerasWrapper(
self.options["model_name"], self.options.get("model_params", {})
)
elif self.provider == ChatProvider.NVIDIA.value:
nvidia_options = self.options.get("nvidiaOptions", {})
base_url = self.options.get("baseUrl", {})
@@ -72,6 +81,13 @@ def _initialize_provider(self):
return NvidiaWrapper(self.api_key, base_url=base_url)
else:
return NvidiaWrapper(self.api_key)
elif self.provider == ChatProvider.LLAMACPP.value:
# assume options has "model_path" and optionally "model_params"
model_path = self.options.get("model_path")
model_params = self.options.get("model_params", {"n_ctx": 512})
return IntelliLlamaCPPWrapper(
model_path=model_path, model_params=model_params
)
else:
raise ValueError(f"Unsupported provider: {self.provider}")
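A short sketch of the options contract the `llamacpp` branch above reads; only `model_path` and `model_params` are consumed here, and `n_ctx` falls back to 512 when `model_params` is not supplied (the path below is a placeholder):

```python
from intelli.function.chatbot import Chatbot

options = {
    "model_path": "./models/example-model.Q4_K_M.gguf",  # placeholder GGUF path
    "model_params": {"n_ctx": 1024},                      # omit to get the {"n_ctx": 512} default
}

# api_key is unused for the local llamacpp provider.
bot = Chatbot(api_key=None, provider="llamacpp", options=options)
```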

@@ -86,14 +102,26 @@ def chat(self, chat_input):
get_input_method = f"get_{self.provider}_input"
chat_method = getattr(self, f"_chat_{self.provider}", None)
if not chat_method:
raise NotImplementedError(f"{self.provider.capitalize()} chat is not implemented.")
raise NotImplementedError(
f"{self.provider.capitalize()} chat is not implemented."
)

params = getattr(chat_input, get_input_method)()
result = chat_method(params)
return {'result': result, 'references': references} if chat_input.attach_reference else result
return (
{"result": result, "references": references}
if chat_input.attach_reference
else result
)

def _chat_llamacpp(self, params):
# assume the wrapper returns a dict with key "choices" containing a list of text responses.
response = self.wrapper.generate_text(params)
# extract the text.
return [response["choices"][0]["text"]]
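For reference, a sketch of the completion payload shape `_chat_llamacpp` assumes and the extraction it performs (the text value is illustrative):

```python
# Illustrative llama.cpp-style completion payload, as _chat_llamacpp expects it.
response = {
    "choices": [
        {"text": "GGUF is a binary format for packaging quantized model weights."}
    ]
}

# The method wraps the first choice's text in a single-element list.
result = [response["choices"][0]["text"]]
assert result[0].startswith("GGUF")
```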

def _chat_keras(self, params):
response = self.wrapper.generate(params['prompt'], params['max_length'])
response = self.wrapper.generate(params["prompt"], params["max_length"])
return [response]

def _chat_openai(self, params):
@@ -102,7 +130,7 @@ def _chat_openai(self, params):

def _chat_mistral(self, params):
response = self.wrapper.generate_text(params)
return [choice['message']['content'] for choice in response.get('choices', [])]
return [choice["message"]["content"] for choice in response.get("choices", [])]

def _chat_gemini(self, params):
response = self.wrapper.generate_content(params)
@@ -117,8 +145,8 @@ def _chat_gemini(self, params):
def _chat_anthropic(self, params):
response = self.wrapper.generate_text(params)

return [message['text'] for message in response['content']]
return [message["text"] for message in response["content"]]

def _chat_nvidia(self, params):
result = self.wrapper.generate_text(params)
choices = result.get("choices", [])
@@ -132,7 +160,9 @@ def stream(self, chat_input):
streaming_method = getattr(self, f"_stream_{self.provider}", None)

if not streaming_method:
raise NotImplementedError(f"Streaming is not implemented for {self.provider}.")
raise NotImplementedError(
f"Streaming is not implemented for {self.provider}."
)

if self.extended_search:
_ = self._augment_with_semantic_search(chat_input)
@@ -142,38 +172,55 @@
for content in streaming_method(params):
yield content

def _stream_llamacpp(self, params):
params["stream"] = True
# Stream text chunks from the llama-cpp wrapper.
for chunk in self.wrapper.generate_text_stream(params):
yield chunk
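A usage sketch for this streaming path, assuming the same placeholder GGUF path and the same assumed `ChatModelInput` constructor arguments as the earlier examples:

```python
from intelli.function.chatbot import Chatbot
from intelli.model.input.chatbot_input import ChatModelInput

# Placeholder model path; no API key needed for local inference.
bot = Chatbot(provider="llamacpp", options={"model_path": "./models/example-model.Q4_K_M.gguf"})

chat_input = ChatModelInput("You are a helpful assistant.", model=None, max_tokens=120)
chat_input.add_user_message("Explain GGUF quantization in one sentence.")

# stream() yields text chunks as the llama.cpp wrapper produces them.
for chunk in bot.stream(chat_input):
    print(chunk, end="", flush=True)
```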

def _stream_openai(self, params):
"""
Private helper method to stream text from OpenAI and parse each content chunk.
"""
params['stream'] = True
params["stream"] = True
for response in self.wrapper.generate_chat_text(params):
if response.strip() and response.startswith('data: ') and response != 'data: [DONE]':
json_content = response[len('data: '):].strip()
if (
response.strip()
and response.startswith("data: ")
and response != "data: [DONE]"
):
json_content = response[len("data: ") :].strip()

try:
data_chunk = json.loads(json_content)
content = data_chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
content = (
data_chunk.get("choices", [{}])[0]
.get("delta", {})
.get("content", "")
)
if content:
yield content
except json.JSONDecodeError as e:
print("Error decoding JSON:", e)

def _stream_anthropic(self, params):
"""Stream text from Anthropic and directly yield text content."""
params['stream'] = True
params["stream"] = True

for line in self.wrapper.stream_text(params):
# process lines starting with 'data:'
if line.startswith("data:"):
try:

json_payload = line[len("data:"):]
json_payload = line[len("data:") :]
line_data = json.loads(json_payload)

if 'type' in line_data and line_data['type'] == 'content_block_delta' and 'text' in line_data[
'delta']:
yield line_data['delta']['text']
if (
"type" in line_data
and line_data["type"] == "content_block_delta"
and "text" in line_data["delta"]
):
yield line_data["delta"]["text"]
except json.JSONDecodeError as e:
print("Error decoding JSON from stream:", e)

@@ -182,52 +229,69 @@ def _stream_nvidia(self, params):
stream = self.wrapper.generate_text_stream(params)
for line in stream:
if line.strip() and line.startswith("data: ") and line != "data: [DONE]":
json_content = line[len("data: "):].strip()
json_content = line[len("data: ") :].strip()
try:
data_chunk = json.loads(json_content)
content = data_chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
content = (
data_chunk.get("choices", [{}])[0]
.get("delta", {})
.get("content", "")
)
if content:
yield content
except json.JSONDecodeError as e:
print("Error decoding JSON:", e)

# helpers
def _parse_openai_responses(self, results):
responses = []
for choice in results.get('choices', []):
response = choice.get('message', {}).get('content', '')
if choice.get('finish_reason') == 'function_call' and 'function_call' in choice.get('message', {}):
response['function_call'] = choice['message']['function_call']
for choice in results.get("choices", []):
response = choice.get("message", {}).get("content", "")
if choice.get(
"finish_reason"
) == "function_call" and "function_call" in choice.get("message", {}):
response["function_call"] = choice["message"]["function_call"]
responses.append(response)
return responses

def _augment_with_semantic_search(self, chat_input):
last_user_message = chat_input.messages[-1].content if chat_input.messages else ""
last_user_message = (
chat_input.messages[-1].content if chat_input.messages else ""
)
references = []
if last_user_message:
# Perform the semantic search based on the last user message.
filters = {'document_name': chat_input.doc_name} if chat_input.doc_name else None
search_results = self.extended_search.semantic_search(last_user_message, chat_input.search_k,
filters=filters)
filters = (
{"document_name": chat_input.doc_name} if chat_input.doc_name else None
)
search_results = self.extended_search.semantic_search(
last_user_message, chat_input.search_k, filters=filters
)

# Accumulate document names from the search results for references.
references = {}
for doc in search_results:
doc_name = doc['document_name']
doc_name = doc["document_name"]
if doc_name not in references:
references[doc_name] = {'pages': []}
references[doc_name] = {"pages": []}
# Assuming each 'doc' can contain multiple 'pages' or similar structures, adjust as necessary.
references[doc_name]['pages'].extend(doc.get('pages', []))
references[doc_name]["pages"].extend(doc.get("pages", []))

# Generate context data based on the semantic search results.
context_data = '\n'.join(
data['text'] for doc in search_results for data in doc['data'] if 'text' in data
context_data = "\n".join(
data["text"]
for doc in search_results
for data in doc["data"]
if "text" in data
).strip()

# Load the static prompt template for an augmented chatbot response.
augmented_message_template = self.system_helper.load_static_prompt("augmented_chatbot")
augmented_message = augmented_message_template.replace("${semantic_search}", context_data).replace(
"${user_query}", last_user_message)
augmented_message_template = self.system_helper.load_static_prompt(
"augmented_chatbot"
)
augmented_message = augmented_message_template.replace(
"${semantic_search}", context_data
).replace("${user_query}", last_user_message)

# Replace the content of the last user message with the augmented message in the ChatModelInput.
chat_input.messages[-1].content = augmented_message
31 changes: 31 additions & 0 deletions intelli/model/input/chatbot_input.py
@@ -143,3 +143,34 @@ def get_nvidia_input(self):
}
return params

def get_llamacpp_input(self):
"""
Create an input prompt for llama.cpp.
This method concatenates the conversation messages into a plain text prompt.
It prefixes system, user, and assistant messages, and ends with an 'Assistant:' prompt.
Returns:
A dictionary with keys:
- prompt: the final text prompt,
- max_tokens: maximum tokens to generate,
- temperature: sampling temperature,
plus any additional options.
"""
prompt = ""
for msg in self.messages:
if msg.role == 'system':
prompt += f"System: {msg.content}\n"
elif msg.role == 'user':
prompt += f"User: {msg.content}\n"
elif msg.role == 'assistant':
prompt += f"Assistant: {msg.content}\n"
if not prompt.endswith("Assistant: "):
prompt += "Assistant: "
params = {
"prompt": prompt,
"max_tokens": self.max_tokens or 180,
"temperature": self.temperature,
**self.options
}
return params
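A sketch of the prompt this method builds for a short conversation, assuming the `ChatModelInput` constructor registers the system text as a message with role `system` and that `add_user_message` exists; the commented output mirrors the concatenation logic above:

```python
from intelli.model.input.chatbot_input import ChatModelInput

# Constructor arguments are assumptions about ChatModelInput.
chat_input = ChatModelInput("You are a helpful assistant.", model=None, max_tokens=64)
chat_input.add_user_message("What is GGUF?")

params = chat_input.get_llamacpp_input()
# params["prompt"] is expected to read:
#   System: You are a helpful assistant.
#   User: What is GGUF?
#   Assistant: 
# params also carries max_tokens (64 here; 180 when unset) and temperature.
print(params["prompt"])
```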
