Adding a llama.cpp LLM Component #1052

Open

wants to merge 43 commits into main
Changes from 11 commits

Commits (43)
397f7b8
First commit of llamacpp Opea component
edlee123 Dec 20, 2024
cb4f5e5
Removed unneeded requirements file
edlee123 Dec 20, 2024
df3d943
Merge branch 'main' into llamacpp
edlee123 Dec 20, 2024
8893f38
Merge branch 'main' into llamacpp
edlee123 Dec 28, 2024
2a48bae
Pin the llama.cpp server version, and fix small typo
edlee123 Jan 6, 2025
644ecce
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Jan 6, 2025
4e82152
Update README.md to describe hardware support, and provide reference.
edlee123 Jan 6, 2025
baf381d
Updated docker_compose_llm.yaml so that the llamacpp-server so the pu…
edlee123 Jan 6, 2025
7bab970
Merge branch 'main' into llamacpp
edlee123 Jan 6, 2025
e4f4b70
Merge branch 'main' into llamacpp
edlee123 Jan 7, 2025
9d7539d
Small adjustments to README.md
edlee123 Jan 7, 2025
2cf25e5
Merge branch 'main' into llamacpp
edlee123 Jan 8, 2025
fd15ee7
This removes unneeded dependencies in the Dockerfile, unneeded entryp…
edlee123 Jan 10, 2025
666196c
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Jan 10, 2025
104527a
Merge branch 'main' into llamacpp
edlee123 Jan 10, 2025
c931902
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 10, 2025
6b98403
Merge branch 'main' into llamacpp
edlee123 Jan 24, 2025
240d3d1
Merge branch 'main' into llamacpp
edlee123 Feb 3, 2025
91e0fd4
Merge branch 'main' into llamacpp
edlee123 Feb 14, 2025
a75d28d
Refactored llama cpp and text-generation README_llamacpp.md
edlee123 Feb 14, 2025
830da58
Delete unrefactored files
edlee123 Feb 14, 2025
8d058bb
Adding llama.cpp backend include in the compose_text-genearation.yaml
edlee123 Feb 14, 2025
a0294a5
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 14, 2025
a6740b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 14, 2025
d0e27bf
Fix service name
edlee123 Feb 21, 2025
91324af
Revise llamacpp, using smaller Qwen model and remove unnecessary curl…
edlee123 Feb 21, 2025
f295e29
Update llamacpp thirdparty readme to use smaller model
edlee123 Feb 21, 2025
480cb69
Fix healthcheck in llamacpp deployment compose.yaml
edlee123 Feb 21, 2025
2c9f877
Wrote a test and tested for llamacpp text gen service
edlee123 Feb 21, 2025
f3147f1
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 21, 2025
7310d6a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 21, 2025
80ed9b0
Merge branch 'main' into llamacpp
edlee123 Feb 21, 2025
efde309
Increase the llamacpp-server wait time
edlee123 Feb 21, 2025
1a7db52
Merge branch 'llamacpp' of github.com:edlee123/GenAIComps into llamacpp
edlee123 Feb 21, 2025
c474a64
Fixed typos on http environment variables, and volumes
edlee123 Feb 21, 2025
712f575
Splitting the llama.cpp test to use compose up on the llama.cpp third…
edlee123 Feb 21, 2025
68cc00f
add alternate command to stop and remove docker containers from previ…
edlee123 Feb 22, 2025
2dd2064
Modifying tear down of stop_docker in llamacpp tests to try to remove…
edlee123 Feb 22, 2025
dbff6fc
Adding some logs output to debug llamacpp test
edlee123 Feb 22, 2025
f184897
Found model path bug and fixed it to run llama.cpp test
edlee123 Feb 22, 2025
ea4ea38
Adjusted LLM_ENDPOINT env variable
edlee123 Feb 22, 2025
01fca03
Cleaned up test file
edlee123 Feb 22, 2025
dfd5057
Adjust host_ip env variable in scope of start_service
edlee123 Feb 22, 2025
27 changes: 27 additions & 0 deletions comps/llms/text-generation/llamacpp/Dockerfile
@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
curl \
libgl1-mesa-glx \
libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

USER user

# Assumes we're building from the GenAIComps directory.
COPY ../../../comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/text-generation/llamacpp/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/text-generation/llamacpp/

ENTRYPOINT ["bash", "entrypoint.sh"]
88 changes: 88 additions & 0 deletions comps/llms/text-generation/llamacpp/README.md
@@ -0,0 +1,88 @@
# Introduction

[llama.cpp](https://github.com/ggerganov/llama.cpp) provides inference in pure C/C++, and enables "LLM inference with minimal setup and state-of-the-art performance on a wide range of hardware - locally and in the cloud".

This OPEA component wraps the llama.cpp server so that it can interface with other OPEA components or be used to build OPEA Megaservices.

llama.cpp supports a wide range of [hardware](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#supported-backends); this OPEA component has only been tested on CPU.

To use a CUDA server, please refer to [this llama.cpp reference](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#docker) and modify docker_compose_llm.yaml accordingly.

## TLDR

```bash
cd GenAIComps/
docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up
```

Please note that it is instructive to run and validate the llama.cpp server and the OPEA component individually, as described below.

## 1. Run the llama.cpp server

```bash
cd GenAIComps
docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-server --force-recreate
```

Notes:

i) If you prefer to run the above in the background without screen output, use `up -d`. The `--force-recreate` flag clears the cache.

ii) To tear down the llama.cpp server and remove the container:

`docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml down llamacpp-server`

iii) To configure [llama.cpp settings](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md), specify them in the docker_compose_llm.yaml file.

#### Verify the llama.cpp Service:

```bash
curl --request POST \
--url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}'
```
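
Alternatively, the server's OpenAI-compatible API can be exercised from Python. This is a minimal sketch, assuming the `openai` package is installed and the server from docker_compose_llm.yaml is reachable on localhost:8080:

```python
# Sketch: query the llama.cpp server through its OpenAI-compatible endpoint.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

completion = client.chat.completions.create(
    # llama.cpp serves the single model it was started with, so the name is not used for routing.
    model="phi-3-mini-4k-instruct",
    messages=[{"role": "user", "content": "Building a website can be done in 10 simple steps:"}],
    max_tokens=128,
)
print(completion.choices[0].message.content)
```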

## 2. Run the llama.cpp OPEA Service

This is essentially a wrapper component around the llama.cpp server. OPEA standardizes and validates LLM inputs using the LLMParamsDoc class (see llm.py).
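
For illustration, a minimal sketch of the standardized input, using the LLMParamsDoc fields referenced in llm.py (any omitted fields fall back to their defaults):

```python
# Sketch: construct the standardized request that the wrapper validates.
from comps import LLMParamsDoc

params = LLMParamsDoc(
    query="What is Deep Learning?",
    max_tokens=32,
    top_p=0.95,
    temperature=0.01,
    streaming=False,
)
print(params)
```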

### 2.1 Build and run the llama.cpp OPEA service:

```bash
cd GenAIComps/
docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml up llamacpp-opea-llm
```

Equivalently, the above can be achieved with `docker build` and `docker run` using the Dockerfile. Build:

```bash
cd GenAIComps/
docker build --no-cache -t opea/llm-llamacpp:latest \
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \
-f comps/llms/text-generation/llamacpp/Dockerfile .
```

And run:

```bash
docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy \
opea/llm-llamacpp:latest
```

### 2.2 Consume the llama.cpp Microservice:

```bash
curl http://127.0.0.1:9000/v1/chat/completions -X POST \
-d '{"query":"What is Deep Learning?","max_tokens":32,"top_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
-H 'Content-Type: application/json'
```
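
Or equivalently from Python, as a minimal sketch assuming the service is reachable on localhost:9000 and the `requests` package is installed:

```python
# Sketch: call the OPEA llama.cpp microservice with the same payload as the curl command above.
import requests

payload = {
    "query": "What is Deep Learning?",
    "max_tokens": 32,
    "top_p": 0.95,
    "temperature": 0.01,
    "repetition_penalty": 1.03,
    "streaming": False,
}
response = requests.post(
    "http://127.0.0.1:9000/v1/chat/completions",
    json=payload,
    timeout=120,
)
print(response.json())
```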

### Notes

Tearing down services and removing containers:

```bash
cd GenAIComps/
docker compose -f comps/llms/text-generation/llamacpp/docker_compose_llm.yaml down
```
2 changes: 2 additions & 0 deletions comps/llms/text-generation/llamacpp/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
39 changes: 39 additions & 0 deletions comps/llms/text-generation/llamacpp/docker_compose_llm.yaml
@@ -0,0 +1,39 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
  llamacpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server-b4419
    ports:
      - 8080:8080
    environment:
      # Refer to settings here: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md
      # Llama.cpp uses the .gguf format, and Hugging Face offers many .gguf models.
      LLAMA_ARG_MODEL_URL: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf
      LLAMA_ARG_CTX_SIZE: 4096
      LLAMA_ARG_N_PARALLEL: 2
      LLAMA_ARG_ENDPOINT_METRICS: 1
      LLAMA_ARG_PORT: 8080

  llamacpp-opea-llm:
    image: opea/llm-llamacpp:latest
    build:
      # Set the build context so that the Dockerfile can COPY comps.
      # The context is the GenAIComps root, four levels up from docker_compose_llm.yaml.
      context: ../../../../
      dockerfile: ./comps/llms/text-generation/llamacpp/Dockerfile
    depends_on:
      - llamacpp-server
    ports:
      - "9000:9000"
    network_mode: "host" # equivalent to: docker run --network host ...
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      # LLAMACPP_ENDPOINT: ${LLAMACPP_ENDPOINT}
    restart: unless-stopped

networks:
  default:
    driver: bridge
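
The llama.cpp server downloads the model on first start, which can take a while. A small readiness check such as the sketch below can be handy before exercising the OPEA wrapper; it assumes the `/health` endpoint described in the llama.cpp server README:

```python
# Sketch: poll the llamacpp-server health endpoint until the model is loaded.
# Assumes the llama.cpp server's /health endpoint returns HTTP 200 when ready.
import time

import requests


def wait_for_llamacpp(url: str = "http://localhost:8080/health", timeout_s: int = 600) -> None:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                print("llama.cpp server is ready")
                return
        except requests.RequestException:
            pass  # server not accepting connections yet; keep polling
        time.sleep(5)
    raise TimeoutError(f"llama.cpp server not ready after {timeout_s}s")


if __name__ == "__main__":
    wait_for_llamacpp()
```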
8 changes: 8 additions & 0 deletions comps/llms/text-generation/llamacpp/entrypoint.sh
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# pip --no-cache-dir install -r requirements-runtime.txt

python llm.py
65 changes: 65 additions & 0 deletions comps/llms/text-generation/llamacpp/llm.py
@@ -0,0 +1,65 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

import openai
from fastapi.responses import StreamingResponse

from comps import CustomLogger, LLMParamsDoc, ServiceType, opea_microservices, register_microservice

logger = CustomLogger("llm_llamacpp")
logflag = os.getenv("LOGFLAG", False)
llamacpp_endpoint = os.getenv("LLAMACPP_ENDPOINT", "http://localhost:8080/")


# OPEA microservice wrapper of llama.cpp
# llama.cpp server uses openai API format: https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md
@register_microservice(
    name="opea_service@llm_llamacpp",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: LLMParamsDoc):
    if logflag:
        logger.info(input)
        logger.info(llamacpp_endpoint)

    client = openai.OpenAI(
        base_url=llamacpp_endpoint, api_key="sk-no-key-required"  # "http://<Your api-server IP>:port"
    )

    # Llama.cpp works with openai API format
    # The openai api doesn't have top_k parameter
    # https://community.openai.com/t/which-openai-gpt-models-if-any-allow-specifying-top-k/777982/2
    chat_completion = client.chat.completions.create(
        model=input.model,
        messages=[{"role": "user", "content": input.query}],
        max_tokens=input.max_tokens,
        temperature=input.temperature,
        top_p=input.top_p,
        frequency_penalty=input.frequency_penalty,
        presence_penalty=input.presence_penalty,
        stream=input.streaming,
    )

    if input.streaming:

        def stream_generator():
            for c in chat_completion:
                if logflag:
                    logger.info(c)
                yield f"data: {c.model_dump_json()}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        if logflag:
            logger.info(chat_completion)
        return chat_completion


if __name__ == "__main__":
    opea_microservices["opea_service@llm_llamacpp"].start()
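
For completeness, a minimal sketch of how a client might consume the streaming branch above, assuming the microservice is running on localhost:9000 and `requests` is installed:

```python
# Sketch: read the SSE stream produced by the streaming branch of llm_generate.
import json

import requests

payload = {"query": "What is Deep Learning?", "max_tokens": 64, "streaming": True}
with requests.post("http://127.0.0.1:9000/v1/chat/completions", json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: ") :]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each chunk is an OpenAI-format streaming chunk forwarded by the wrapper.
        print(chunk["choices"][0]["delta"].get("content") or "", end="", flush=True)
```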
12 changes: 12 additions & 0 deletions comps/llms/text-generation/llamacpp/requirements.txt
@@ -0,0 +1,12 @@
aiohttp
docarray[full]
fastapi
huggingface_hub
openai
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn