
Commit e18e337: Merge branch 'main' into op/layernorm_kernel
Parents: 4dc3b5f + dff774e

177 files changed (+8323, -3095 lines)


.clang-tidy (+1)

@@ -35,6 +35,7 @@ Checks: >
   -cppcoreguidelines-non-private-member-variables-in-classes,
   -cppcoreguidelines-pro-type-reinterpret-cast,
   -cppcoreguidelines-macro-usage,
+  -cppcoreguidelines-owning-memory,


 HeaderFilterRegex: '.*'

.github/workflows/build.yml (+1, -1)

@@ -39,7 +39,7 @@ jobs:
         # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
         build_type: [Release]

-    runs-on: [self-hosted, linux, x64, 1gpu]
+    runs-on: [self-hosted, linux, x64, 1gpu, 32g]

     env:
       BUILD_TYPE: ${{ matrix.build_type }}

.github/workflows/docker.yml (+1, -1)

@@ -8,7 +8,7 @@ on:

 jobs:
   publish_scalellm:
-    runs-on: [self-hosted, linux, x64, 1gpu]
+    runs-on: [self-hosted, linux, x64, 1gpu, 128g]
     steps:
       - uses: olegtarasov/[email protected]
        id: tagName

.github/workflows/format.yml (new file, +36)

@@ -0,0 +1,36 @@
+name: clang-format
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  clang-format:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install clang-format
+        run: |
+          sudo apt-get install -y clang-format colordiff
+
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Run clang-format
+        run: |
+          diff=`git-clang-format --extensions="c,h,m,mm,cc,cp,cpp,c++,cxx,hh,hpp,hxx,inc,cu,cuh,proto,protodevel" --diff --commit ${{ github.event.pull_request.base.sha }}`
+          [ "$diff" = "no modified files to format" ] && exit 0
+          [ "$diff" = "clang-format did not modify any files" ] && exit 0
+
+          printf "\nYou have introduced coding style breakages. You can:\n"
+          echo "1> Fix the errors with git-clang-format:"
+          echo "   git-clang-format --commit ${{ github.event.pull_request.base.sha }}"
+          echo "2> Disable checks on section of the code with:"
+          echo "   // clang-format off"
+          echo "   code"
+          echo "   // clang-format on"
+
+          printf "\n\033[1mSuggested changes:\n\n"
+          echo "$diff" | colordiff
+          exit 1
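
The new workflow only checks the lines a pull request touches. To reproduce the check and apply the fixes locally before pushing, a minimal sketch (assuming `origin/main` is the base branch and `git-clang-format` is installed):

```bash
# Show the formatting diff for your changes, as the CI job does
git-clang-format --diff --commit origin/main

# Apply the suggested formatting in place, then commit the result
git-clang-format --commit origin/main
```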

.gitmodules (+3, -3)

@@ -1,9 +1,9 @@
 [submodule "third_party/pybind11"]
   path = third_party/pybind11
-  url = git@github.com:pybind/pybind11.git
+  url = https://github.com/pybind/pybind11.git
 [submodule "third_party/flashinfer"]
   path = third_party/flashinfer
-  url = git@github.com:vectorch-ai/flashinfer.git
+  url = https://github.com/vectorch-ai/flashinfer.git
 [submodule "third_party/cutlass"]
   path = third_party/cutlass
-  url = git@github.com:NVIDIA/cutlass.git
+  url = https://github.com/NVIDIA/cutlass.git
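
Switching the submodule URLs from SSH to HTTPS only updates `.gitmodules`; existing clones keep the old URLs in their local git config until they are re-synced. A minimal sketch of the update:

```bash
# Copy the new HTTPS URLs from .gitmodules into .git/config
git submodule sync --recursive

# Re-fetch the submodules using the updated URLs
git submodule update --init --recursive
```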

CMakeLists.txt (+2, -2)

@@ -171,7 +171,7 @@ else()
 endif()

 # carry over torch flags to the rest of the project
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS} -DC10_USE_GLOG -flto=auto")
 message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
 message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

@@ -182,7 +182,7 @@ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}
   -U__CUDA_NO_HALF_CONVERSIONS__
   -U__CUDA_NO_HALF2_OPERATORS__
   -U__CUDA_NO_BFLOAT16_CONVERSIONS__)
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math -Xfatbin -compress-all)
 message(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")

 # enable testing in this directory so we can do a top-level `make test`.

Dockerfile.devel (+30, -2)

@@ -14,12 +14,27 @@ RUN apt-get update -q -y && \
     cmake \
     ccache \
     python3-dev \
+    python3-pip \
     zip \
     pkg-config \
     libssl-dev \
     libboost-all-dev \
     curl \
-    git
+    git \
+    wget
+
+# install jemalloc (optional)
+RUN cd /tmp && \
+    wget -nc --no-check-certificate https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
+    tar -xvf jemalloc-5.3.0.tar.bz2 && \
+    (cd jemalloc-5.3.0 && \
+     ./configure --enable-prof --disable-initial-exec-tls && \
+     make -j$(nproc) && make install && \
+     ldconfig)
+
+# install nsys
+ADD https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2024_2/nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb .
+RUN apt-get install -y ./nsight-systems-2024.2.1_2024.2.1.106-1_amd64.deb

 # install rust
 ENV RUSTUP_HOME=/usr/local/rustup

@@ -30,7 +45,20 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
 RUN chown -R $UID:$GID /usr/local/rustup
 RUN chown -R $UID:$GID /usr/local/cargo

-# TODO: install golang
+# Install miniconda
+RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/Miniconda3-latest-Linux-x86_64.sh
+RUN cd /tmp && \
+    chmod +x Miniconda3-latest-Linux-x86_64.sh && \
+    bash ./Miniconda3-latest-Linux-x86_64.sh -b -u
+
+# Test activate miniconda
+RUN . ${HOME}/miniconda3/etc/profile.d/conda.sh && \
+    conda activate base && \
+    conda init
+
+RUN echo "\
+. \${HOME}/miniconda3/etc/profile.d/conda.sh\n\
+conda activate base\n" >> ${HOME}/.bashrc

 CMD ["/bin/bash"]
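
The jemalloc build is marked optional and nothing in the image links against it by default, so inside the dev container it would typically be enabled per process; nsys can wrap the same run for profiling. A minimal sketch, assuming the default `/usr/local/lib` install prefix and a placeholder `./your_binary`:

```bash
# Preload jemalloc for a single run (the tarball build installs libjemalloc under /usr/local/lib)
LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 ./your_binary --logtostderr

# Capture an Nsight Systems trace of the same run (./your_binary is a placeholder)
nsys profile -o scalellm_trace ./your_binary --logtostderr
```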

README.md (+27, -41)

@@ -1,24 +1,29 @@
 # ScaleLLM: An efficient LLM Inference solution
-[![build and test](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml) [![GitHub Repo stars](https://img.shields.io/github/stars/vectorch-ai/ScaleLLM?style=social)](https://github.com/vectorch-ai/ScaleLLM/stargazers)
-[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![GitHub Repo stars](https://img.shields.io/github/stars/vectorch-ai/ScaleLLM?style=social)](https://github.com/vectorch-ai/ScaleLLM/stargazers) [![build and test](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/vectorch-ai/ScaleLLM/actions/workflows/build.yml)

-[![Discord](https://dcbadge.vercel.app/api/server/PKe5gvBZfn)](https://discord.gg/PKe5gvBZfn)

+[![Discord](https://dcbadge.vercel.app/api/server/PKe5gvBZfn)](https://discord.gg/PKe5gvBZfn)

-> **Warning**<br />
-> ScaleLLM is currently in the active development stage and may not yet provide the optimal level of inference efficiency. We are fully dedicated to continuously enhancing its efficiency while also adding more features.
+[ScaleLLM]() is a cutting-edge inference system engineered for large language models (LLMs), meticulously designed to meet the demands of production environments. It extends its support to a wide range of popular open-source models, including [Llama3](https://github.com/meta-llama/llama3), [Gemma](https://github.com/google-deepmind/gemma), Bloom, GPT-NeoX, and more.

+ScaleLLM is currently undergoing active development. We are fully committed to consistently enhancing its efficiency while also incorporating additional features. Feel free to explore our [**_Roadmap_**](https://github.com/vectorch-ai/ScaleLLM/issues/84) for more details.

-In the coming weeks, we have exciting plans to focus on [**_speculative decoding_**](https://github.com/orgs/vectorch-ai/projects/1) and [**_stateful conversation_**](https://github.com/orgs/vectorch-ai/projects/2), alongside further kernel optimizations. We appreciate your understanding and look forward to delivering an even better solution.

+## News:
+* [03/2024] - [Advanced feature](https://github.com/vectorch-ai/ScaleLLM/releases/tag/v0.0.7) support for CUDA graph, [dynamic prefix cache](), [dynamic chunked prefill]() and [speculative decoding]().
+* [11/2023] - [First release](https://github.com/vectorch-ai/ScaleLLM/releases/tag/v0.0.1) with support for popular [open-source models](#supported-models).

-## Latest News:
-* [11/2023] - First [official release](https://github.com/vectorch-ai/ScaleLLM/releases/tag/v0.0.1) with support for popular open-source models.
+## Key Features

+- [High Efficiency](): Excels in high-performance LLM inference, leveraging state-of-the-art techniques and technologies like [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Paged Attention](https://github.com/vllm-project/vllm), [Continuous batching](https://www.anyscale.com/blog/continuous-batching-llm-inference), and more.
+- [Tensor Parallelism](): Utilizes tensor parallelism for efficient model execution.
+- [OpenAI-compatible API](): An efficient [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) REST API server that is compatible with OpenAI.
+- [Huggingface models](): Seamless integration with most popular [HF models](#supported-models), supporting safetensors.
+- [Customizable](): Offers flexibility for customization to meet your specific needs, and provides an easy way to add new models.
+- [Production Ready](): Engineered with production environments in mind, ScaleLLM is equipped with robust system monitoring and management features to ensure a seamless deployment experience.

 ## Table of contents

-- [Overview](#overview)
 - [Supported Models](#supported-models)
 - [Get Started](#get-started)
   - [ScaleLLM server](#scalellm-server)

@@ -32,42 +37,20 @@ In the coming weeks, we have exciting plans to focus on [**_speculative decoding
 - [Acknowledgements](#acknowledgements)
 - [License](#license)

-
-## Overview
-
-ScaleLLM is a cutting-edge inference system engineered for large language models (LLMs), meticulously designed to meet the demands of production environments. It extends its support to a wide range of popular open-source models, including Llama2, Bloom, GPT-NeoX, and more.
-
-## Key Features
-
-- [High Efficiency](): Excels in high-performance LLM inference, leveraging state-of-the-art techniques and technologies like [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Paged Attention](https://github.com/vllm-project/vllm), [Continuous batching](https://www.anyscale.com/blog/continuous-batching-llm-inference), and more.
-- [Tensor Parallelism](): Utilizes tensor parallelism for efficient model execution.
-- [OpenAI-compatible API](): An efficient [golang](https://en.wikipedia.org/wiki/Go_(programming_language)) rest api server that compatible with OpenAI.
-- [Huggingface models](): Seamless integration with most popular [HF models](#supported-models), supporting safetensors.
-- [Customizable](): Offers flexibility for customization to meet your specific needs, and provides an easy way to add new models.
-- [Production Ready](): Engineered with production environments in mind, ScaleLLM is equipped with robust system monitoring and management features to ensure a seamless deployment experience.
-
-
 ## Supported Models

-Please note that in order to use Yi models, you need to add `--model_type=Yi` to the command line. For example:
-```bash
-docker run -it --gpus=all --net=host --shm-size=1g \
-  -v $HOME/.cache/huggingface/hub:/models \
-  -e HF_MODEL_ID=01-ai/Yi-34B-Chat-4bits \
-  -e DEVICE=auto \
-  docker.io/vectorchai/scalellm:latest --logtostderr --model_type=Yi
-```
-
 | Models | Tensor Parallel | Quantization | Chat API | HF models examples |
 | :--------: | :-------------: | :----------: | :------: | :---------------------------:|
 | Aquila | Yes | Yes | Yes | [BAAI/Aquila-7B](https://huggingface.co/BAAI/Aquila-7B), [BAAI/AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B) |
 | Bloom | Yes | Yes | No | [bigscience/bloom](https://huggingface.co/bigscience/bloom) |
+| Baichuan | Yes | Yes | Yes | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) |
 | ChatGLM3 | Yes | Yes | Yes | [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) |
+| Gemma | Yes | Yes | Yes | [google/gemma-2b](https://huggingface.co/google/gemma-2b) |
 | GPT_j | Yes | Yes | No | [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b) |
 | GPT_NeoX | Yes | Yes | No | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) |
 | GPT2 | Yes | Yes | No | [gpt2](https://huggingface.co/gpt2)|
 | InternLM | Yes | Yes | Yes | [internlm/internlm-7b](https://huggingface.co/internlm/internlm-7b) |
-| Llama2 | Yes | Yes | Yes | [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b), [TheBloke/Llama-2-13B-chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ), [TheBloke/Llama-2-70B-AWQ](https://huggingface.co/TheBloke/Llama-2-70B-AWQ) |
+| Llama3/2 | Yes | Yes | Yes | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct), [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) |
 | Mistral | Yes | Yes | Yes | [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) |
 | MPT | Yes | Yes | Yes | [mosaicml/mpt-30b](https://huggingface.co/mosaicml/mpt-30b) |
 | Phi2 | Yes | Yes | No | [microsoft/phi-2](https://huggingface.co/microsoft/phi-2) |

@@ -96,9 +79,10 @@ You can download and install Docker from the official website: [Docker Installat
 Once you have Docker installed, you can run ScaleLLM Docker container with [latest image](https://hub.docker.com/r/vectorchai/scalellm/tags) using the following command:

 ```bash
+docker pull docker.io/vectorchai/scalellm:latest
 docker run -it --gpus=all --net=host --shm-size=1g \
   -v $HOME/.cache/huggingface/hub:/models \
-  -e HF_MODEL_ID=TheBloke/Llama-2-7B-chat-AWQ \
+  -e HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct \
   -e DEVICE=cuda:0 \
   docker.io/vectorchai/scalellm:latest --logtostderr
 ```

@@ -109,7 +93,7 @@ This command starts the Docker container with GPU support and various configurat
 - `HF_MODEL_REVISION` specifies which Hugging Face model revision you want to run. By default, it is set to `"main"`.
 - `DEVICE` specifies the device on which this model should run. By default, it is set to `"auto"`, using all available GPUs. You can also specify specific GPUs by using `"cuda:0,cuda:1"`, or use CPU by using `"cpu"`.
 - `HF_MODEL_ALLOW_PATTERN` specifies which types of files are allowed to be downloaded. By default, it will be configured automatically based on tensor type. Only use this option if the default configuration is not working for you.
-- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models.
+- `HUGGING_FACE_HUB_TOKEN` specifies the token from [huggingface](https://huggingface.co/settings/tokens) for gated models, e.g. `-e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN`.

 > **Warning**<br />
 > * The docker image with tag '[latest](https://hub.docker.com/r/vectorchai/scalellm/tags)' could be changed to a new version upon new release. In order to use latest image, you may need to repull the image with specific tag.
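
For gated models, the `docker run` command shown above just needs the token forwarded into the container; a minimal sketch, assuming the token is already exported in your shell:

```bash
# Token from https://huggingface.co/settings/tokens
export HUGGING_FACE_HUB_TOKEN=<your token>

docker run -it --gpus=all --net=host --shm-size=1g \
  -v $HOME/.cache/huggingface/hub:/models \
  -e HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct \
  -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
  -e DEVICE=cuda:0 \
  docker.io/vectorchai/scalellm:latest --logtostderr
```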
@@ -139,6 +123,7 @@ After running the Docker container, two ports are exposed:
 You can also start a REST API gateway with [latest image](https://hub.docker.com/r/vectorchai/scalellm-gateway/tags) using the following command:

 ```bash
+docker pull docker.io/vectorchai/scalellm-gateway:latest
 docker run -it --net=host \
   docker.io/vectorchai/scalellm-gateway:latest --logtostderr
 ```

@@ -150,6 +135,7 @@ The REST API Server is available on `localhost:8080`. You can use REST API reque
 A local Chatbot UI is also available on [localhost:3000](localhost:3000). You can start it with [latest image](https://hub.docker.com/r/vectorchai/chatbot-ui/tags) using the following command:

 ```bash
+docker pull docker.io/vectorchai/chatbot-ui:latest
 docker run -it --net=host \
   -e OPENAI_API_HOST=http://127.0.0.1:8080 \
   -e OPENAI_API_KEY=YOUR_API_KEY \

@@ -162,7 +148,7 @@ Using Docker Compose is the easiest way to run ScaleLLM with all the services to

 ```bash
 curl https://raw.githubusercontent.com/vectorch-ai/ScaleLLM/main/scalellm.yml -sSf > scalellm_compose.yml
-HF_MODEL_ID=TheBloke/Llama-2-7B-chat-AWQ DEVICE=cuda docker compose -f ./scalellm_compose.yml up
+HF_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct DEVICE=cuda docker compose -f ./scalellm_compose.yml up
 ```

 you will get following running services:

@@ -180,7 +166,7 @@ You can get chat completions with the following example:
 curl http://localhost:8080/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "TheBloke/Llama-2-7B-chat-AWQ",
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
     "messages": [
       {
         "role": "system",

@@ -205,7 +191,7 @@ openai.api_base = "http://localhost:8080/v1"
 print("==== Available models ====")
 models = openai.Model.list()

-model = "TheBloke/Llama-2-7B-chat-AWQ"
+model = "meta-llama/Meta-Llama-3-8B-Instruct"

 completion = openai.ChatCompletion.create(
   model=model,

@@ -232,7 +218,7 @@ For regular completions, you can use this example:
 curl http://localhost:8080/v1/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "TheBloke/Llama-2-7B-chat-AWQ",
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
     "prompt": "hello",
     "max_tokens": 32,
     "temperature": 0.7,

@@ -251,7 +237,7 @@ openai.api_base = "http://localhost:8080/v1"
 print("==== Available models ====")
 models = openai.Model.list()

-model = "TheBloke/Llama-2-7B-chat-AWQ"
+model = "meta-llama/Meta-Llama-3-8B-Instruct"

 completion = openai.Completion.create(
   model=model,

bindings/python/CMakeLists.txt (deleted, -1)

This file was deleted.
