diff --git a/Performance.md b/Performance.md
deleted file mode 100644
index 61e05de..0000000
--- a/Performance.md
+++ /dev/null
@@ -1,106 +0,0 @@
-
-### Network Requirement Estimation
-
-- With PP=8, the communication cost is multiplied by $8$ (one hop per pipeline stage).
-- The hidden_size of a 70B model is $8192$.
-- The data is `(b)float16`, so the per-token payload is $1 \times 8192 \times 2 = 16,384$ bytes.
-
-Expected speed in the TPOT (time-per-output-token) stage: 20 token/s -> 0.05 s/token
-
-- Assuming a communication:computation ratio of 1:4, the communication budget is 0.01 s.
-  - Each hop must therefore finish within 0.01/8 s = 0.00125 s -> 1.25 ms.
-  - With a hidden_size of 8192, that is $16,384$ bytes per hop.
-  - To finish within 1.25 ms, the network bandwidth must be at least $16,384 / 0.00125 = 13,107,200$ bytes/s ≈ 13.1 MB/s ≈ 105 Mbps.
-
-In the TTFT (time-to-first-token) stage, the first token is expected within 3 s.
-
-- Assuming a communication:computation ratio of 1:2, the communication budget is 1 s, so each hop must finish within 1/8 s = 0.125 s -> 125 ms.
-- Assuming 1000 input tokens, the payload per hop is $1000 \times 16,384 = 16,384,000$ bytes.
-- To finish within 0.125 s, the bandwidth must be at least $16,384,000 / 0.125 = 131,072,000$ bytes/s ≈ 131 MB/s ≈ 1.05 Gbps (both estimates are reproduced in the sketch after this diff).
-
-Optimizations:
-- Halving the transmitted data via compression halves the communication time.
-- Overlap PP in the TTFT stage by transmitting the input tokens in chunks.
-
-### Communication Time Test
-Compression seems to bring little benefit when the token count is small; the effect only becomes noticeable with larger payloads.
-
-=== Starting gRPC Matrix Performance Test (use_zlib: False; use_lz4: False) ===
-
-Time: 2024-11-02 11:40:40
-
-| Matrix Shape | Transmission(ms) | Compress(ms) | Total(ms) | Throughput(MB/s) |
-| ---- | --------- | --- | --- | --- |
-| (1, 8192) | 14.05 | 0.01 | 14.06 | 2.22 |
-| (4, 8192) | 29.89 | 0.01 | 29.90 | 4.18 |
-| (16, 8192) | 76.95 | 0.01 | 76.97 | 6.50 |
-| (32, 8192) | 137.05 | 0.01 | 137.06 | 7.30 |
-| (64, 8192) | 262.03 | 0.01 | 262.04 | 7.63 |
-
-=== Starting gRPC Matrix Performance Test (use_zlib: False; use_lz4: True) ===
-
-Time: 2024-11-02 11:51:41
-
-| Matrix Shape | Transmission(ms) | Compress(ms) | Total(ms) | Throughput(MB/s) |
-| ---- | --------- | --- | --- | --- |
-| (1, 8192) | 18.10 | 0.08 | 18.18 | 1.73 |
-| (4, 8192) | 30.82 | 0.19 | 31.01 | 4.06 |
-| (16, 8192) | 78.23 | 2.80 | 81.02 | 6.39 |
-| (32, 8192) | 136.85 | 5.62 | 142.47 | 7.31 |
-| (64, 8192) | 249.84 | 10.37 | 260.20 | 8.01 |
-
-=== Starting gRPC Matrix Performance Test (use_zlib: True; use_lz4: False) ===
-
-Time: 2024-11-02 11:41:51
-
-| Matrix Shape | Transmission(ms) | Compress(ms) | Total(ms) | Throughput(MB/s) |
-| ---- | --------- | --- | --- | --- |
-| (1, 8192) | 13.95 | 1.96 | 15.91 | 2.24 |
-| (4, 8192) | 32.74 | 9.82 | 42.56 | 3.82 |
-| (16, 8192) | 77.19 | 32.65 | 109.84 | 6.48 |
-| (64, 8192) | 256.25 | 83.67 | 339.92 | 7.80 |
-
-### Hardware Benchmarks
-- 2 GHz quad-core Intel Core i5, 16 GB 3733 MHz LPDDR4X
-  - Llama-3.2-1B-Instruct, single machine: 10.96 token/s
-  - Llama-3.2-1B-Instruct, single machine: 5.73 token/s (including first-token generation time; the transformers framework makes TTFT inconvenient to record separately)
-
-- Apple M3 Pro, 18 GB
-
-Given sufficient communication bandwidth, the speed should be higher.
-
-Because the tokenizers may differ, the input token counts vary slightly, but they are essentially the same.
-
-Token generation speed (excluding first-token generation time),
-bfloat16 on CPU:
-| PP,TP | Llama-3.2-1B-Instruct | Llama-3.2-3B-Instruct |
-| ---- | --------- | --- |
-| 2,1 (measured) | 8.04 token/s | 3.01 token/s |
-| 2,2 (measured) | 7.38 token/s | 2.51 token/s |
-
-Including first-token generation time:
-| PP,TP | Llama-3.2-1B-Instruct | Llama-3.2-3B-Instruct |
-| ---- | --------- | --- |
-| 2,1 (measured) | 5.49 token/s | 2.42 token/s |
-| 2,2 (measured) | 5.66 token/s | 2.46 token/s |
-
-
-TODO: Meta-Llama-3-8B-Instruct on GPU
-
-- Multi-dimensional array implementation (float32): single-machine communication takes about 0.002 s (seq-len=1)
-- bytes implementation (float32): single-machine communication takes about 0.001 s (seq-len=1)
-
-
-Old version
-
-For `mlx-community/Llama-3.2-1B-Instruct`
-
-- mac mini m2
-  ![alt text](asserts/image.png)
-- m3 pro
-  ![alt text](asserts/image-1.png)
-
-for 8b
-
-- m3 pro (layer=8) + mac mini m2 (layer=24)
-  ![alt text](asserts/image-2.png)
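Both bandwidth figures above come from the same payload-over-budget arithmetic. The snippet below is a minimal standalone sketch of that calculation in plain Python, assuming the PP=8, hidden_size=8192, `(b)float16` setup and the 1:4 / 1:2 communication-to-computation splits from the estimate; `required_bandwidth` is a hypothetical helper written for this sketch, not a tllm API.

```python
# Reproduces the TPOT/TTFT bandwidth estimates above under the stated assumptions.
PP = 8                 # pipeline-parallel stages, i.e. hops per token
HIDDEN_SIZE = 8192     # 70B hidden size
BYTES_PER_VALUE = 2    # (b)float16


def required_bandwidth(tokens: int, comm_budget_s: float, hops: int = PP) -> float:
    """Minimum bandwidth (bytes/s) so `tokens` worth of hidden states cross each of
    `hops` pipeline boundaries within the total communication budget."""
    payload = tokens * HIDDEN_SIZE * BYTES_PER_VALUE   # bytes per hop
    per_hop_budget = comm_budget_s / hops              # seconds available per hop
    return payload / per_hop_budget


# TPOT: 20 token/s -> 0.05 s per token, of which 1/5 is communication (1:4 ratio).
tpot_bw = required_bandwidth(tokens=1, comm_budget_s=0.05 / 5)
# TTFT: 3 s for a 1000-token prompt, of which 1/3 is communication (1:2 ratio).
ttft_bw = required_bandwidth(tokens=1000, comm_budget_s=3 / 3)

print(f"TPOT: {tpot_bw / 1e6:.1f} MB/s ({tpot_bw * 8 / 1e6:.0f} Mbps)")  # ~13.1 MB/s, ~105 Mbps
print(f"TTFT: {ttft_bw / 1e6:.1f} MB/s ({ttft_bw * 8 / 1e6:.0f} Mbps)")  # ~131.1 MB/s, ~1049 Mbps
```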
diff --git a/README.md b/README.md
index 9cfe3e7..d0cbda9 100644
--- a/README.md
+++ b/README.md
@@ -74,10 +74,6 @@
 - [X] Engine
   - [X] mlx
   - [X] torch
-  - [ ] tinygrad
-    - [ ] Multi-Request
-    - [ ] Jit
-    - [ ] Pipeline
 - [X] Communication
   - [X] grpc
   - [X] Auto Find Node
@@ -95,20 +91,5 @@ In Mac Mini M4
 
 |                                      | `mlx-community/Llama-3.2-1B-Instruct-4bit` | `mlx-community/Llama-3.2-1B-Instruct` | `mlx-community/Meta-Llama-3.1-8B-Instruct-4bit` | `mlx-community/Meta-Llama-3.1-8B-Instruct-bf16` |
 | ------------------------------------ | -------------------------------------------- | --------------------------------------- | ------------------------------------------------- | ------------------------------------------------- |
-| Mac Mini M4 (16G) (Engine, Baseline) | 98.10 tok/s | 35.45 tok/s | 20.68 tok/s | No Memory |
 | Mac Mini M4 (16G) (Local)            | 45.36 tok/s | 23.60 tok/s | 15.80 tok/s | No Memory |
-| Mac Mini M4 (16G) (Server+Client)    | 61.83 tok/s | 34.54 tok/s | 14.91 tok/s | No Memory |
 | Mac Mini M4 (16G) + M3 Pro (18G)     |             | 16.33 tok/s | 11.06 tok/s | 5.64 tok/s |
-
-Q: Why is Local slower than Server+Client?
-
-A:
-
-- Local uses a single process: the HTTP server, the Engine, and the Model all run in that one process.
-- Server+Client uses two processes: the Server contains the HTTP server and the Engine, plus the Embedding and LM Head; the Client contains only the Model.
-
-It is unclear why `mlx-community/Meta-Llama-3.1-8B-Instruct-4bit` does not show the same gap; for now this is attributed to memory pressure.
-
-Q: Why is the Mac Mini M4 (16G) + M3 Pro (18G) row slow?
-
-A: Ideally it would match Mac Mini M4 (16G) (Server+Client), but communication overhead accounts for most of the cost; the main issue is latency, which adds a fixed delay to every generated token, even on a local network.
diff --git a/README_EN.md b/README_EN.md
index a281485..92d9abd 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -73,10 +73,6 @@ For multi-machine deployment, the default part of the port will be used for runn
 - [X] Engine
   - [X] mlx
   - [X] torch
-  - [ ] tinygrad
-    - [ ] Multi-Request
-    - [ ] Jit
-    - [ ] Pipeline
 - [X] Communication
   - [X] grpc
   - [X] Auto Find Node
@@ -94,20 +90,5 @@ In Mac Mini M4
 
 |                                      | `mlx-community/Llama-3.2-1B-Instruct-4bit` | `mlx-community/Llama-3.2-1B-Instruct` | `mlx-community/Meta-Llama-3.1-8B-Instruct-4bit` | `mlx-community/Meta-Llama-3.1-8B-Instruct-bf16` |
 | ------------------------------------ | -------------------------------------------- | --------------------------------------- | ------------------------------------------------- | ------------------------------------------------- |
-| Mac Mini M4 (16G) (Engine, Baseline) | 98.10 tok/s | 35.45 tok/s | 20.68 tok/s | No Memory |
 | Mac Mini M4 (16G) (Local)            | 45.36 tok/s | 23.60 tok/s | 15.80 tok/s | No Memory |
-| Mac Mini M4 (16G) (Server+Client)    | 61.83 tok/s | 34.54 tok/s | 14.91 tok/s | No Memory |
 | Mac Mini M4 (16G) + M3 Pro (18G)     |             | 16.33 tok/s | 11.06 tok/s | 5.64 tok/s |
-
-Q: Why is Local slower than Server+Client?
-
-A:
-
-- Local uses a single process: the HTTP server, the Engine, and the Model all run in one process.
-- Server+Client uses two processes: the Server contains the HTTP server and the Engine, plus the Embedding and LM Head; the Client contains only the Model.
-
-It is unclear why `mlx-community/Meta-Llama-3.1-8B-Instruct-4bit` does not show the same gap; for now this is attributed to memory pressure.
-
-Q: Why is the performance of Mac Mini M4 (16G) + M3 Pro (18G) slow?
-
-A: Ideally it would match Mac Mini M4 (16G) (Server+Client), but communication overhead accounts for a significant portion of the total cost; the main issue is latency, which adds a fixed delay to every generated token, even on a local network (see the sketch after this diff).
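The latency point in the Q&A above can be made concrete with a rough model: if each generated token pays per-hop network latency on top of the single-machine decode time, throughput drops quickly even on a fast LAN. The sketch below is illustrative only; the 5 ms round trip, the two hops, and `effective_tok_s` are assumptions made for this example, not measurements or code from this repo.

```python
# Back-of-the-envelope model: per-token network latency is added to the
# per-token compute time, so small fixed delays dominate at high token rates.

def effective_tok_s(single_node_tok_s: float, rtt_s: float, hops: int) -> float:
    """Tokens/s when every token additionally crosses `hops` links, each costing `rtt_s`."""
    compute_time = 1.0 / single_node_tok_s
    return 1.0 / (compute_time + hops * rtt_s)


# ~34.5 tok/s on a single machine (Server+Client) with an assumed 5 ms round trip
# over 2 hops (Server -> Client -> Server) already drops to roughly 26 tok/s;
# higher latency or more hops pushes it toward the measured multi-machine numbers.
print(effective_tok_s(34.54, rtt_s=0.005, hops=2))
```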
diff --git a/RoadMap.md b/RoadMap.md
deleted file mode 100644
index b03306e..0000000
--- a/RoadMap.md
+++ /dev/null
@@ -1,86 +0,0 @@
-# RoadMap
-
-Tensor parallelism is implemented with torch.dist, and pipeline parallelism with rpc.
-
-- [ ] Speed Up
-  - [x] Merge Linear
-  - [x] Pipeline Parallel by grpc
-  - [x] Tensor Parallel by torch.dist
-  - [x] Sequence KV Cache
-  - [x] Performance Testing
-  - [x] Attention
-    - [x] SDPA
-    - [x] xformers
-    - [x] flash_attention
-- [x] Decoding Strategy
-  - [x] Top-K Sampling
-  - [x] Top-P Sampling
-  - [x] Temperature Sampling
-- [ ] Model
-  - [ ] LLM
-    - [x] LLaMA
-    - [x] Qwen2
-  - [ ] Multi-Modal
-    - [x] Qwen2-VL
-- [x] MLX Framework
-  - [x] With Torch Inference
-    - [x] Some bugs with multi requests
-  - [x] Quantization
-  - [x] MLX Server
-  - [ ] LoRA Training
-- [x] Web UI
-  - [x] Node Status
-    - [ ] Display Multi Model
-  - [x] ChatWeb Demo by Gradio
-    - [x] Parameters
-    - [x] System
-    - [x] Button
-- [x] Backend
-  - [x] OpenAI API format
-    - [x] Streaming Output
-      - [x] chat completion(stream)
-      - [x] chat completion(non-stream)
-    - [x] using anythingLLM
-  - [x] Client Send Url and Port
-  - [ ] Auto Layer Split
-    - [x] get free layer idx
-    - [x] fix split layer pipeline
-    - [x] calculate layer memory and recommend split
-    - [ ] split model before load
-  - [x] Async Generation
-    - [x] Multi-Sequence Batch=1
-    - [x] Queuing mechanism
-    - [x] Continuous Batch
-    - [x] Test Cases
-    - [x] Client Disconnect and Abort
-      - [x] await Event
-  - [x] Communication
-    - [x] Communication Time Benchmark
-    - [x] Async GRPC
-    - [x] Ring Communication
-  - [x] Auto Find Node
-    - [x] WebSocket Communication
-    - [x] Client Retry Connect
-    - [x] Client auto update url
-    - [x] Master Exit
-- [ ] KV Cache
-  - [x] Request/Sequence Cache
-  - [x] Custom KV Cache Class
-  - [ ] Conversation KV Cache (in progress)
-  - [ ] Token-Level Cache
-    - [ ] Prefix-tree Cache
-- [ ] Shard Storage
-- [x] Auto Download
-
-
-Master and Client interact over HTTP:
-- The Master starts first and already knows the model name and layer count.
-  - The Client starts its gRPC service and sends its reachable address (TODO: memory/VRAM size, compute capability, etc.) to the Master over HTTP.
-  - The Master returns the model name and the assigned start and end layer indices (a synchronous operation, no state required).
-  - The Client downloads and loads the model, then sends an InitModel message to the Master to complete the setup.
-
-  - Afterwards, the Master periodically sends heartbeats to the Client to make sure the connection stays healthy.
-- If the Master restarts, it loses all Client information.
-  - The Client runs its own periodic heartbeat check and reconnects, carrying its existing state.
-
-remove torch dependency
\ No newline at end of file
diff --git a/tllm/__init__.py b/tllm/__init__.py
index de8017e..794f3b1 100644
--- a/tllm/__init__.py
+++ b/tllm/__init__.py
@@ -9,7 +9,7 @@ class BackendEnum(Enum):
 
 
 ENABLE_PREFILL_CACHE = os.environ.get("TLLM_ENABLE_PREFILL_CACHE", "true").lower() == "true"
-
+ENABLE_PREFILL_CACHE = False  # force-disables the prefill cache, overriding TLLM_ENABLE_PREFILL_CACHE
 if importlib.util.find_spec("mlx"):
     BACKEND = BackendEnum.MLX
 elif importlib.util.find_spec("torch"):
diff --git a/tllm/models/mlx/layers.py b/tllm/models/mlx/layers.py
index 7bde2b8..63511df 100644
--- a/tllm/models/mlx/layers.py
+++ b/tllm/models/mlx/layers.py
@@ -284,7 +284,7 @@ def __call__(self, x: mx.array, mask, cache) -> mx.array:
         r = self.self_attn(self.input_layernorm(x), mask, cache)
         h = x + r
         # do not skip the first tokens; skip middle blocks, https://arxiv.org/abs/2404.03865
-        # if 20 <= self.layer_idx <= 24 and x.shape[0] == 1:
+        # if 24 <= self.layer_idx <= 28 and x.shape[0] == 1:
         #     return h
         r = self.mlp(self.post_attention_layernorm(h))
         out = h + r
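The commented-out branch in `tllm/models/mlx/layers.py` gestures at decode-time layer skipping (https://arxiv.org/abs/2404.03865): middle blocks still run attention but skip their MLP when only a single new token is being decoded (`x.shape[0] == 1`). Below is a toy, self-contained sketch of that idea; the class and method names are made up for illustration and are not the real tllm classes.

```python
# Toy illustration of decode-time layer skipping: middle decoder blocks run
# attention but skip their MLP during single-token decode, so the residual
# stream simply carries the hidden state forward. Hypothetical scaffolding only.

class ToyDecoderLayer:
    def __init__(self, layer_idx: int, skip_start: int = 24, skip_end: int = 28):
        self.layer_idx = layer_idx
        self.skip_start, self.skip_end = skip_start, skip_end

    def attention(self, x: float) -> float:
        return 0.1 * x  # stand-in for layernorm + self-attention

    def mlp(self, h: float) -> float:
        return 0.2 * h  # stand-in for layernorm + feed-forward block

    def __call__(self, x: float, is_decode: bool) -> float:
        h = x + self.attention(x)  # attention with residual connection
        if is_decode and self.skip_start <= self.layer_idx <= self.skip_end:
            return h  # middle layers skip their MLP during single-token decode
        return h + self.mlp(h)  # feed-forward with residual connection


layers = [ToyDecoderLayer(i) for i in range(32)]
x = 1.0
for layer in layers:
    x = layer(x, is_decode=True)  # decode step: layers 24-28 return early
print(x)
```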