-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Candle is a minimalist ML framework for Rust with a focus on performance and ease of use. This commit adds the Quantized LLaMA example. Signed-off-by: Dmitrii Kuvaiskii <[email protected]>
- Loading branch information
Dmitrii Kuvaiskii
committed
Jul 26, 2024
1 parent
ceba8e9
commit 5a3494d
Showing
4 changed files
with
130 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# build artifacts: the copied example binary and the cloned Candle sources
/candle_quantized
/src

# model and tokenizer files downloaded by the Makefile
/*.bin
/*.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# Copyright (C) 2024 Gramine contributors | ||
# SPDX-License-Identifier: BSD-3-Clause | ||
|
||
# Architecture-specific system library directory (e.g. /lib/x86_64-linux-gnu);
# overridable from the command line.
ARCH_LIBDIR ?= /lib/$(shell $(CC) -dumpmachine)

# `make DEBUG=1` turns on verbose Gramine logging.
ifeq ($(DEBUG),1)
GRAMINE_LOG_LEVEL = debug
else
GRAMINE_LOG_LEVEL = error
endif

# Directory into which the Candle sources are cloned.
SRCDIR = src
# Default goal: the app binary plus the rendered manifest; with SGX=1 also the
# SGX manifest and the enclave signature.
.PHONY: all
all: candle_quantized candle_quantized.manifest
ifeq ($(SGX),1)
all: candle_quantized.manifest.sgx candle_quantized.sig
endif
# Fetch the quantized LLaMA-2 7B weights (several GB) and verify their checksum.
llama-2-7b.ggmlv3.q4_0.bin:
	../common_tools/download --output $@ \
		--sha256 bfa26d855e44629c4cf919985e90bd7fa03b77eea1676791519e39a4d45fd4d5 \
		--url https://huggingface.co/TheBloke/Llama-2-7B-GGML/resolve/main/$@

# Fetch the LLaMA tokenizer and verify its checksum.
tokenizer.json:
	../common_tools/download --output $@ \
		--sha256 8eea70c4866c4f1320ba096fc986ac82038a8374dbe135212ba7628835b4a6f1 \
		--url https://huggingface.co/hf-internal-testing/llama-tokenizer/raw/main/$@
# Clone Candle pinned to the release documented in the README (v0.6.0) and build
# the `quantized` example in release mode. Pinning makes the build reproducible:
# an unpinned clone would silently pick up whatever upstream's default branch
# holds, changing the binary that gets measured into the enclave. `--depth 1`
# avoids downloading the full history. The model and tokenizer are listed as
# prerequisites so that a plain `make` fetches everything needed to run the app.
$(SRCDIR)/candle_quantized/target/release/examples/quantized: llama-2-7b.ggmlv3.q4_0.bin tokenizer.json
	mkdir -p $(SRCDIR) && cd $(SRCDIR) && \
	git clone --depth 1 --branch 0.6.0 \
		https://github.com/huggingface/candle.git candle_quantized && \
	cd candle_quantized && \
	cargo build --example quantized --release

# Copy the built example next to the manifest, under the name the manifest expects.
candle_quantized: $(SRCDIR)/candle_quantized/target/release/examples/quantized
	cp $< $@
# Render the manifest from the Jinja template, substituting the log level and
# the architecture-specific library directory.
candle_quantized.manifest: candle_quantized.manifest.template
	gramine-manifest \
		-Dlog_level=$(GRAMINE_LOG_LEVEL) \
		-Darch_libdir=$(ARCH_LIBDIR) \
		$< > $@

# gramine-sgx-sign emits both the .manifest.sgx and the .sig in a single
# invocation. Routing both targets through one .INTERMEDIATE pseudo-target
# makes the signing recipe run exactly once, including under `make -j`.
candle_quantized.manifest.sgx candle_quantized.sig: candle_quantized_sgx_sign
	@:

.INTERMEDIATE: candle_quantized_sgx_sign
candle_quantized_sgx_sign: candle_quantized.manifest candle_quantized
	gramine-sgx-sign \
		--manifest $< \
		--output $<.sgx
# Remove generated Gramine artifacts and the copied binary.
.PHONY: clean
clean:
	$(RM) *.token *.sig *.manifest.sgx *.manifest candle_quantized

# Additionally remove the cloned sources and all downloaded model/tokenizer files.
.PHONY: distclean
distclean: clean
	$(RM) -r $(SRCDIR) *.tar.gz *.bin *.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Candle | ||
|
||
[Candle](https://github.com/huggingface/candle) is a minimalist ML framework for | ||
Rust with a focus on performance (including GPU support) and ease of use. | ||
|
||
This directory contains the Makefile and the template manifest for the most | ||
recent version of Candle as of this writing (v0.6.0). | ||
|
||
# Warning | ||
|
||
The `candle_quantized` app will download ~4GB of data (model + tokenizer). This | ||
happens automatically in the Makefile. | ||
|
||
# Quick Start | ||
|
||
```sh | ||
# build Candle (uses Rust Cargo) and the final manifest | ||
make SGX=1 | ||
|
||
# run Quantized LLaMA (quantized version of the LLaMA model) | ||
# note that for Gramine, the cmdline args are already defined in the manifest file | ||
# example taken from https://github.com/huggingface/candle/tree/0.6.0?tab=readme-ov-file#check-out-our-examples | ||
RAYON_NUM_THREADS=36 ./candle_quantized \ | ||
--model llama-2-7b.ggmlv3.q4_0.bin --tokenizer tokenizer.json --sample-len 200 | ||
RAYON_NUM_THREADS=36 gramine-direct ./candle_quantized | ||
RAYON_NUM_THREADS=36 gramine-sgx ./candle_quantized | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Copyright (C) 2024 Gramine contributors | ||
# SPDX-License-Identifier: BSD-3-Clause | ||
|
||
# Gramine LibOS library, and the application entrypoint as seen inside the enclave.
loader.entrypoint = "file:{{ gramine.libos }}"
libos.entrypoint = "/candle_quantized"

# Log level is substituted by `gramine-manifest -Dlog_level=...` in the Makefile.
loader.log_level = "{{ log_level }}"

loader.env.LD_LIBRARY_PATH = "/lib:{{ arch_libdir }}"
# Forward the host's RAYON_NUM_THREADS so the user can cap Rayon worker threads.
loader.env.RAYON_NUM_THREADS = { passthrough = true }

# Command-line arguments are fixed here; they cannot be overridden at run time
# (matches the README note that args are already defined in the manifest).
loader.argv = [ "candle_quantized", "--model", "llama-2-7b.ggmlv3.q4_0.bin",
                "--tokenizer", "tokenizer.json", "--sample-len", "200" ]

fs.mounts = [
  { path = "/candle_quantized", uri = "file:candle_quantized" },
  { path = "/lib", uri = "file:{{ gramine.runtimedir() }}" },
  { path = "{{ arch_libdir }}", uri = "file:{{ arch_libdir }}" },

  # model weights and tokenizer, downloaded by the Makefile
  { path = "/llama-2-7b.ggmlv3.q4_0.bin", uri = "file:llama-2-7b.ggmlv3.q4_0.bin" },
  { path = "/tokenizer.json", uri = "file:tokenizer.json" },
]

# NOTE(review): max_threads differs by EDMM mode — presumably because EDMM adds
# enclave threads dynamically while non-EDMM must preallocate them; confirm
# against Gramine docs before changing.
sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '256' }}
# Large enclave: the multi-gigabyte model plus inference buffers must fit.
sgx.enclave_size = "32G"

# Files whose hashes are baked into the SGX measurement at signing time.
sgx.trusted_files = [
  "file:candle_quantized",
  "file:{{ gramine.libos }}",
  "file:{{ gramine.runtimedir() }}/",
  "file:{{ arch_libdir }}/libcrypto.so.3",
  "file:{{ arch_libdir }}/libgcc_s.so.1",
  "file:{{ arch_libdir }}/libssl.so.3",

  "file:llama-2-7b.ggmlv3.q4_0.bin",
  "file:tokenizer.json",
]