feat: Add notebooks and scripts for utilizing vllm
Parent: 6764f4e · Commit: 0acaffd · 14 changed files with 1,297 additions and 0 deletions
# vLLM for serving LLMs

## Sample codes

- [CPU offload](./samples/cpu_offload.py)
- [LoRA Quantization](./samples/lora_with_quantization.py)
- [Multi-LoRA](./samples/multilora_inference.py)
- Offline inference
  - [Audio-Language Inference](./samples/offline_inference_audio_language.py)
  - [Tensor Parallel Inference](./samples/offline_inference_distributed.py)
  - [LLM2Vec Embeddings](./samples/offline_inference_embedding.py)
  - [Run Pixtral](./samples/offline_inference_pixtral.py)
  - [Run with Profiler](./samples/offline_inference_with_profiler.py)
  - [Speculative Decoding](./samples/offline_inference_speculator.py)
  - [Vision-Language Multi-Image Inference](./samples/offline_inference_vision_language_multi_image.py)
# Logging Configuration

vLLM leverages Python's `logging.config.dictConfig` functionality to enable
robust and flexible configuration of the various loggers used by vLLM.

vLLM offers two environment variables that accommodate logging configurations
ranging from simple and inflexible to more complex and more flexible:

- No vLLM logging (simple and inflexible)
  - Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset)
- vLLM's default logging configuration (simple and inflexible)
  - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1`
- Fine-grained custom logging configuration (more complex, more flexible)
  - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
    set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`
## Logging Configuration Environment Variables

### `VLLM_CONFIGURE_LOGGING`

`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to
configure the loggers used by vLLM. This functionality is enabled by default,
but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM.

If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for
`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to
configure the root vLLM logger. By default, no other vLLM loggers are
configured and, as such, all vLLM loggers defer to the root vLLM logger to make
all logging decisions.

If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for
`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM.
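As an illustration of that deferral, here is a minimal standard-library sketch (not vLLM's actual internals): a child logger with no handlers of its own hands records up to the nearest configured ancestor, in this case the root `vllm` logger.

```python
import logging

# Stand-in for vLLM's default setup: only the root "vllm" logger gets a handler.
root = logging.getLogger("vllm")
root.addHandler(logging.StreamHandler())
root.setLevel(logging.INFO)

# A child logger (name chosen for illustration) with no handlers of its own
# propagates to "vllm", so this record is emitted by the root logger's handler.
logging.getLogger("vllm.engine").info("handled by the root vllm logger")
```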
### `VLLM_LOGGING_CONFIG_PATH`

`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of
alternative, custom logging configuration that will be used instead of vLLM's
built-in default logging configuration. The logging configuration should be
provided in JSON format following the schema specified by Python's
[logging configuration dictionary schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details).

If `VLLM_LOGGING_CONFIG_PATH` is specified but `VLLM_CONFIGURE_LOGGING` is
disabled, an error will occur while starting vLLM.
## Examples

### Example 1: Customize vLLM root logger

For this example, we will customize the vLLM root logger to use
[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to
STDOUT of the console in JSON format with a log level of `INFO`.

First, create an appropriate JSON logging configuration file:

**/path/to/logging_config.json:**
```json
{
  "formatters": {
    "json": {
      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
    }
  },
  "handlers": {
    "console": {
      "class": "logging.StreamHandler",
      "formatter": "json",
      "level": "INFO",
      "stream": "ext://sys.stdout"
    }
  },
  "loggers": {
    "vllm": {
      "handlers": ["console"],
      "level": "INFO",
      "propagate": false
    }
  },
  "version": 1
}
```
Next, install the `python-json-logger` package if it's not already installed:

```bash
pip install python-json-logger
```

Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:

```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
```
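Before pointing vLLM at the file, it can be handy to sanity-check it directly with Python's `logging.config.dictConfig`, which raises an error if the dictionary does not follow the schema. A minimal sketch, assuming the same `/path/to/logging_config.json` as above:

```python
import json
import logging
import logging.config

# Load and apply the same JSON file that VLLM_LOGGING_CONFIG_PATH will point at;
# dictConfig raises if the configuration does not match the expected schema.
with open("/path/to/logging_config.json") as f:
    logging.config.dictConfig(json.load(f))

# Emit a test record through the configured "vllm" logger to confirm the
# JSON formatter and console handler behave as expected.
logging.getLogger("vllm").info("custom logging configuration loaded")
```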
### Example 2: Silence a particular vLLM logger

To silence a particular vLLM logger, provide custom logging configuration for
the target logger that stops it from propagating its log messages to the root
vLLM logger.

When custom configuration is provided for any logger, it is also necessary to
provide configuration for the root vLLM logger, since any custom logger
configuration overrides the built-in default logging configuration used by
vLLM.

First, create an appropriate JSON logging configuration file that includes
configuration for the root vLLM logger and for the logger you wish to silence:

**/path/to/logging_config.json:**
```json
{
  "formatters": {
    "vllm": {
      "class": "vllm.logging.NewLineFormatter",
      "datefmt": "%m-%d %H:%M:%S",
      "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
    }
  },
  "handlers": {
    "vllm": {
      "class": "logging.StreamHandler",
      "formatter": "vllm",
      "level": "INFO",
      "stream": "ext://sys.stdout"
    }
  },
  "loggers": {
    "vllm": {
      "handlers": ["vllm"],
      "level": "DEBUG",
      "propagate": false
    },
    "vllm.example_noisy_logger": {
      "propagate": false
    }
  },
  "version": 1
}
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:

```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
```
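The silencing relies on standard `logging` propagation rules rather than anything vLLM-specific; a minimal standard-library sketch of the same idea:

```python
import logging

# Mirror the JSON config above: the root "vllm" logger gets a console handler.
root = logging.getLogger("vllm")
root.addHandler(logging.StreamHandler())
root.setLevel(logging.DEBUG)

# The noisy child logger neither propagates nor has handlers of its own,
# so its records are silently discarded.
noisy = logging.getLogger("vllm.example_noisy_logger")
noisy.propagate = False

root.info("this record is printed")
noisy.info("this record is dropped")
```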
### Example 3: Disable vLLM default logging configuration

To disable vLLM's default logging configuration and silence all vLLM loggers,
simply set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This prevents vLLM
from configuring the root vLLM logger, which in turn silences all other vLLM
loggers.

```bash
VLLM_CONFIGURE_LOGGING=0 \
    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
```
## Additional resources

- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
**samples/cpu_offload.py:**

```python
from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM, offloading 10 GiB of model weights to CPU memory.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
**samples/lora_with_quantization.py:**

```python
import gc
from typing import List, Optional, Tuple

import torch
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.lora.request import LoRARequest


def create_test_prompts(
        lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    return [
        # This is an example of using quantization without LoRA.
        ("My name is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        # The next three examples use quantization with LoRA.
        ("my name is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128),
         LoRARequest("lora-test-1", 1, lora_path)),
        ("The capital of USA is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128),
         LoRARequest("lora-test-2", 1, lora_path)),
        ("The capital of France is",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128),
         LoRARequest("lora-test-3", 1, lora_path)),
    ]


def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Continuously process a list of prompts and handle the outputs."""
    request_id = 0

    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(str(request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            request_id += 1

        request_outputs: List[RequestOutput] = engine.step()
        for request_output in request_outputs:
            if request_output.finished:
                print("----------------------------------------------------")
                print(f"Prompt: {request_output.prompt}")
                print(f"Output: {request_output.outputs[0].text}")


def initialize_engine(model: str, quantization: str,
                      lora_repo: Optional[str]) -> LLMEngine:
    """Initialize the LLMEngine."""

    if quantization == "bitsandbytes":
        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
        # It quantizes the model when loading, with some config info from the
        # LoRA adapter repo, so the load_format and qlora_adapter_name_or_path
        # parameters need to be set as below.
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 qlora_adapter_name_or_path=lora_repo,
                                 load_format="bitsandbytes",
                                 enable_lora=True,
                                 max_lora_rank=64)
    else:
        engine_args = EngineArgs(model=model,
                                 quantization=quantization,
                                 enable_lora=True,
                                 max_loras=4)
    return LLMEngine.from_engine_args(engine_args)


def main():
    """Main function that sets up and runs the prompt processing."""

    test_configs = [{
        "name": "qlora_inference_example",
        "model": "huggyllama/llama-7b",
        "quantization": "bitsandbytes",
        "lora_repo": "timdettmers/qlora-flan-7b"
    }, {
        "name": "AWQ_inference_with_lora_example",
        "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
        "quantization": "awq",
        "lora_repo": "jashing/tinyllama-colorist-lora"
    }, {
        "name": "GPTQ_inference_with_lora_example",
        "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        "quantization": "gptq",
        "lora_repo": "jashing/tinyllama-colorist-lora"
    }]

    for test_config in test_configs:
        print(
            f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~"
        )
        engine = initialize_engine(test_config['model'],
                                   test_config['quantization'],
                                   test_config['lora_repo'])
        lora_path = snapshot_download(repo_id=test_config['lora_repo'])
        test_prompts = create_test_prompts(lora_path)
        process_requests(engine, test_prompts)

        # Clean up the GPU memory for the next test.
        del engine
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == '__main__':
    main()
```
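For quick experiments, the same combination of quantization and LoRA can also be driven through vLLM's higher-level `LLM` API instead of the explicit `LLMEngine` loop above. A minimal sketch, reusing the AWQ model and adapter repo from the test configs (an illustration, not part of the sample):

```python
from huggingface_hub import snapshot_download

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Download the same LoRA adapter used in the AWQ test config above.
lora_path = snapshot_download(repo_id="jashing/tinyllama-colorist-lora")

# Quantized base model with LoRA support enabled.
llm = LLM(model="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
          quantization="awq",
          enable_lora=True)

outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=128),
                       lora_request=LoRARequest("colorist", 1, lora_path))
print(outputs[0].outputs[0].text)
```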