diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md
index 204f93215..286043987 100644
--- a/docs/configuration/env_vars.md
+++ b/docs/configuration/env_vars.md
@@ -23,7 +23,9 @@ This document lists the supported diagnostic and profiling, as well as performan
 | `VLLM_EXPONENTIAL_BUCKETING` | Enables exponential bucket spacing instead of linear spacing. | `true` |
 | `VLLM_BUCKETING_FROM_FILE` | Enables reading bucket configuration from file | `None` |
 
-## Experimental Parameters
+## Developer Mode Parameters
+
+To enable developer mode, set `VLLM_DEVELOPER_MODE`:
 
 | Parameter name | Description | Default value |
 | ------------------ | ------------------------ | ------------- |
diff --git a/vllm_gaudi/extension/features.py b/vllm_gaudi/extension/features.py
index fd6275524..d657fc279 100644
--- a/vllm_gaudi/extension/features.py
+++ b/vllm_gaudi/extension/features.py
@@ -12,7 +12,7 @@ def get_user_flags():
     flags = [
-        Env('VLLM_ENABLE_EXPERIMENTAL_FLAGS', boolean),
+        Env('VLLM_DEVELOPER_MODE', boolean),
         Env('VLLM_EXPONENTIAL_BUCKETING', boolean),
         Env('VLLM_PROMPT_BS_BUCKET_MIN', int),
         Env('VLLM_PROMPT_BS_BUCKET_STEP', int),
diff --git a/vllm_gaudi/extension/runtime.py b/vllm_gaudi/extension/runtime.py
index 629a1bcb1..f800843b5 100644
--- a/vllm_gaudi/extension/runtime.py
+++ b/vllm_gaudi/extension/runtime.py
@@ -63,10 +63,11 @@ def finalize_config():
     user_flags = filter_defined(detected, USER_FLAGS)
     experimental_flags = filter_defined(detected, EXPERIMENTAL_FLAGS)
+    experimental_flags = {k: v for k, v in experimental_flags.items() if k not in user_flags}
     environment_values = filter_defined(detected, ENVIRONMENT_VALUES)
     feature_values = filter_defined(detected, FEATURE_VALUES)
 
-    if len(experimental_flags) > 0 and not detected.VLLM_ENABLE_EXPERIMENTAL_FLAGS:
+    if len(experimental_flags) > 0 and not detected.VLLM_DEVELOPER_MODE:
         asterisks = 48 * '*'
         header = f"{asterisks} Warning! {asterisks}"
         footer = '*' * len(header)
@@ -74,7 +75,7 @@
         logger().warning(
             f"Following environment variables are considered experimental: {', '.join(experimental_flags)}")
         logger().warning(
-            "From v0.12.0 release using those flags without VLLM_ENABLE_EXPERIMENTAL_FLAGS will trigger a fatal error.")
+            "From v0.12.0 release using those flags without VLLM_DEVELOPER_MODE will trigger a fatal error.")
         logger().warning(footer)
 
     dump('Environment', environment_values)
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 8c16cdda8..de51c390b 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -8,6 +8,7 @@
 import os
 import sys
 import time
+from tqdm import tqdm
 from dataclasses import dataclass, field, fields
 from typing import (TYPE_CHECKING, Any, Callable, Optional, TypeAlias, Union, cast)
@@ -3610,7 +3611,7 @@ def log_warmup(self, phase, i, max_i, first_dim, second_dim, third_dim, causal=F
                f"query_len:{second_dim} "
                f"num_blocks:{third_dim} "
                f"free_mem:{free_mem}")
-        logger.info(msg)
+        tqdm.write(msg)
 
     def log_warmup_multimodal(self, phase, i, max_i, batch_size, seq_len, img_args):
         free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory())
@@ -3760,45 +3761,57 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
         idx = 0
         num_candidates = len(buckets)
         captured_all = True
-        for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
-            if seq_len > self.max_num_tokens:
-                continue
-            # Graph memory usage is proportional to seq dimension in a batch
-            phase = f"Graph/{'prompt' if is_prompt else 'decode'}"
-            if is_prompt:
-                batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
-            else:
-                batch_seq = batch_size
-
-            graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt)
-            if graphed_bucket in self.graphed_buckets:
-                continue
-            self.graphed_buckets.add(graphed_bucket)
-            self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
-            prompt_cfg, decode_cfg = None, None
-            with HabanaMemoryProfiler() as mem_prof:
+        developer_settings = get_config().VLLM_DEVELOPER_MODE
+        phase = 'Prompt' if is_prompt else 'Decode'
+        desc = f'{phase} warmup processing: '
+        with tqdm(total=num_candidates, desc=desc, unit="item") as pbar:
+            for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
+                if seq_len > self.max_num_tokens:
+                    continue
+                # Graph memory usage is proportional to seq dimension in a batch
                 if is_prompt:
-                    prompt_cfg = (batch_size, seq_len, num_blocks)
+                    batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
                 else:
-                    decode_cfg = (batch_size, 1, num_blocks)
-                self._prepare_dummy_scenario(prompt_cfg, decode_cfg)
-            # TODO(kzawora): align_workers
-            used_mem = mem_prof.consumed_device_memory
-            total_mem += used_mem
-            total_batch_seq += batch_seq
+                    batch_seq = batch_size
+
+                graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt)
+                if graphed_bucket in self.graphed_buckets:
+                    continue
+                self.graphed_buckets.add(graphed_bucket)
+                if developer_settings:
+                    self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
+                prompt_cfg, decode_cfg = None, None
+                with HabanaMemoryProfiler() as mem_prof:
+                    if is_prompt:
+                        prompt_cfg = (batch_size, seq_len, num_blocks)
+                    else:
+                        decode_cfg = (batch_size, 1, num_blocks)
+                    self._prepare_dummy_scenario(prompt_cfg, decode_cfg)
+                    # TODO(kzawora): align_workers
+                    used_mem = mem_prof.consumed_device_memory
+                    total_mem += used_mem
+                    total_batch_seq += batch_seq
+
+                pbar.set_postfix_str(f"{idx}/{num_candidates}")
+                pbar.update(1)
 
         return total_mem, total_batch_seq, captured_all
 
     def warmup_unified_graphs(self, buckets, kv_cache):
         idx = 0
         num_candidates = len(buckets)
-        for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
-            unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
-            if unified_cfg in self.graphed_buckets:
-                continue
-            self.graphed_buckets.add(unified_cfg)
-            self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
-            self._prepare_dummy_unified_scenario(unified_cfg)
+        developer_settings = get_config().VLLM_DEVELOPER_MODE
+        with tqdm(total=num_candidates, desc="Unified Attention warmup", unit="item") as pbar:
+            for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
+                unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
+                if unified_cfg in self.graphed_buckets:
+                    continue
+                self.graphed_buckets.add(unified_cfg)
+                if developer_settings:
+                    self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
+                self._prepare_dummy_unified_scenario(unified_cfg)
+                pbar.set_postfix_str(f"{idx}/{num_candidates}")
+                pbar.update(1)
 
     def _add_dummy_request(self, requests,
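Reviewer note: below is a minimal, self-contained sketch (not part of the patch; `run_warmup_step` and the environment-variable lookup are illustrative placeholders) of the pattern the changes above adopt: a `tqdm` progress bar that always advances per bucket, while the detailed per-bucket lines go through `tqdm.write` only when `VLLM_DEVELOPER_MODE` is enabled, so they print above the bar instead of corrupting it.

```python
# Illustrative sketch only; not part of the diff above.
import os
from tqdm import tqdm


def run_warmup_step(bucket):
    """Placeholder for the real per-bucket warmup work."""


def warmup(buckets):
    # Assumed flag semantics for this sketch: any truthy value enables developer mode.
    developer_mode = os.environ.get("VLLM_DEVELOPER_MODE", "0").lower() in ("1", "true")
    with tqdm(total=len(buckets), desc="Warmup", unit="item") as pbar:
        for idx, bucket in enumerate(buckets):
            if developer_mode:
                # tqdm.write prints above the active bar without breaking it,
                # which is why log_warmup switches from logger.info to tqdm.write.
                tqdm.write(f"[Warmup][{idx + 1}/{len(buckets)}] bucket={bucket}")
            run_warmup_step(bucket)
            pbar.set_postfix_str(f"{idx}/{len(buckets)}")
            pbar.update(1)


if __name__ == "__main__":
    warmup([(1, 128), (2, 256), (4, 512)])
```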