diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6d14162d0..b81bdeaa0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,13 +3,23 @@ ci: autoupdate_schedule: quarterly repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-json + - id: check-yaml + - id: debug-statements + - id: mixed-line-ending + args: [--fix=lf] + - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.5.5 hooks: - id: insert-license files: | (?x)^( - auto_round/.*(py|yaml|yml|sh) + auto_round/.*(py|yaml|yml|sh)| + auto_round_extension/.*(py|yaml|yml|sh) )$ args: [ @@ -26,7 +36,14 @@ repos: args: [-w] additional_dependencies: - tomli - exclude: | - (?x)^( - examples/.*(txt|patch) - )$ + + - repo: https://github.com/pycqa/isort + rev: 6.0.1 + hooks: + - id: isort + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.12.4 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix, --no-cache] diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 1a8c885d7..003035922 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -13,6 +13,7 @@ # limitations under the License. import sys + def run_eval(): from auto_round.script.llm import setup_eval_parser args = setup_eval_parser() @@ -58,7 +59,7 @@ def run_fast(): def run_mllm(): if "--eval" in sys.argv: - from auto_round.script.llm import setup_eval_parser, eval + from auto_round.script.llm import eval, setup_eval_parser sys.argv.remove("--eval") args = setup_eval_parser() args.mllm = True @@ -76,7 +77,7 @@ def run_mllm(): def run_lmms(): # from auto_round.script.lmms_eval import setup_lmms_args, eval - from auto_round.script.mllm import setup_lmms_parser, lmms_eval + from auto_round.script.mllm import lmms_eval, setup_lmms_parser args = setup_lmms_parser() lmms_eval(args) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 6a11b075f..056308f41 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -12,54 +12,62 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import os import re import sys +import time +from typing import Any, Union +import accelerate import torch -import copy -import time -from typing import Union, Any -from transformers import set_seed from torch import autocast from tqdm import tqdm -import accelerate +from transformers import set_seed +from auto_round.data_type import QUANT_FUNC_WITH_DTYPE from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType -from auto_round.wrapper import WrapperMultiblock, wrapper_block, unwrapper_block, WrapperLinear, unwrapper_layer +from auto_round.low_cpu_mem.utils import get_layers_before_block from auto_round.utils import ( + SUPPORTED_DTYPES, + SUPPORTED_LAYER_TYPES, + TORCH_VERSION_AT_LEAST_2_6, CpuInfo, + _gguf_args_check, block_forward, check_is_cpu, + check_seqlen_compatible, + check_skippable_keywords, check_to_quantized, + clear_memory, collect_best_params, + compile_func, convert_dtype_str2torch, detect_device, + find_matching_blocks, + flatten_list, get_block_names, + get_layer_config_by_gguf_format, + get_layer_features, + get_layer_names_in_block, + get_lm_head_name, get_module, + get_shared_keys, htcore, + infer_bits_by_data_type, + init_cache, + is_debug_mode, is_optimum_habana_available, + llm_load_model, logger, - to_device, - to_dtype, - get_layer_names_in_block, mv_module_from_gpu, - unsupport_meta_device, clear_memory, - compile_func, - find_matching_blocks, is_debug_mode, - TORCH_VERSION_AT_LEAST_2_6, - SUPPORTED_LAYER_TYPES, - get_layer_features, - set_module, - llm_load_model, reset_params, - init_cache, check_skippable_keywords, get_shared_keys, SUPPORTED_DTYPES, infer_bits_by_data_type, - _gguf_args_check, - check_seqlen_compatible, - get_layer_config_by_gguf_format, get_lm_head_name, flatten_list + set_module, + to_device, + to_dtype, + unsupport_meta_device, ) -from auto_round.data_type import QUANT_FUNC_WITH_DTYPE -from auto_round.low_cpu_mem.utils import get_layers_before_block +from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block class AutoRound(object): @@ -232,9 +240,9 @@ def __init__( self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device ##activation - self.act_group_size = act_group_size if not (act_group_size is None) else self.group_size - self.act_bits = act_bits if not (act_bits is None) else self.bits - self.act_sym = act_sym if not (act_sym is None) else self.sym + self.act_group_size = act_group_size if act_group_size is not None else self.group_size + self.act_bits = act_bits if act_bits is not None else self.bits + self.act_sym = act_sym if act_sym is not None else self.sym self.act_dynamic = act_dynamic self.act_data_type = act_data_type if self.act_data_type is None: @@ -272,7 +280,7 @@ def __init__( self.enable_torch_compile = enable_torch_compile if not self.enable_torch_compile and TORCH_VERSION_AT_LEAST_2_6 and self.act_bits > 8 and not is_debug_mode() \ - and self.low_cpu_mem_usage != True and "fp8" not in self.data_type and "fp8" not in self.act_data_type: + and not self.low_cpu_mem_usage and "fp8" not in self.data_type and "fp8" not in self.act_data_type: logger.info("'enable_torch_compile' is set to `False` by default. 
" \ "Enabling it can reduce tuning cost by 20%, but it might throw an exception.") @@ -280,7 +288,7 @@ def __init__( self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as activation quantization is enabled") - if self.low_cpu_mem_usage == True and self.enable_torch_compile: + if self.low_cpu_mem_usage and self.enable_torch_compile: self.enable_torch_compile = False logger.warning("reset enable_torch_compile to `False` as low_cpu_mem_usage is enabled") @@ -535,7 +543,7 @@ def parse_format_to_list(self, format: str) -> list: only_gguf = False if only_gguf: self.scale_dtype = torch.float32 - logger.info(f"change `scale_dtype` to `torch.float32`") + logger.info("change `scale_dtype` to `torch.float32`") # Adjust format settings based on compatibility for index in range(len(formats)): @@ -592,7 +600,7 @@ def _check_supported_format(self, format: str) -> bool: f" group_size={self.group_size}, sym={self.sym}, act_bits={self.act_bits}" elif format != "fake": logger.warning( - f"Currently only support to export auto_round format quantized model" + "Currently only support to export auto_round format quantized model" " with fp8 dtype activation for activation quantization." " Change format to fake and save." ) @@ -1320,7 +1328,7 @@ def quantize(self): keys = inputs.keys() input_id_str = [key for key in keys if key.startswith('hidden_state')] if len(input_id_str) != 1: - raise RuntimeError(f"hidden_states arg mismatch error," + raise RuntimeError("hidden_states arg mismatch error," "please raise an issue in https://github.com/intel/auto-round/issues") inputs["input_ids"] = inputs.pop(input_id_str[0], None) if q_inputs is not None: @@ -1853,10 +1861,10 @@ def forward(m, hidden_states=None, *positional_inputs, **kwargs): self.batch_dim = 1 if len(hidden_states.shape) > 1 and hidden_states.shape[1] > self.batch_size: logger.error( - f"this model has not been supported, " - f"please raise an issue in https://github.com/intel/auto-round/issues" - f" or try to set the `batch_size` to 1 and " - f"`gradient_accumulate_steps` to your current batch size.") + "this model has not been supported, " + "please raise an issue in https://github.com/intel/auto-round/issues" + " or try to set the `batch_size` to 1 and " + "`gradient_accumulate_steps` to your current batch size.") exit(-1) if hidden_states is not None: @@ -2094,8 +2102,7 @@ def get_act_max_hook(module, input, output): hook_handles = [] for n, m in model.named_modules(): - # for block - if hasattr(m, "act_dynamic") and m.act_dynamic == False and check_to_quantized(m): + if hasattr(m, "act_dynamic") and not m.act_dynamic and check_to_quantized(m): hook = m.register_forward_hook(get_act_max_hook) hook_handles.append(hook) continue @@ -2405,7 +2412,7 @@ def quant_blocks( model_type=model_type) else: PACKING_LAYER_WITH_FORMAT[target_backend](tmp_m.tmp_name, self.model, self.formats[0]) - pbar.set_description(f"Quantizing done") + pbar.set_description("Quantizing done") pbar.update(1) pbar.close() diff --git a/auto_round/calib_dataset.py b/auto_round/calib_dataset.py index 6fdd2b32b..de4eba825 100644 --- a/auto_round/calib_dataset.py +++ b/auto_round/calib_dataset.py @@ -14,12 +14,12 @@ import json import random +import sys import torch -from datasets import Dataset, IterableDataset, load_dataset, concatenate_datasets -from datasets import Features, Sequence, Value +from datasets import Dataset, Features, IterableDataset, Sequence, Value, concatenate_datasets, load_dataset from torch.utils.data import DataLoader -import sys 
+ from .utils import is_local_path, logger CALIB_DATASETS = {} @@ -136,7 +136,7 @@ def get_pile_dataset(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", split error_message = str(e) # Check for proxy or SSL error if "proxy" in error_message.lower() or isinstance(e, ssl.SSLError) or "SSL" in error_message.upper(): - logger.error(f"Network error detected, please checking proxy settings." \ + logger.error("Network error detected, please check proxy settings." \ "Error: {error_message}. Or consider using a backup dataset by `pip install modelscope`" \ " and set '--dataset swift/pile-val-backup' in AutoRound API.") else: diff --git a/auto_round/data_type/fp8.py b/auto_round/data_type/fp8.py index 70873e6ed..6253d662e 100644 --- a/auto_round/data_type/fp8.py +++ b/auto_round/data_type/fp8.py @@ -13,9 +13,15 @@ # limitations under the License. import torch -from auto_round.data_type.utils import get_gaudi_fp8_ste_func, float8_e4m3fn_ste, reshape_pad_tensor_by_group_size, \ - revert_tensor_by_pad, float8_e5m2_ste + from auto_round.data_type.register import register_dtype +from auto_round.data_type.utils import ( + float8_e4m3fn_ste, + float8_e5m2_ste, + get_gaudi_fp8_ste_func, + reshape_pad_tensor_by_group_size, + revert_tensor_by_pad, +) @register_dtype(("fp8_sym","fp8","fp8_e4m3")) diff --git a/auto_round/data_type/gguf.py b/auto_round/data_type/gguf.py index eba6db810..0d094f41d 100644 --- a/auto_round/data_type/gguf.py +++ b/auto_round/data_type/gguf.py @@ -13,8 +13,9 @@ # limitations under the License. import torch -from auto_round.data_type.utils import round_ste, reshape_pad_tensor_by_group_size, revert_tensor_by_pad, logger + from auto_round.data_type.register import register_dtype +from auto_round.data_type.utils import logger, reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste from auto_round.utils import get_reciprocal @@ -506,7 +507,7 @@ def quant_tensor_gguf_sym_dq( Returns: Quantized and de-quantized tensor, scale, zero-point """ - from auto_round.export.export_to_gguf.config import QK_K, K_SCALE_SIZE, GGML_QUANT_SIZES + from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, K_SCALE_SIZE, QK_K from auto_round.export.export_to_gguf.packing import make_q3_quants, make_qx_quants if bits not in [3, 6]: diff --git a/auto_round/data_type/int.py b/auto_round/data_type/int.py index 5367b6925..be5d7cf33 100644 --- a/auto_round/data_type/int.py +++ b/auto_round/data_type/int.py @@ -13,8 +13,9 @@ # limitations under the License. import torch -from auto_round.data_type.utils import round_ste, reshape_pad_tensor_by_group_size, revert_tensor_by_pad + from auto_round.data_type.register import register_dtype +from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste @register_dtype("int_sym") diff --git a/auto_round/data_type/mxfp.py b/auto_round/data_type/mxfp.py index af115311d..f5ad2aa08 100644 --- a/auto_round/data_type/mxfp.py +++ b/auto_round/data_type/mxfp.py @@ -13,8 +13,9 @@ # limitations under the License. 
import torch -from auto_round.data_type.utils import floor_ste, round_ste, reshape_pad_tensor_by_group_size, revert_tensor_by_pad -from auto_round.data_type.register import register_dtype, QUANT_FUNC_WITH_DTYPE + +from auto_round.data_type.register import QUANT_FUNC_WITH_DTYPE, register_dtype +from auto_round.data_type.utils import floor_ste, reshape_pad_tensor_by_group_size, revert_tensor_by_pad, round_ste MXFP_FORMAT_CACHE = { # data type: ebits, mbits, emax, max_norm, min_norm diff --git a/auto_round/data_type/nvfp.py b/auto_round/data_type/nvfp.py index d1293e899..147264c68 100644 --- a/auto_round/data_type/nvfp.py +++ b/auto_round/data_type/nvfp.py @@ -16,7 +16,7 @@ from auto_round.data_type.fp8 import float8_e4m3fn_ste from auto_round.data_type.register import register_dtype -from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad, logger +from auto_round.data_type.utils import logger, reshape_pad_tensor_by_group_size, revert_tensor_by_pad # taken from diff --git a/auto_round/data_type/utils.py b/auto_round/data_type/utils.py index 5265d55fc..748417038 100644 --- a/auto_round/data_type/utils.py +++ b/auto_round/data_type/utils.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from functools import lru_cache + import torch + from auto_round.data_type.register import QUANT_FUNC_WITH_DTYPE -from functools import lru_cache from auto_round.utils import logger + def reshape_pad_tensor_by_group_size(data: torch.Tensor, group_size: int): """Reshapes and pads the tensor to ensure that it can be quantized in groups of `group_size`. diff --git a/auto_round/data_type/w4fp8.py b/auto_round/data_type/w4fp8.py index 9b5123759..1dd550b22 100644 --- a/auto_round/data_type/w4fp8.py +++ b/auto_round/data_type/w4fp8.py @@ -15,8 +15,7 @@ import torch from auto_round.data_type.register import register_dtype -from auto_round.data_type.utils import get_gaudi_fp8_ste_func, float8_e4m3fn_ste - +from auto_round.data_type.utils import float8_e4m3fn_ste, get_gaudi_fp8_ste_func # @register_dtype("fp8_gaudi3_to_int_sym") # def progressive_quant_fp8_int4_gaudi3( diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 5536c48c4..293379bab 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from typing import Optional, Union from lm_eval import simple_evaluate as lm_simple_evaluate -import os os.environ["TOKENIZERS_PARALLELISM"] = "false" diff --git a/auto_round/export/export_to_autogptq/export.py b/auto_round/export/export_to_autogptq/export.py index 34dd7aea0..d74040419 100644 --- a/auto_round/export/export_to_autogptq/export.py +++ b/auto_round/export/export_to_autogptq/export.py @@ -12,6 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy +import inspect +import json +import os +from concurrent.futures import ThreadPoolExecutor + +import threadpoolctl as tctl + # MIT License # # Copyright (c) 2023 潘其威(William) @@ -34,22 +42,21 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import torch - -import auto_round.export.export_to_autogptq.qlinear_triton -from auto_round.utils import check_to_quantized, get_block_names, \ - get_module, logger, set_module, SUPPORTED_LAYER_TYPES, filter_quantization_config -import copy -import json -import os - import torch.nn as nn import transformers - -import threadpoolctl as tctl -import inspect from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor -from auto_round.utils import get_autogptq_packing_qlinear + +import auto_round.export.export_to_autogptq.qlinear_triton +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + filter_quantization_config, + get_autogptq_packing_qlinear, + get_block_names, + get_module, + logger, + set_module, +) BLOCK_PATTERNS = [ ## copy from transformers optimum "transformer.h", @@ -142,7 +149,7 @@ def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exll flattened_list = [item for sublist in all_blocks for item in sublist] common_prefix = os.path.commonprefix(flattened_list).rstrip('.') if common_prefix not in BLOCK_PATTERNS: - logger.error(f"auto-gptq format may not support loading this quantized model") + logger.error("auto-gptq format may not support loading this quantized model") quantization_config['block_name_to_quantize'] = common_prefix quantization_config.pop("to_quant_block_names", None) diff --git a/auto_round/export/export_to_autogptq/qlinear_triton.py b/auto_round/export/export_to_autogptq/qlinear_triton.py index c51920109..671737d3b 100644 --- a/auto_round/export/export_to_autogptq/qlinear_triton.py +++ b/auto_round/export/export_to_autogptq/qlinear_triton.py @@ -19,6 +19,7 @@ import torch.nn as nn import transformers + class TritonModuleMixin: @classmethod def warmup(cls, model, transpose=False, seqlen=2048): diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 10da020a2..a0ac9ae8b 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -14,20 +14,28 @@ import copy +import inspect import json import os +from concurrent.futures import ThreadPoolExecutor +import threadpoolctl as tctl import torch import torch.nn as nn import transformers - -from auto_round.utils import get_module, logger, set_module, SUPPORTED_LAYER_TYPES, check_to_quantized, \ - filter_quantization_config, SUPPORTED_FORMATS -import threadpoolctl as tctl -import inspect from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor -from auto_round.utils import get_autogptq_packing_qlinear, check_start_with_block_name + +from auto_round.utils import ( + SUPPORTED_FORMATS, + SUPPORTED_LAYER_TYPES, + check_start_with_block_name, + check_to_quantized, + filter_quantization_config, + get_autogptq_packing_qlinear, + get_module, + logger, + set_module, +) def check_neq_config(config, data_type, bits, group_size, sym): @@ -83,7 +91,7 @@ def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_ elif "awq" in backend: from ..export_to_awq.utils import WQLinear_GEMM return WQLinear_GEMM - elif "gptq" in backend and not "gptqmodel" in backend: ## have g_idx + elif "gptq" in backend and "gptqmodel" not in backend: ## have g_idx return get_autogptq_packing_qlinear(backend, bits, group_size, sym) else: raise ValueError( @@ -261,7 +269,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex return save_quantized_as_autoround(output_dir, inplace=inplace, backend="auto_round", **kwargs) 
##if using sym, we change to gptq sym kernel to avoid compiling from auto_round source - if (kwargs.get("sym") is None or kwargs.get("sym") == True) and ("gptq" not in backend and "awq" not in backend): + if (kwargs.get("sym") is None or kwargs.get("sym")) and ("gptq" not in backend and "awq" not in backend): backend = backend.replace('auto_round', 'auto_round:auto_gptq') model = kwargs["model"] @@ -279,8 +287,8 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex processor = kwargs.get("processor", None) extra_config = {} block_name_to_quantize = quantization_config["block_name_to_quantize"] - if isinstance(block_name_to_quantize, str): \ - block_name_to_quantize = block_name_to_quantize.split(",") + if isinstance(block_name_to_quantize, str): + block_name_to_quantize = block_name_to_quantize.split(",") elif isinstance(block_name_to_quantize, list): for i in range(len(block_name_to_quantize)): block_name_to_quantize[i] = os.path.commonprefix(block_name_to_quantize[i]).rstrip('.') diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index d29e021c2..02ef4d755 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -13,15 +13,24 @@ # limitations under the License. import copy -import os -import torch import json -from auto_round.utils import logger, set_module, SUPPORTED_LAYER_TYPES, check_to_quantized, \ - filter_quantization_config, get_module, check_start_with_block_name +import os +from concurrent.futures import ThreadPoolExecutor + import threadpoolctl as tctl +import torch import transformers from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor + +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_start_with_block_name, + check_to_quantized, + filter_quantization_config, + get_module, + logger, + set_module, +) def check_neq_config(config, data_type, bits, group_size, sym): @@ -169,8 +178,8 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round", processor = kwargs.get("processor", None) extra_config = {} block_name_to_quantize = quantization_config["block_name_to_quantize"] - if isinstance(block_name_to_quantize, str): \ - block_name_to_quantize = block_name_to_quantize.split(",") + if isinstance(block_name_to_quantize, str): + block_name_to_quantize = block_name_to_quantize.split(",") elif isinstance(block_name_to_quantize, list): for i in range(len(block_name_to_quantize)): block_name_to_quantize[i] = os.path.commonprefix(block_name_to_quantize[i]).rstrip('.') diff --git a/auto_round/export/export_to_awq/export.py b/auto_round/export/export_to_awq/export.py index 4ef02662e..22c13048e 100644 --- a/auto_round/export/export_to_awq/export.py +++ b/auto_round/export/export_to_awq/export.py @@ -20,22 +20,28 @@ # SOFTWARE. 
-import os -import torch -import torch.nn as nn - -from auto_round.utils import (logger, get_module, - set_module, - check_to_quantized, - get_block_names, - extract_block_names_to_str, SUPPORTED_LAYER_TYPES, filter_quantization_config) import copy import json -from auto_round.export.export_to_awq.utils import WQLinear_GEMM +import os from concurrent.futures import ThreadPoolExecutor + import threadpoolctl as tctl +import torch +import torch.nn as nn from tqdm import tqdm +from auto_round.export.export_to_awq.utils import WQLinear_GEMM +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_to_quantized, + extract_block_names_to_str, + filter_quantization_config, + get_block_names, + get_module, + logger, + set_module, +) + def pack_layer(name, model, backend): if name == "lm_head": ##dese not support lm-head diff --git a/auto_round/export/export_to_awq/utils.py b/auto_round/export/export_to_awq/utils.py index 64e826e9e..72e218df5 100644 --- a/auto_round/export/export_to_awq/utils.py +++ b/auto_round/export/export_to_awq/utils.py @@ -33,8 +33,9 @@ # SOFTWARE. import gc -import torch import warnings + +import torch import torch.nn as nn from torch.autograd import Function diff --git a/auto_round/export/export_to_gguf/config.py b/auto_round/export/export_to_gguf/config.py index d2b6a821e..e6b71586b 100644 --- a/auto_round/export/export_to_gguf/config.py +++ b/auto_round/export/export_to_gguf/config.py @@ -13,6 +13,7 @@ # limitations under the License. from enum import IntEnum + class ModelType(IntEnum): TEXT = 1 MMPROJ = 2 diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 45b99d2f0..ea69b0bbd 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -34,29 +34,29 @@ from __future__ import annotations -import ast -import gc -import logging import argparse +import ast import contextlib +import gc import json +import logging +import math import os import re import sys -import psutil from enum import IntEnum -from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast -import math import numpy as np +import psutil import torch from transformers import AutoConfig -from auto_round.utils import logger, LazyImport, get_module, clean_module_parameter from auto_round.export.export_to_gguf.packing import ggml_quant +from auto_round.utils import LazyImport, clean_module_parameter, get_module, logger gguf = LazyImport("gguf") @@ -4741,8 +4741,7 @@ def phantom(tok): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("bert."): - name = name[5:] + name = name.removeprefix("bert.") if name.endswith(".gamma"): name = name[:-6] + ".weight" @@ -4798,6 +4797,7 @@ def _xlmroberta_set_vocab(self) -> None: raise FileNotFoundError(f"File not found: {tokenizer_path}") from base64 import b64decode + from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) @@ -4918,8 +4918,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if name.startswith("distilbert."): - name = 
name[11:] + name = name.removeprefix("distilbert.") # These layers act as MLM head, so we don't need them if name.startswith("vocab_"): @@ -4960,8 +4959,7 @@ def set_vocab(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] + name = name.removeprefix("roberta.") # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": @@ -5076,8 +5074,7 @@ def modify_tensors(self, data_torch, name, bid): if name.startswith("decoder."): return [] - if name.startswith("model."): - name = name[6:] + name = name.removeprefix("model.") return super().modify_tensors(data_torch, name, bid) @@ -5096,8 +5093,7 @@ def set_vocab(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] + name = name.removeprefix("roberta.") # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 14b8d0993..e42b49f95 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -13,14 +13,23 @@ # limitations under the License. import os -import sys import shutil -import torch -from pathlib import Path +import sys import time +from pathlib import Path + +import torch + from auto_round.export.export_to_gguf.convert import ModelBase, ModelType, get_model_architecture -from auto_round.utils import logger, LazyImport, get_block_names, flatten_list, check_to_quantized, get_module, \ - clear_memory +from auto_round.utils import ( + LazyImport, + check_to_quantized, + clear_memory, + flatten_list, + get_block_names, + get_module, + logger, +) TMP_DIR_NAME = "tmp_dir" diff --git a/auto_round/export/export_to_gguf/packing.py b/auto_round/export/export_to_gguf/packing.py index b377fdf28..2dfe3b82b 100644 --- a/auto_round/export/export_to_gguf/packing.py +++ b/auto_round/export/export_to_gguf/packing.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import torch import numpy as np +import torch -from auto_round.export.export_to_gguf.config import QK_K, K_SCALE_SIZE, GGML_QUANT_SIZES +from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, K_SCALE_SIZE, QK_K from auto_round.utils import get_reciprocal GGML_QUANT_TYPE = {} diff --git a/auto_round/export/export_to_itrex/export.py b/auto_round/export/export_to_itrex/export.py index 435eed66d..34aa1aed7 100644 --- a/auto_round/export/export_to_itrex/export.py +++ b/auto_round/export/export_to_itrex/export.py @@ -19,14 +19,14 @@ import torch import transformers + from auto_round.export.register import register_format -from auto_round.utils import get_module, logger, set_module, detect_device, check_to_quantized +from auto_round.utils import check_to_quantized, detect_device, get_module, logger, set_module from .config import QuantConfig from .model_wrapper import WeightOnlyLinear - def quant_weight_w_scale(weight, scale, zp, group_size=-1, device="cpu"): """Quant and dequant tensor with group size. diff --git a/auto_round/export/export_to_itrex/model_wrapper.py b/auto_round/export/export_to_itrex/model_wrapper.py index c8e985939..05d0ce45e 100644 --- a/auto_round/export/export_to_itrex/model_wrapper.py +++ b/auto_round/export/export_to_itrex/model_wrapper.py @@ -19,12 +19,13 @@ # since the model classes inherit torch.nn.Module. import math +import numpy as np import torch from packaging.version import Version from torch.autograd import Function from torch.nn import functional as F -import numpy as np -from auto_round.utils import logger, can_pack_with_numba + +from auto_round.utils import can_pack_with_numba, logger NF4 = [ -1.0, diff --git a/auto_round/export/export_to_llmcompressor/export.py b/auto_round/export/export_to_llmcompressor/export.py index a60338752..d9e5dc50d 100644 --- a/auto_round/export/export_to_llmcompressor/export.py +++ b/auto_round/export/export_to_llmcompressor/export.py @@ -13,10 +13,12 @@ # limitations under the License. 
import os + import torch -from auto_round.utils import get_module, logger, set_module, detect_device -from auto_round.wrapper import WrapperWALayer + from auto_round.export.export_to_llmcompressor.config import quantization_config +from auto_round.utils import detect_device, get_module, logger, set_module +from auto_round.wrapper import WrapperWALayer @torch.no_grad() diff --git a/auto_round/inference/auto_quantizer.py b/auto_round/inference/auto_quantizer.py index 3aea03e04..765a7e150 100644 --- a/auto_round/inference/auto_quantizer.py +++ b/auto_round/inference/auto_quantizer.py @@ -29,8 +29,9 @@ import importlib.util import warnings from dataclasses import dataclass +from enum import Enum from logging import getLogger -from typing import Any, Dict, Optional, Tuple, Union, List +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -39,9 +40,9 @@ from transformers.quantizers import AutoQuantizationConfig, HfQuantizer from transformers.quantizers.auto import AUTO_QUANTIZER_MAPPING from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod -from auto_round.utils import (is_hpu_supported) + from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init -from enum import Enum +from auto_round.utils import is_hpu_supported logger = getLogger(__name__) import sys diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index d1ad65f97..47bffe6dd 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -14,7 +14,7 @@ import functools from dataclasses import dataclass, field -from typing import List, Any, Optional +from typing import Any, List, Optional from transformers.utils.versions import require_version @@ -308,7 +308,7 @@ def check_compatible(backend_name, device, bits, group_size, sym, packing_format backend = BackendInfos[backend_name] # Check if device is supported by the backend - if not device in backend.device: + if device not in backend.device: return False # Check if bit-width is supported diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 1e6a565b5..6004e8acc 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -15,19 +15,31 @@ import re from logging import getLogger from typing import Union -from tqdm import tqdm + import torch import torch.nn as nn - +from tqdm import tqdm from transformers.pytorch_utils import Conv1D -from auto_round.utils import ( - get_module, set_module, is_hpu_supported, get_block_names, find_matching_blocks, - get_layer_names_in_block, check_to_quantized, check_start_with_block_name, SUPPORTED_LAYER_TYPES) - from auto_round.inference.backend import ( - get_layer_backend, dynamic_import_inference_linear, find_backend, BackendInfos, get_highest_priority_backend, - process_requirement) + BackendInfos, + dynamic_import_inference_linear, + find_backend, + get_highest_priority_backend, + get_layer_backend, + process_requirement, +) +from auto_round.utils import ( + SUPPORTED_LAYER_TYPES, + check_start_with_block_name, + check_to_quantized, + find_matching_blocks, + get_block_names, + get_layer_names_in_block, + get_module, + is_hpu_supported, + set_module, +) logger = getLogger(__name__) @@ -69,7 +81,9 @@ def get_keys_to_not_convert(model): Input model """ from copy import deepcopy + from accelerate.utils import find_tied_parameters + # Create a copy of the model and tie the weights, then # 
check if it contains tied weights tied_model = deepcopy(model) # this has 0 cost since it is done inside `init_empty_weights` context manager` @@ -508,7 +522,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): quantization_config = model.config.quantization_config - if hasattr(quantization_config, "desc_act") and quantization_config.desc_act == True: + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( not hasattr(quantization_config, "static_groups")): diff --git a/auto_round/low_cpu_mem/load.py b/auto_round/low_cpu_mem/load.py index 1ff291b16..44a8f2b0b 100644 --- a/auto_round/low_cpu_mem/load.py +++ b/auto_round/low_cpu_mem/load.py @@ -33,7 +33,6 @@ ) from ..low_cpu_mem import modified_pickle as pickle - from .utils import torch torch_version = torch.__version__.split("+")[0] diff --git a/auto_round/low_cpu_mem/utils.py b/auto_round/low_cpu_mem/utils.py index f4165b4eb..715556f0f 100644 --- a/auto_round/low_cpu_mem/utils.py +++ b/auto_round/low_cpu_mem/utils.py @@ -16,13 +16,13 @@ # limitations under the License. """Utils for layer wise quantization.""" -import os import gc import json -import pickle -from functools import partial import logging +import os +import pickle from collections import OrderedDict +from functools import partial import torch from accelerate import init_empty_weights @@ -30,9 +30,10 @@ from transformers import AutoConfig, AutoModelForCausalLM from transformers.models.auto.auto_factory import _BaseAutoModelClass -from .load import load from auto_round.utils import detect_device +from .load import load + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(filename)s L%(lineno)d: %(message)s") logger = logging.getLogger("low_cpu_mem_tools") diff --git a/auto_round/mllm/autoround_mllm.py b/auto_round/mllm/autoround_mllm.py index 5ab25e92d..ed6d06611 100644 --- a/auto_round/mllm/autoround_mllm.py +++ b/auto_round/mllm/autoround_mllm.py @@ -12,29 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Union -from tqdm import tqdm from copy import deepcopy +from typing import Union import torch +from tqdm import tqdm + +from auto_round.special_model_handler import SUPPORT_ONLY_TEXT_MODELS, _handle_special_model +from ..autoround import AutoRound +from ..low_cpu_mem.utils import get_layers_before_block from ..utils import ( - logger, + clear_memory, detect_device, + extract_block_names_to_str, + find_matching_blocks, + get_block_names, + logger, + mllm_load_model, to_device, to_dtype, - get_block_names, - find_matching_blocks, - extract_block_names_to_str, - clear_memory, - mllm_load_model ) -from ..autoround import AutoRound -from .template import get_template, Template -from auto_round.special_model_handler import SUPPORT_ONLY_TEXT_MODELS, _handle_special_model from .mllm_dataset import get_mllm_dataloader -from ..low_cpu_mem.utils import get_layers_before_block - +from .template import Template, get_template def _only_text_test(model, tokenizer, device, model_type): @@ -198,7 +198,7 @@ def __init__( (dataset in CALIB_DATASETS.keys() and not \ _only_text_test(model, tokenizer, device, self.template.model_type)): if quant_nontext_module: - logger.warning(f"Text only dataset cannot be used for calibrating non-text modules," + logger.warning("Text only dataset cannot be used for calibrating non-text modules," "switching to liuhaotian/llava_conv_58k") else: logger.warning(f"{model.config.model_type} not support for {dataset}," diff --git a/auto_round/mllm/eval.py b/auto_round/mllm/eval.py index 997dea72d..dfe4aaa52 100644 --- a/auto_round/mllm/eval.py +++ b/auto_round/mllm/eval.py @@ -46,14 +46,15 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import json import os import time -import json from functools import partial -import pandas as pd -from ..utils import logger, LazyImport import numpy as np +import pandas as pd + +from ..utils import LazyImport, logger vlmeval = LazyImport("vlmeval") diff --git a/auto_round/mllm/mllm_dataset.py b/auto_round/mllm/mllm_dataset.py index 18b7d0fd5..953c3e0c2 100644 --- a/auto_round/mllm/mllm_dataset.py +++ b/auto_round/mllm/mllm_dataset.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import json +import os from typing import Dict import torch -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import DataLoader, Dataset from transformers import set_seed -from .utils import _extract_data_dir -from .template import Template -from ..utils import logger from ..special_model_handler import check_mllm_model_batch +from ..utils import logger +from .template import Template +from .utils import _extract_data_dir MLLM_DATASET: Dict[str, Dataset] = {} @@ -250,7 +250,7 @@ def get_mllm_dataloader( tokenizer, seqlen, dataset, seed, bs, nsamples) if quant_nontext_module: logger.error( - f"Text only dataset cannot be used for calibrating non-text modules," + "Text only dataset cannot be used for calibrating non-text modules," " switching to liuhaotian/llava_conv_58k") exit(-1) return dataloader, bs, gradient_accumulate_steps diff --git a/auto_round/mllm/template.py b/auto_round/mllm/template.py index 0782fd09c..1794cca06 100644 --- a/auto_round/mllm/template.py +++ b/auto_round/mllm/template.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import json +import os from dataclasses import dataclass -from typing import Dict, Optional, List from enum import Enum, unique +from typing import Dict, List, Optional from ..utils import logger -from .processor import BasicProcessor, PROCESSORS +from .processor import PROCESSORS, BasicProcessor TEMPLATES: Dict[str, "Template"] = {} diff --git a/auto_round/mllm/utils.py b/auto_round/mllm/utils.py index eebb2a189..90e67e855 100644 --- a/auto_round/mllm/utils.py +++ b/auto_round/mllm/utils.py @@ -13,6 +13,7 @@ # limitations under the License. import os + import requests from ..utils import LazyImport diff --git a/auto_round/script/llm.py b/auto_round/script/llm.py index 8faea4e8f..7463dbcc4 100644 --- a/auto_round/script/llm.py +++ b/auto_round/script/llm.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import argparse +import logging + # Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,16 +30,16 @@ # limitations under the License. import os import re -import argparse import sys -import logging + from auto_round.utils import ( - get_fp_layer_names, clear_memory, get_device_and_parallelism, + get_fp_layer_names, get_model_dtype, + set_cuda_visible_devices, str2bool, - set_cuda_visible_devices) +) os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" @@ -341,11 +344,9 @@ def tune(args): args.eval_bs = "auto" import transformers + from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer - from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoConfig - - from auto_round.utils import detect_device, get_library_version - from auto_round.utils import logger + from auto_round.utils import detect_device, get_library_version, logger if args.format is None: args.format = "auto_round" @@ -427,8 +428,8 @@ def tune(args): if lm_head_layer_name in item: ##TODO extend to encoder-decoder layer, seq classification model args.quant_lm_head = False logger.warning( - f"reset `quant_lm_head` to `False` as quantizing lm_head with tied weights has not been " - f"supported currently") + "reset `quant_lm_head` to `False` as quantizing lm_head with tied weights has not been " + "supported currently") break if args.quant_lm_head: @@ -670,8 +671,8 @@ def eval(args): eval_model_dtype = get_model_dtype(args.eval_model_dtype) if is_gguf_file: import torch - from transformers import AutoTokenizer, AutoModelForCausalLM from lm_eval.utils import make_table # pylint: disable=E0401 + from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file) logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") @@ -719,12 +720,13 @@ def eval_task_by_task( # load after _eval_int in order to make sure import torch after set CUDA_VISBILE_DEVICES import traceback - from auto_round.utils import logger + from lm_eval import simple_evaluate as lm_simple_evaluate from lm_eval.models.huggingface import HFLM from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRoundConfig # pylint: disable=E0611 + from auto_round.utils import logger if batch_size is None: batch_size = "auto:8" is_gguf_file = False diff --git a/auto_round/script/mllm.py b/auto_round/script/mllm.py index 2e913fd92..dabbc4014 100644 --- a/auto_round/script/mllm.py +++ b/auto_round/script/mllm.py @@ -12,17 +12,17 @@ # See the License for the 
specific language governing permissions and # limitations under the License. +import argparse import os import sys -import argparse from auto_round.utils import ( - get_fp_layer_names, clear_memory, - is_debug_mode, get_device_and_parallelism, - set_cuda_visible_devices, + get_fp_layer_names, + is_debug_mode, logger, + set_cuda_visible_devices, ) os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" @@ -284,8 +284,7 @@ def setup_lmeval_parser(): def tune(args): import transformers - - from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor + from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer if args.format is None: args.format = "auto_round" @@ -377,8 +376,8 @@ def tune(args): if lm_head_layer_name in item: ##TODO extend to encoder-decoder layer, seq classification model args.quant_lm_head = False print( - f"warning, disable quant_lm_head as quantizing lm_head with tied weights has not been " - f"supported currently") + "warning, disable quant_lm_head as quantizing lm_head with tied weights has not been " + "supported currently") break if args.quant_lm_head: layer_config[lm_head_layer_name] = {"bits": args.bits} @@ -390,8 +389,8 @@ def tune(args): if args.quant_lm_head and args.low_gpu_mem_usage: print( - f"warning, low_gpu_mem_usage=False is strongly recommended if the whole model could be loaded to " - f"gpu") + "warning, low_gpu_mem_usage=False is strongly recommended if the whole model could be loaded to " + "gpu") if "--truncation" not in sys.argv: args.truncation = None diff --git a/auto_round/testing_utils.py b/auto_round/testing_utils.py index be856083c..c859c6188 100644 --- a/auto_round/testing_utils.py +++ b/auto_round/testing_utils.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest import importlib.util +import unittest import torch - from transformers.utils.versions import require_version + def is_gguf_available(): return importlib.util.find_spec("gguf") is not None diff --git a/auto_round/utils.py b/auto_round/utils.py index f64c8fb41..988af3db8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -13,23 +13,24 @@ # limitations under the License. import copy +import gc import logging import os -import sys +import re import subprocess +import sys from collections import UserDict -import re +from functools import lru_cache + import cpuinfo import psutil import torch +import transformers +from packaging import version from torch.amp import autocast -from functools import lru_cache -from packaging import version -import gc +from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK, SPECIAL_SHARED_CACHE_KEYS -import transformers -from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGML_QUANT_SIZES, GGUF_INNER_CONFIG, QK_K, ModelType SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -108,6 +109,7 @@ def format(self, record): logger.addHandler(fh) import importlib + import transformers @@ -1094,7 +1096,7 @@ def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=Non - str: An error message describing why the model is incompatible, or an empty string if compatible. 
""" if bits != 4: - return False, f"AutoAWQ GEMM kernel only supports 4 bits" + return False, "AutoAWQ GEMM kernel only supports 4 bits" for n, m in model.named_modules(): if isinstance(m, transformers.pytorch_utils.Conv1D): return False, "AutoAWQ GEMM kernel does not support conv1d" @@ -1174,9 +1176,10 @@ def get_layer_features(layer): def _gguf_args_check(args_or_ar, format_str=None, model_type=ModelType.TEXT): - from auto_round.utils import logger import argparse + from auto_round.utils import logger + if format_str is None: args_or_ar.format = args_or_ar.format.replace("q*_", f"q{args_or_ar.bits}_") format_str = args_or_ar.format @@ -1209,6 +1212,7 @@ def _gguf_args_check(args_or_ar, format_str=None, model_type=ModelType.TEXT): if isinstance(args_or_ar.model, str) and os.path.isdir(args_or_ar.model): from pathlib import Path + from auto_round.export.export_to_gguf.convert import ModelBase hparams = ModelBase.load_hparams(Path(args_or_ar.model)) model_architecture = hparams["architectures"][0] @@ -1275,7 +1279,7 @@ def llm_load_model( low_cpu_mem_mode=0, low_cpu_mem_tmp_dir=None, **kwargs): - from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM + from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower())) low_cpu_mem_usage = False @@ -1342,9 +1346,10 @@ def mllm_load_model( model_dtype=None, **kwargs): import json + import transformers - from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM, AutoModel - from huggingface_hub import HfApi, hf_hub_download, HfFileSystem + from huggingface_hub import HfApi, HfFileSystem, hf_hub_download + from transformers import AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer if os.path.isdir(pretrained_model_name_or_path): config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json"))) @@ -1372,7 +1377,7 @@ def mllm_load_model( processor, image_processor = None, None if "deepseek_vl_v2" == model_type: - from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM # pylint: disable=E0401 + from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor # pylint: disable=E0401 processor = DeepseekVLV2Processor.from_pretrained(pretrained_model_name_or_path) tokenizer = processor.tokenizer model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained( @@ -1628,6 +1633,7 @@ def get_layer_config_by_gguf_format(layer_config, gguf_format, model, model_type target_gguf_format = next((fmt for fmt in gguf_format if fmt != "fake"), None) import gguf # pylint: disable=E0401 + from auto_round.export.export_to_gguf.convert import ModelBase, get_model_architecture model_architecture = get_model_architecture(hparams=model.config.to_dict(), model_type=model_type) try: @@ -1912,7 +1918,7 @@ def get_gguf_qtype_by_layer_config(layer_config): return gguf.GGMLQuantizationType.Q6_K if bits == 8 and sym and group_size == 32: return gguf.GGMLQuantizationType.Q8_0 - raise ValueError(f"Unknown layer config") + raise ValueError("Unknown layer config") def flatten_list(nested_list): flattened = [] diff --git a/auto_round/wrapper.py b/auto_round/wrapper.py index a226daeed..12eb573fc 100644 --- a/auto_round/wrapper.py +++ b/auto_round/wrapper.py @@ -13,15 +13,12 @@ # limitations under the License. 
import torch -from torch.functional import F import transformers +from torch.functional import F + from auto_round.data_type import get_quant_func -from .utils import ( - check_to_quantized, - get_scale_shape, - set_module, - logger, SUPPORTED_LAYER_TYPES -) + +from .utils import SUPPORTED_LAYER_TYPES, check_to_quantized, get_scale_shape, logger, set_module def reshape_and_pad_tensor(v, group_size=-1): diff --git a/auto_round_extension/__init__.py b/auto_round_extension/__init__.py index e69de29bb..2b414eb72 100644 --- a/auto_round_extension/__init__.py +++ b/auto_round_extension/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/auto_round_extension/cuda/__init__.py b/auto_round_extension/cuda/__init__.py index e69de29bb..2b414eb72 100644 --- a/auto_round_extension/cuda/__init__.py +++ b/auto_round_extension/cuda/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/auto_round_extension/cuda/gptqmodel_marlin.py b/auto_round_extension/cuda/gptqmodel_marlin.py index 242037d9b..42bc2c75a 100644 --- a/auto_round_extension/cuda/gptqmodel_marlin.py +++ b/auto_round_extension/cuda/gptqmodel_marlin.py @@ -24,13 +24,13 @@ def get_marlin_layer(): ##use an ugly wrapper to import gptqmodel on demand - from gptqmodel.models._const import DEVICE, PLATFORM # pylint: disable=E0401 - from gptqmodel.nn_modules.qlinear import BaseQuantLinear # pylint: disable=E0401 - from gptqmodel.utils.backend import BACKEND # pylint: disable=E0401 + from gptqmodel.models._const import DEVICE, PLATFORM # pylint: disable=E0401 + from gptqmodel.nn_modules.qlinear import BaseQuantLinear # pylint: disable=E0401 + from gptqmodel.utils.backend import BACKEND # pylint: disable=E0401 marlin_import_exception = None try: - import gptqmodel_marlin_kernels # pylint: disable=E0401 + import gptqmodel_marlin_kernels # pylint: disable=E0401 except ImportError as e: marlin_import_exception = e diff --git a/auto_round_extension/hpu/__init__.py b/auto_round_extension/hpu/__init__.py index e69de29bb..2b414eb72 100644 --- a/auto_round_extension/hpu/__init__.py +++ b/auto_round_extension/hpu/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/auto_round_extension/hpu/qlinear_hpu.py b/auto_round_extension/hpu/qlinear_hpu.py index 9a6eb1490..5bcd06927 100644 --- a/auto_round_extension/hpu/qlinear_hpu.py +++ b/auto_round_extension/hpu/qlinear_hpu.py @@ -40,6 +40,7 @@ import torch import torch.nn as nn import transformers + try: import habana_frameworks.torch.core as htcore convert_from_uint4 = torch.ops.hpu.convert_from_uint4 diff --git a/auto_round_extension/hpu/qlinear_hpu_gptq.py b/auto_round_extension/hpu/qlinear_hpu_gptq.py index c2f3a884e..0016ff032 100644 --- a/auto_round_extension/hpu/qlinear_hpu_gptq.py +++ b/auto_round_extension/hpu/qlinear_hpu_gptq.py @@ -40,6 +40,7 @@ import torch import torch.nn as nn import transformers + try: import habana_frameworks.torch.core as htcore convert_from_uint4 = torch.ops.hpu.convert_from_uint4 diff --git a/auto_round_extension/ipex/__init__.py b/auto_round_extension/ipex/__init__.py index 2d32570d3..2929160d8 100644 --- a/auto_round_extension/ipex/__init__.py +++ b/auto_round_extension/ipex/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear as IpexAWQQuantLinear from auto_round_extension.ipex.qlinear_ipex_gptq import ( QuantLinear as IpexGPTQQuantLinear, diff --git a/auto_round_extension/ipex/qlinear_ipex_awq.py b/auto_round_extension/ipex/qlinear_ipex_awq.py index b03d2104f..83afdb2ff 100644 --- a/auto_round_extension/ipex/qlinear_ipex_awq.py +++ b/auto_round_extension/ipex/qlinear_ipex_awq.py @@ -1,6 +1,21 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import torch import torch.nn as nn + class QuantLinear(nn.Module): QUANT_TYPE = "ipex_awq" def __init__(self, w_bit, group_size, in_features, out_features, bias, dev): diff --git a/auto_round_extension/ipex/qlinear_ipex_gptq.py b/auto_round_extension/ipex/qlinear_ipex_gptq.py index 042f0226f..e308ec5b9 100644 --- a/auto_round_extension/ipex/qlinear_ipex_gptq.py +++ b/auto_round_extension/ipex/qlinear_ipex_gptq.py @@ -1,12 +1,25 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math +from logging import getLogger import numpy as np import torch import torch.nn as nn import transformers -from logging import getLogger - logger = getLogger(__name__) BITS_DTYPE_MAPPING = { @@ -100,9 +113,13 @@ def post_init(self): # if not self.training and IPEX_AVAILABLE: if not self.training: import intel_extension_for_pytorch as ipex - from intel_extension_for_pytorch.nn.modules.weight_only_quantization import WeightOnlyQuantizedLinear, \ - QuantDtype, QuantMethod + from intel_extension_for_pytorch.nn.modules.weight_only_quantization import ( + QuantDtype, + QuantMethod, + WeightOnlyQuantizedLinear, + ) from packaging import version + from auto_round.utils import get_library_version ipex_version = get_library_version("intel_extension_for_pytorch") if version.parse(ipex_version) >= version.parse("2.5"): diff --git a/auto_round_extension/qbits/__init__.py b/auto_round_extension/qbits/__init__.py index b4594b642..04cea3305 100644 --- a/auto_round_extension/qbits/__init__.py +++ b/auto_round_extension/qbits/__init__.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from auto_round_extension.qbits.qlinear_qbits import QuantLinear as QBitsQuantLinear from auto_round_extension.qbits.qlinear_qbits_gptq import ( QuantLinear as QBitsGPTQQuantLinear, diff --git a/auto_round_extension/qbits/qbits_awq.py b/auto_round_extension/qbits/qbits_awq.py index 0b3adefb0..07b3012ba 100644 --- a/auto_round_extension/qbits/qbits_awq.py +++ b/auto_round_extension/qbits/qbits_awq.py @@ -1,5 +1,20 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch import torch.nn as nn + AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7] def unpack_awq(qweight: torch.Tensor, qzeros: torch.Tensor, bits: int): shifts = torch.arange(0, 32, bits, device="cpu") diff --git a/auto_round_extension/qbits/qlinear_qbits.py b/auto_round_extension/qbits/qlinear_qbits.py index aee384f32..2dee2d506 100644 --- a/auto_round_extension/qbits/qlinear_qbits.py +++ b/auto_round_extension/qbits/qlinear_qbits.py @@ -18,7 +18,9 @@ import numpy as np import torch import torch.nn as nn + from auto_round.utils import convert_dtype_torch2str, logger + QBITS_AVAILABLE = True BITS_DTYPE_MAPPING = { diff --git a/auto_round_extension/qbits/qlinear_qbits_gptq.py b/auto_round_extension/qbits/qlinear_qbits_gptq.py index 8a1211955..e67bf6ef5 100644 --- a/auto_round_extension/qbits/qlinear_qbits_gptq.py +++ b/auto_round_extension/qbits/qlinear_qbits_gptq.py @@ -18,7 +18,9 @@ import numpy as np import torch import torch.nn as nn + from auto_round.utils import convert_dtype_torch2str, logger + QBITS_AVAILABLE = True BITS_DTYPE_MAPPING = { diff --git a/auto_round_extension/torch/__init__.py b/auto_round_extension/torch/__init__.py index e69de29bb..2b414eb72 100644 --- a/auto_round_extension/torch/__init__.py +++ b/auto_round_extension/torch/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/auto_round_extension/torch/qlinear_torch.py b/auto_round_extension/torch/qlinear_torch.py index 066ba5beb..401ce149e 100644 --- a/auto_round_extension/torch/qlinear_torch.py +++ b/auto_round_extension/torch/qlinear_torch.py @@ -20,7 +20,6 @@ import torch.nn as nn import transformers - logger = getLogger(__name__) diff --git a/auto_round_extension/torch/qlinear_torch_zp.py b/auto_round_extension/torch/qlinear_torch_zp.py index a274f57be..503e40e49 100644 --- a/auto_round_extension/torch/qlinear_torch_zp.py +++ b/auto_round_extension/torch/qlinear_torch_zp.py @@ -21,7 +21,6 @@ import torch.nn as nn import transformers - logger = getLogger(__name__) diff --git a/auto_round_extension/triton/__init__.py b/auto_round_extension/triton/__init__.py index e69de29bb..2b414eb72 100644 --- a/auto_round_extension/triton/__init__.py +++ b/auto_round_extension/triton/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/auto_round_extension/triton/qlinear_tritonv2.py b/auto_round_extension/triton/qlinear_tritonv2.py index b9d8931b9..7320ba03f 100644 --- a/auto_round_extension/triton/qlinear_tritonv2.py +++ b/auto_round_extension/triton/qlinear_tritonv2.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import math from logging import getLogger @@ -6,17 +20,15 @@ import torch.nn as nn import transformers - from auto_round_extension.triton.triton_utils.mixin import TritonModuleMixin - logger = getLogger(__name__) try: from auto_round_extension.triton.triton_utils.dequant import QuantLinearFunction, quant_matmul_248 except ImportError as e: if torch.xpu.is_available(): - logger.error(f"please make sure your triton version is same with `pytorch-triton-xpu` library ") + logger.error("please make sure your triton version is same with `pytorch-triton-xpu` library ") exit(-1) triton_import_exception = e diff --git a/auto_round_extension/triton/qlinear_tritonv2_zp.py b/auto_round_extension/triton/qlinear_tritonv2_zp.py index 530e14d8d..af18f27b0 100644 --- a/auto_round_extension/triton/qlinear_tritonv2_zp.py +++ b/auto_round_extension/triton/qlinear_tritonv2_zp.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import math from logging import getLogger @@ -6,17 +20,15 @@ import torch.nn as nn import transformers - from auto_round_extension.triton.triton_utils_zp.mixin import TritonModuleMixin - logger = getLogger(__name__) try: from auto_round_extension.triton.triton_utils_zp.dequant import QuantLinearFunction, quant_matmul_248 except ImportError as e: if torch.xpu.is_available(): - logger.error(f"please make sure your triton version is same with `pytorch-triton-xpu` library ") + logger.error("please make sure your triton version is same with `pytorch-triton-xpu` library ") exit(-1) triton_import_exception = e diff --git a/auto_round_extension/triton/triton_utils/custom_autotune.py b/auto_round_extension/triton/triton_utils/custom_autotune.py index b511579cc..5b5b5b14d 100644 --- a/auto_round_extension/triton/triton_utils/custom_autotune.py +++ b/auto_round_extension/triton/triton_utils/custom_autotune.py @@ -40,7 +40,6 @@ import triton - # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/auto_round_extension/triton/triton_utils/kernels.py b/auto_round_extension/triton/triton_utils/kernels.py index eebd29cdb..0fb42ac45 100644 --- a/auto_round_extension/triton/triton_utils/kernels.py +++ b/auto_round_extension/triton/triton_utils/kernels.py @@ -43,7 +43,6 @@ from . import custom_autotune - logger = getLogger(__name__) diff --git a/auto_round_extension/triton/triton_utils_zp/custom_autotune.py b/auto_round_extension/triton/triton_utils_zp/custom_autotune.py index b511579cc..5b5b5b14d 100644 --- a/auto_round_extension/triton/triton_utils_zp/custom_autotune.py +++ b/auto_round_extension/triton/triton_utils_zp/custom_autotune.py @@ -40,7 +40,6 @@ import triton - # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/auto_round_extension/triton/triton_utils_zp/kernels.py b/auto_round_extension/triton/triton_utils_zp/kernels.py index f361a3dde..2d4a358aa 100644 --- a/auto_round_extension/triton/triton_utils_zp/kernels.py +++ b/auto_round_extension/triton/triton_utils_zp/kernels.py @@ -42,7 +42,6 @@ from . import custom_autotune - logger = getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 60bdae880..5fe81084a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,97 @@ [tool.codespell] -ignore-words = ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" \ No newline at end of file +ignore-words = ".azure-pipelines/scripts/codeScan/codespell/autoround_dict.txt" + +[tool.isort] +profile = "black" +line_length = 120 +known_first_party = ["auto_round"] +extend_skip_glob = ["**/__init__.py"] + +[tool.ruff] +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 120 +indent-width = 4 + +# Assume Python 3.10 +target-version = "py310" + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. 
+select = ["E4", "E7", "E9", "F", "NPY", "FURB"] +ignore = [ + "E402", # Module level import not at top of file + "E501", # Line too long (121 > 120 characters) + "E721", # Do not compare types, use isinstance() + "E722", # Do not use bare except + "E731", # Do not assign a lambda expression, use a def + "E741", # Do not use variables named ‘l’, ‘O’, or ‘I’ + "F401", # {name} imported but unused + "F403", # from {name} import * used; unable to detect undefined names + "F841", # Local variable is assigned to but never used{name} +] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = false + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" diff --git a/setup.py b/setup.py index a725e32e6..1d21ef8d5 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,10 @@ -import re -from io import open import os -from setuptools import find_packages, setup +import re import sys from functools import lru_cache +from io import open + +from setuptools import find_packages, setup os.environ["CC"] = "g++" os.environ["CXX"] = "g++" @@ -67,7 +68,7 @@ def is_cpu_env(): def fetch_requirements(path): requirements = [] with open(path, "r") as fd: - requirements = [r.strip() for r in fd.readlines()] + requirements = [r.strip() for r in fd] return requirements diff --git a/test/test_cpu/test_auto_round_hpu_only.py b/test/test_cpu/test_auto_round_hpu_only.py index 006e44f63..a6fdb433a 100644 --- a/test/test_cpu/test_auto_round_hpu_only.py +++ b/test/test_cpu/test_auto_round_hpu_only.py @@ -1,14 +1,15 @@ import pytest import torch -from auto_round.utils import is_hpu_supported - from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy +from auto_round.utils import is_hpu_supported + def run_opt_125m_on_hpu(): - from auto_round import AutoRound from transformers import AutoModelForCausalLM, AutoTokenizer + from auto_round import AutoRound + model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -24,7 +25,7 @@ def run_opt_125m_on_hpu(): seqlen=2, ) q_model, qconfig = autoround.quantize() - assert q_model is not None, f"Expected q_model to be not None" + assert q_model is not None, "Expected q_model to be not None" @pytest.mark.skipif(not is_hpu_supported(), reason="HPU is not supported") @@ -42,8 +43,7 @@ def test_opt_125m_compile_mode(): def test_import(): from auto_round import AutoRound - from auto_round.export.export_to_itrex.export import ( - WeightOnlyLinear, save_quantized_as_itrex) + from auto_round.export.export_to_itrex.export 
import WeightOnlyLinear, save_quantized_as_itrex @pytest.mark.parametrize( @@ -51,9 +51,10 @@ def test_import(): ["fp8_to_int_sym"], ) def test_w4a8(data_type): - from auto_round import AutoRound from transformers import AutoModelForCausalLM, AutoTokenizer + from auto_round import AutoRound + model_name = "facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( model_name, @@ -76,4 +77,4 @@ def test_w4a8(data_type): act_dynamic=False, ) q_model, qconfig = autoround.quantize() - assert q_model is not None, f"Expected q_model to be not None" + assert q_model is not None, "Expected q_model to be not None" diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index 6a986706d..536906e8f 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -8,7 +8,7 @@ import transformers from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRoundOPT, AutoRoundAdam +from auto_round import AutoRoundAdam, AutoRoundOPT class LLMDataLoader: diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 25b5afe14..fc577e319 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -7,10 +7,10 @@ sys.path.insert(0, "../..") import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from _test_helpers import model_infer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound -from _test_helpers import model_infer class LLMDataLoader: @@ -423,7 +423,8 @@ def test_fallback_layers(self): def test_not_convert_modules(self): import requests from PIL import Image - from transformers import Qwen2VLForConditionalGeneration, AutoProcessor + from transformers import AutoProcessor, Qwen2VLForConditionalGeneration + from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear model_name = "Qwen/Qwen2-VL-2B-Instruct-AWQ" quantization_config = AutoRoundConfig() diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 1158e9be3..656064907 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -3,14 +3,18 @@ import shutil import sys import unittest + sys.path.insert(0, "../..") +from math import isclose + import torch import transformers -from math import isclose from transformers import AutoModelForCausalLM, AutoTokenizer + from auto_round import AutoRound # pylint: disable=E0401 from auto_round.export.export_to_itrex.export import pack_model # pylint: disable=E0401 + class LLMDataLoader: def __init__(self): self.batch_size = 1 diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 6f1436315..f7555a944 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -2,13 +2,16 @@ import shutil import sys import unittest + sys.path.insert(0, ".") sys.path.insert(0, "../..") import torch import torch.nn as nn -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + from auto_round import AutoRound + class LLMDataLoader: def __init__(self, input_size=10): self.batch_size = 1 @@ -175,8 +178,9 @@ def test_block_name_quant(self): def test_mm_block_name(self): - from auto_round.utils import get_block_names from transformers import Qwen2VLForConditionalGeneration + + from auto_round.utils import get_block_names model_name = "Qwen/Qwen2-VL-2B-Instruct" model = 
Qwen2VLForConditionalGeneration.from_pretrained( model_name, trust_remote_code=True, device_map="auto") diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index 8e87170a2..0009bcedf 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -5,10 +5,12 @@ sys.path.insert(0, "../..") import torch +from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from _test_helpers import model_infer + + class LLMDataLoader: def __init__(self): self.batch_size = 1 diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 8915c548e..bfd54ba31 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -4,7 +4,7 @@ sys.path.insert(0, "../..") import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -206,6 +206,7 @@ def test_autoround_3bit_sym_format(self): def test_static_afp8_export(self): import os + from safetensors import safe_open model_name = "facebook/opt-125m" diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index cec33e991..11c9ca1bb 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -5,7 +5,7 @@ sys.path.insert(0, "../..") import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index ae5cfba5a..cb7691a1b 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -1,7 +1,8 @@ import os +import shutil import sys import unittest -import shutil + sys.path.insert(0, "../..") import torch diff --git a/test/test_cpu/test_hpu.py b/test/test_cpu/test_hpu.py index 629a93212..035060817 100644 --- a/test/test_cpu/test_hpu.py +++ b/test/test_cpu/test_hpu.py @@ -9,7 +9,6 @@ from auto_round import AutoRound - class LLMDataLoader: def __init__(self): self.batch_size = 1 @@ -20,7 +19,7 @@ def __iter__(self): def is_hpu_supported(): try: - import habana_frameworks.torch.core as htcore # pylint: disable=E0401 + import habana_frameworks.torch.core as htcore # pylint: disable=E0401 except ImportError as e: return False return True diff --git a/test/test_cpu/test_llmcompressor_w8a8.py b/test/test_cpu/test_llmcompressor_w8a8.py index 7156b6a07..978a8ed5c 100644 --- a/test/test_cpu/test_llmcompressor_w8a8.py +++ b/test/test_cpu/test_llmcompressor_w8a8.py @@ -1,7 +1,8 @@ import os +import shutil import sys import unittest -import shutil + sys.path.insert(0, "../..") import torch diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 202605491..88e9730bc 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -4,9 +4,7 @@ sys.path.insert(0, "../..") -from transformers import AutoModelForCausalLM, AutoTokenizer - -from transformers import AutoRoundConfig +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer class TestAutoRound(unittest.TestCase): diff --git a/test/test_cpu/test_low_cpu_mem.py b/test/test_cpu/test_low_cpu_mem.py index 2c4378f07..5ad8f5659 100644 --- a/test/test_cpu/test_low_cpu_mem.py +++ b/test/test_cpu/test_low_cpu_mem.py @@ -1,20 +1,21 @@ +import os import shutil import 
sys -import os import unittest + sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer + +from auto_round import AutoRound from auto_round.low_cpu_mem.utils import ( - load_model_with_hooks, - load_empty_model, get_layers_before_block, layer_wise_load, layer_wise_save, - ) - -from auto_round import AutoRound + load_empty_model, + load_model_with_hooks, +) class LLMDataLoader: diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 5d5426b9f..4206847f0 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -3,11 +3,11 @@ sys.path.insert(0, "../..") -from auto_round import AutoRoundMLLM +import shutil from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration -import shutil +from auto_round import AutoRoundMLLM class FakeDataLoader: @@ -73,7 +73,7 @@ def test_quant_vision(self): ## bug need to fix autoround.save_quantized("./saved/", format="auto_round", inplace=True) def test_quant_block_names(self): - from auto_round.utils import get_block_names,find_matching_blocks + from auto_round.utils import find_matching_blocks, get_block_names tokenizer = AutoTokenizer.from_pretrained(self.model_name) processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( @@ -109,6 +109,7 @@ def test_diff_dataset(self): def test_pure_text_model_check(self): from transformers import AutoModelForCausalLM + from auto_round.utils import is_pure_text_model model = Qwen2VLForConditionalGeneration.from_pretrained( self.model_name, trust_remote_code=True, device_map="auto") diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index eff324e43..846af0036 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,8 +1,10 @@ -from unittest.mock import patch import sys +from unittest.mock import patch + sys.path.insert(0, "../..") import auto_round.utils as auto_round_utils + class TestPackingWithNumba: @patch.object(auto_round_utils, "_is_tbb_installed", lambda: False) diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index 1f48e2309..673b5d06d 100644 --- a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,6 +1,8 @@ +import sys + import pytest import torch -import sys + sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 73f73d8f2..519d4e6de 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -1,17 +1,17 @@ import copy +import re import shutil import sys import unittest -import re sys.path.insert(0, "../..") import torch import transformers +from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate -from lm_eval.utils import make_table # pylint: disable=E0401 from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 817b5087d..22c963a48 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -4,15 +4,13 @@ import unittest sys.path.insert(0, "../..") -from auto_round.eval.evaluation import simple_evaluate_user_model -from 
auto_round.testing_utils import require_greater_than_050, require_autogptq, require_awq, require_ipex - import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round import AutoRoundConfig +from auto_round import AutoRound, AutoRoundConfig +from auto_round.eval.evaluation import simple_evaluate_user_model +from auto_round.testing_utils import require_autogptq, require_awq, require_greater_than_050, require_ipex class LLMDataLoader: diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index bf0daba34..b8fa0aa43 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -5,11 +5,13 @@ sys.path.insert(0, "../..") import torch +from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel -from _test_helpers import model_infer + + class LLMDataLoader: def __init__(self): self.batch_size = 1 @@ -35,7 +37,7 @@ def tearDownClass(self): def test_quant(self): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True - from auto_round import AutoRoundConfig + from auto_round import AutoRoundConfig autoround = AutoRound( self.model, self.tokenizer, diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 2a6539306..4279d6e57 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -1,15 +1,16 @@ import shutil import sys import unittest + import pytest + sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round import AutoRoundConfig +from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index fd89e2aa1..f796970d3 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -6,10 +6,17 @@ sys.path.insert(0, "../..") import torch import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoModelForVision2Seq, \ - Gemma3ForConditionalGeneration, Mistral3ForConditionalGeneration -from auto_round.utils import get_block_names, is_pure_text_model +from transformers import ( + AutoModelForCausalLM, + AutoModelForVision2Seq, + AutoTokenizer, + Gemma3ForConditionalGeneration, + Mistral3ForConditionalGeneration, + Qwen2VLForConditionalGeneration, +) + from auto_round import AutoRound +from auto_round.utils import get_block_names, is_pure_text_model class TestAutoRound(unittest.TestCase): @@ -33,7 +40,7 @@ def test_glm4(self): model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["transformer.encoder.layers"], [40]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model), "Expected model to be pure text model" def test_opt_125m(self): model_name = "/models/opt-125m" @@ -41,56 +48,56 @@ def test_opt_125m(self): block_names = get_block_names(model) self.check_block_names(block_names, ["model.decoder.layers"], [12]) - assert 
is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_Qwen(self): model_name = "/models/Qwen2.5-7B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [28]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_phi4(self): model_name = "/models/phi-4" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [40]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_llama3(self): model_name = "/models/Meta-Llama-3.1-8B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_mixtral(self): model_name = "/models/Mixtral-8x7B-Instruct-v0.1" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_falcon(self): model_name = "/models/Falcon3-7B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [28]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_orca(self): model_name = "/models/Orca-2-7b" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_OLMo(self): model_name = "/models/OLMo-2-1124-7B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) block_names = get_block_names(model) self.check_block_names(block_names, ["model.layers"], [32]) - assert is_pure_text_model(model) == True + assert is_pure_text_model(model) def test_Qwen2VL(self): model_name = "/models/Qwen2-VL-2B-Instruct" @@ -100,7 +107,7 @@ def test_Qwen2VL(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["visual.blocks", "model.layers"], [32, 28]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) def test_Llama32(self): model_name = "/models/Llama-3.2-11B-Vision-Instruct" @@ -113,7 +120,7 @@ def test_Llama32(self): ["vision_model.transformer.layers", "vision_model.global_transformer.layers", "language_model.model.layers"], [32, 8, 40]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) def test_SmolVLM(self): model_name = "/models/SmolVLM-Instruct" @@ -123,7 +130,7 @@ def test_SmolVLM(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["model.vision_model.encoder.layers", "model.text_model.layers"], [27, 24]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) def test_glm_4v(self): model_name = "/models/glm-4v-9b" @@ -134,7 +141,7 @@ def 
test_glm_4v(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer.encoder.layers", "transformer.vision.transformer.layers"], [40, 63]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) def test_gemma3(self): model_name = "/models/gemma-3-12b-it" @@ -145,7 +152,7 @@ def test_gemma3(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["vision_tower.vision_model.encoder.layers", "language_model.model.layers"], [27, 48]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) def test_Mistral3(self): model_name = "/models/Mistral-Small-3.1-24B-Instruct-2503" @@ -156,7 +163,7 @@ def test_Mistral3(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["vision_tower.transformer.layers", "language_model.model.layers"], [24, 40]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) def test_Molmo(self): model_name = "/models/Molmo-7B-D-0924" @@ -168,7 +175,7 @@ def test_Molmo(self): self.check_block_names(block_names, ["model.transformer.blocks", "model.vision_backbone.image_vit.transformer.resblocks"], [28, 23]) - assert is_pure_text_model(model) == False + assert not is_pure_text_model(model) if __name__ == "__main__": diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index ece1c8c1a..6faf31eb4 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -11,6 +11,7 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gguf + class LLMDataLoader: def __init__(self): self.batch_size = 1 diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 7f520cb38..d32faf92a 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -1,18 +1,18 @@ import copy +import re import shutil import sys import unittest -import re sys.path.insert(0, "../..") import torch import transformers +from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundAdam from auto_round.eval.evaluation import simple_evaluate -from auto_round.testing_utils import require_gptqmodel, require_optimum, require_awq -from lm_eval.utils import make_table # pylint: disable=E0401 +from auto_round.testing_utils import require_awq, require_gptqmodel, require_optimum def get_accuracy(data): diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index cb3516cad..f398029c2 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -1,14 +1,14 @@ import shutil import sys import unittest + import pytest sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRound -from auto_round import AutoRoundConfig +from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 5f8ab73d7..1710c1660 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -1,16 +1,18 @@ import re +import shutil import sys import unittest -import shutil + sys.path.insert(0, "../..") import torch from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import 
AutoModelForCausalLM, AutoTokenizer + from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate -from auto_round.testing_utils import multi_card, require_greater_than_050, require_gptqmodel +from auto_round.testing_utils import multi_card, require_gptqmodel, require_greater_than_050 def get_accuracy(data): @@ -105,7 +107,6 @@ def test_device_map(self): autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7,seqlen=32) autoround.quantize() - from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" device_map = {} diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 63a494f75..8f3b584bc 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -1,13 +1,14 @@ import os import re -import sys import shutil +import sys import unittest sys.path.insert(0, "../..") from auto_round.testing_utils import multi_card + def get_accuracy(data): match = re.search(r'\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|', data) diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index 8b79d97b5..4af9fb358 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -6,8 +6,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRoundConfig, AutoRound -from auto_round.testing_utils import require_itrex, require_gptqmodel +from auto_round import AutoRound, AutoRoundConfig +from auto_round.testing_utils import require_gptqmodel, require_itrex class TestAutoRound(unittest.TestCase): @@ -78,8 +78,9 @@ def test_mixed_precision(self): layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} layer_config["model.decoder.layers.6.self_attn.out_proj"] = {"bits": 2, "group_size": 32} bits, group_size, sym = 4, 128, True - from auto_round import AutoRound import torch + + from auto_round import AutoRound autoround = AutoRound( model, tokenizer, diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index ddca458aa..8208ae255 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -1,15 +1,16 @@ import os -import sys import shutil +import sys import unittest sys.path.insert(0, '../..') -from auto_round import AutoRoundConfig ## must import for auto-round format -from auto_round.testing_utils import require_gptqmodel, require_vlm_env import requests from PIL import Image +from auto_round import AutoRoundConfig # # must import for auto-round format +from auto_round.testing_utils import require_gptqmodel, require_vlm_env + class TestSupportVLMS(unittest.TestCase): @classmethod @@ -352,7 +353,7 @@ def test_deepseek_vl2(self): self.assertFalse(res > 0 or res == -1, msg="deepseek vl2 tuning fail") quantized_model_path = os.path.join(self.save_dir, "deepseek-vl2-tiny-w4g32") - from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM + from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor from transformers import AutoModelForCausalLM vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(quantized_model_path) tokenizer = vl_chat_processor.tokenizer diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 527f5aec9..e429f14d4 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -26,7 +26,6 @@ ) from transformers.utils import 
is_torch_available - if is_torch_available(): import torch diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index a0d69322f..e31b659dd 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -4,12 +4,11 @@ import unittest sys.path.insert(0, "../..") -from auto_round.eval.evaluation import simple_evaluate_user_model - import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig +from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050 diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index 15ad6ca1c..62444d786 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -1,16 +1,18 @@ -import re -import os -import sys import copy +import os +import re import shutil +import sys import unittest + import requests sys.path.insert(0, "../..") from PIL import Image + from auto_round import AutoRoundConfig -from auto_round.testing_utils import require_gptqmodel, require_vlm_env, require_optimum +from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env class TestAutoRound(unittest.TestCase): @@ -42,8 +44,7 @@ def tearDownClass(self): # res == """ There is a girl who likes adventure, and she is looking for a partner to go on a treasure hunt. She has found a map that leads to a hidden treasure, but she needs a partner to help her decipher the clues and find the treasure. You""") def qwen_inference(self, quantized_model_dir): - import requests - from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer + from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir) processor = AutoProcessor.from_pretrained(quantized_model_dir, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( @@ -92,8 +93,9 @@ def qwen_inference(self, quantized_model_dir): @require_gptqmodel @require_optimum def test_vlm_tune(self): + from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration + from auto_round import AutoRoundMLLM - from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer ## load the model model_name = "/models/Qwen2-VL-2B-Instruct" diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index caad16634..0c547b236 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -8,8 +8,8 @@ import transformers from transformers import AutoModelForCausalLM, AutoTokenizer -from auto_round import AutoRoundConfig -from auto_round import AutoRound +from auto_round import AutoRound, AutoRoundConfig + class LLMDataLoader: def __init__(self): @@ -52,7 +52,6 @@ def test_gptq_format(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path) - from auto_round import AutoRoundConfig quantization_config = AutoRoundConfig( backend="auto" ) @@ -88,7 +87,6 @@ def test_awq_format(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") quantized_model_path = "./saved" - from auto_round import AutoRoundConfig quantization_config = AutoRoundConfig( backend="auto" )
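
Note on the tooling configuration introduced above: pyproject.toml now configures isort with profile "black", line_length 120 and known_first_party = ["auto_round"], and enables ruff (line-length 120, target py310, rule sets E4/E7/E9/F/NPY/FURB). That configuration is what drives the import reordering visible throughout the test diffs. The sketch below is illustrative only; the file name and the exact mix of modules are assumptions for the example, not part of this patch. It shows the three import groups the settings are expected to produce.

# example_imports.py - minimal sketch of the import grouping enforced by the
# isort/ruff settings added in pyproject.toml; not a file touched by this patch.
import os   # 1) standard-library imports first
import sys

import torch  # 2) third-party packages second
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound  # 3) first-party imports last (known_first_party)
from auto_round.eval.evaluation import simple_evaluate_user_model

Running `isort .` followed by `ruff check --fix .` from the repository root should reproduce this grouping, assuming both tools are installed at versions compatible with the settings above.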