From ad144c2265cdee0d23014dbb3d017ea257cb26ed Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Thu, 27 Jun 2024 00:29:42 +0800 Subject: [PATCH] support HQQ/EETQ #4113 --- README.md | 4 +- README_zh.md | 4 +- setup.py | 8 +- src/llamafactory/extras/env.py | 5 +- src/llamafactory/hparams/model_args.py | 7 +- src/llamafactory/model/__init__.py | 2 + src/llamafactory/model/loader.py | 4 +- .../model/model_utils/quantization.py | 75 ++++++++++++------- src/llamafactory/webui/chatter.py | 10 ++- src/llamafactory/webui/common.py | 2 + src/llamafactory/webui/components/export.py | 5 +- src/llamafactory/webui/components/top.py | 13 ++-- src/llamafactory/webui/locales.py | 20 ++++- src/llamafactory/webui/manager.py | 1 + src/llamafactory/webui/runner.py | 18 ++++- src/llamafactory/webui/utils.py | 13 ++++ 16 files changed, 134 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 4b42edd7..443c8cf7 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Choose your path: - **Various models**: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Yi, Gemma, Baichuan, ChatGLM, Phi, etc. - **Integrated methods**: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc. -- **Scalable resources**: 32-bit full-tuning, 16-bit freeze-tuning, 16-bit LoRA and 2/4/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8. +- **Scalable resources**: 16-bit full-tuning, freeze-tuning, LoRA and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ. - **Advanced algorithms**: GaLore, BAdam, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ, PiSSA and Agent tuning. - **Practical tricks**: FlashAttention-2, Unsloth, RoPE scaling, NEFTune and rsLoRA. - **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, etc. @@ -341,7 +341,7 @@ cd LLaMA-Factory pip install -e ".[torch,metrics]" ``` -Extra dependencies available: torch, torch_npu, metrics, deepspeed, bitsandbytes, vllm, galore, badam, gptq, awq, aqlm, qwen, modelscope, quality +Extra dependencies available: torch, torch-npu, metrics, deepspeed, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, galore, badam, qwen, modelscope, quality > [!TIP] > Use `pip install --no-deps -e .` to resolve package conflicts. 
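As a quick orientation before the source changes below: the two user-facing knobs this patch adds are `quantization_bit` and `quantization_method` on `ModelArguments` (see the `hparams/model_args.py` hunk further down). The following is a minimal sketch, not part of the patch; the model id and the 4-bit/HQQ combination are illustrative, and it assumes the package is installed as shown above so that `llamafactory.hparams` is importable.

```python
# Minimal sketch (not from the patch): the two new on-the-fly quantization
# knobs introduced in src/llamafactory/hparams/model_args.py.
from llamafactory.hparams import ModelArguments

model_args = ModelArguments(
    model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct",  # illustrative model id
    quantization_bit=4,            # bit-width for on-the-fly (QLoRA) quantization
    quantization_method="hqq",     # "bitsandbytes" (default), "hqq", or "eetq"
)
```

The same two keys can also be passed as CLI arguments or YAML config entries in the usual way; before this patch only the bitsandbytes path existed.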
diff --git a/README_zh.md b/README_zh.md index 3926c09d..d5172a7d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -48,7 +48,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **多种模型**:LLaMA、LLaVA、Mistral、Mixtral-MoE、Qwen、Yi、Gemma、Baichuan、ChatGLM、Phi 等等。 - **集成方法**:(增量)预训练、(多模态)指令监督微调、奖励模型训练、PPO 训练、DPO 训练、KTO 训练、ORPO 训练等等。 -- **多种精度**:32 比特全参数微调、16 比特冻结微调、16 比特 LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8 的 2/4/8 比特 QLoRA 微调。 +- **多种精度**:16 比特全参数微调、冻结微调、LoRA 微调和基于 AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ 的 2/3/4/5/6/8 比特 QLoRA 微调。 - **先进算法**:GaLore、BAdam、DoRA、LongLoRA、LLaMA Pro、Mixture-of-Depths、LoRA+、LoftQ、PiSSA 和 Agent 微调。 - **实用技巧**:FlashAttention-2、Unsloth、RoPE scaling、NEFTune 和 rsLoRA。 - **实验监控**:LlamaBoard、TensorBoard、Wandb、MLflow 等等。 @@ -341,7 +341,7 @@ cd LLaMA-Factory pip install -e ".[torch,metrics]" ``` -可选的额外依赖项:torch、torch_npu、metrics、deepspeed、bitsandbytes、vllm、galore、badam、gptq、awq、aqlm、qwen、modelscope、quality +可选的额外依赖项:torch、torch-npu、metrics、deepspeed、bitsandbytes、hqq、eetq、gptq、awq、aqlm、vllm、galore、badam、qwen、modelscope、quality > [!TIP] > 遇到包冲突时,可使用 `pip install --no-deps -e .` 解决。 diff --git a/setup.py b/setup.py index 8254b6d4..d43c311c 100644 --- a/setup.py +++ b/setup.py @@ -39,12 +39,14 @@ extra_require = { "metrics": ["nltk", "jieba", "rouge-chinese"], "deepspeed": ["deepspeed>=0.10.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], - "vllm": ["vllm>=0.4.3"], - "galore": ["galore-torch"], - "badam": ["badam>=1.2.1"], + "hqq": ["hqq"], + "eetq": ["eetq"], "gptq": ["optimum>=1.17.0", "auto-gptq>=0.5.0"], "awq": ["autoawq"], "aqlm": ["aqlm[gpu]>=1.1.0"], + "vllm": ["vllm>=0.4.3"], + "galore": ["galore-torch"], + "badam": ["badam>=1.2.1"], "qwen": ["transformers_stream_generator"], "modelscope": ["modelscope"], "dev": ["ruff", "pytest"], diff --git a/src/llamafactory/extras/env.py b/src/llamafactory/extras/env.py index ab387231..14876048 100644 --- a/src/llamafactory/extras/env.py +++ b/src/llamafactory/extras/env.py @@ -1,4 +1,7 @@ -# Copyright 2024 the LlamaFactory team. +# Copyright 2024 HuggingFace Inc. and the LlamaFactory team. +# +# This code is inspired by the HuggingFace's transformers library. +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/commands/env.py # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 3f21145d..087c8c38 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -77,6 +77,10 @@ class ModelArguments: default=True, metadata={"help": "Whether or not to use memory-efficient model loading."}, ) + quantization_method: Literal["bitsandbytes", "hqq", "eetq"] = field( + default="bitsandbytes", + metadata={"help": "Quantization method to use for on-the-fly quantization."}, + ) quantization_bit: Optional[int] = field( default=None, metadata={"help": "The number of bits to quantize the model using bitsandbytes."}, @@ -235,9 +239,6 @@ class ModelArguments: if self.new_special_tokens is not None: # support multiple special tokens self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] - assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." - assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." 
- if self.export_quantization_bit is not None and self.export_quantization_dataset is None: raise ValueError("Quantization dataset is necessary for exporting.") diff --git a/src/llamafactory/model/__init__.py b/src/llamafactory/model/__init__.py index 4abbaa1b..48cfe76c 100644 --- a/src/llamafactory/model/__init__.py +++ b/src/llamafactory/model/__init__.py @@ -14,10 +14,12 @@ from .loader import load_config, load_model, load_tokenizer from .model_utils.misc import find_all_linear_modules +from .model_utils.quantization import QuantizationMethod from .model_utils.valuehead import load_valuehead_params __all__ = [ + "QuantizationMethod", "load_config", "load_model", "load_tokenizer", diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index e1015821..1261d17a 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -186,11 +186,11 @@ def load_model( trainable_params, all_param = count_parameters(model) if is_trainable: - param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + param_stats = "trainable params: {:,} || all params: {:,} || trainable%: {:.4f}".format( trainable_params, all_param, 100 * trainable_params / all_param ) else: - param_stats = "all params: {:d}".format(all_param) + param_stats = "all params: {:,}".format(all_param) logger.info(param_stats) diff --git a/src/llamafactory/model/model_utils/quantization.py b/src/llamafactory/model/model_utils/quantization.py index fab61cb8..3203b4aa 100644 --- a/src/llamafactory/model/model_utils/quantization.py +++ b/src/llamafactory/model/model_utils/quantization.py @@ -23,7 +23,7 @@ from typing import TYPE_CHECKING, Any, Dict, List import torch from datasets import load_dataset -from transformers import BitsAndBytesConfig, GPTQConfig +from transformers import BitsAndBytesConfig, EetqConfig, GPTQConfig, HqqConfig from transformers.integrations import is_deepspeed_zero3_enabled from transformers.modeling_utils import is_fsdp_enabled from transformers.utils.versions import require_version @@ -59,7 +59,7 @@ class QuantizationMethod(str, Enum): def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[Dict[str, Any]]: r""" - Prepares the dataset to perform AutoGPTQ. + Prepares the tokenized dataset to perform AutoGPTQ. Do not use tensor output for JSON serialization. 
""" if os.path.isfile(model_args.export_quantization_dataset): data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) @@ -93,7 +93,7 @@ def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "Mod word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] attention_mask = sample["attention_mask"][:, word_idx : word_idx + maxlen] - samples.append({"input_ids": input_ids, "attention_mask": attention_mask}) + samples.append({"input_ids": input_ids.tolist(), "attention_mask": attention_mask.tolist()}) return samples @@ -105,7 +105,7 @@ def configure_quantization( init_kwargs: Dict[str, Any], ) -> None: r""" - Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training) + Priority: PTQ-quantized (train/infer) > AutoGPTQ (export) > On-the-fly quantization (train/infer) """ if getattr(config, "quantization_config", None): # ptq if is_deepspeed_zero3_enabled(): @@ -131,6 +131,9 @@ def configure_quantization( logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) elif model_args.export_quantization_bit is not None: # auto-gptq + if model_args.export_quantization_bit not in [8, 4, 3, 2]: + raise ValueError("AutoGPTQ only accepts 2/3/4/8-bit quantization.") + require_version("optimum>=1.17.0", "To fix: pip install optimum>=1.17.0") require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") from accelerate.utils import get_max_memory @@ -146,30 +149,48 @@ def configure_quantization( init_kwargs["max_memory"] = get_max_memory() logger.info("Quantizing model to {} bit with AutoGPTQ.".format(model_args.export_quantization_bit)) - elif model_args.quantization_bit is not None: # bnb - if model_args.quantization_bit == 8: - require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + elif model_args.quantization_bit is not None: # on-the-fly + if model_args.quantization_method == QuantizationMethod.BITS_AND_BYTES.value: + if model_args.quantization_bit == 8: + require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + elif model_args.quantization_bit == 4: + require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=model_args.compute_dtype, + bnb_4bit_use_double_quant=model_args.double_quantization, + bnb_4bit_quant_type=model_args.quantization_type, + bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora + ) + else: + raise ValueError("Bitsandbytes only accepts 4-bit or 8-bit quantization.") - elif model_args.quantization_bit == 4: - require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=model_args.compute_dtype, - bnb_4bit_use_double_quant=model_args.double_quantization, - bnb_4bit_quant_type=model_args.quantization_type, - bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora - ) + # Do not assign device map if: + # 1. deepspeed zero3 or fsdp (train) + # 2. 
auto quantization device map (inference) + if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": + if model_args.quantization_bit != 4: + raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") - # Do not assign device map if: - # 1. deepspeed zero3 or fsdp (train) - # 2. auto quantization device map (inference) - if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": - if model_args.quantization_bit != 4: - raise ValueError("Only 4-bit quantized model can use fsdp+qlora or auto device map.") + require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") + else: + init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference - require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") - else: - init_kwargs["device_map"] = {"": get_current_device()} # change auto device map for inference + logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) + elif model_args.quantization_method == QuantizationMethod.HQQ.value: + if model_args.quantization_bit not in [8, 6, 5, 4, 3, 2, 1]: + raise ValueError("HQQ only accepts 1/2/3/4/5/6/8-bit quantization.") - logger.info("Quantizing model to {} bit with bitsandbytes.".format(model_args.quantization_bit)) + require_version("hqq", "To fix: pip install hqq") + init_kwargs["quantization_config"] = HqqConfig( + nbits=model_args.quantization_bit, quant_zero=False, quant_scale=False, axis=0 + ) # use ATEN kernel (axis=0) for performance + logger.info("Quantizing model to {} bit with HQQ.".format(model_args.quantization_bit)) + elif model_args.quantization_method == QuantizationMethod.EETQ.value: + if model_args.quantization_bit != 8: + raise ValueError("EETQ only accepts 8-bit quantization.") + + require_version("eetq", "To fix: pip install eetq") + init_kwargs["quantization_config"] = EetqConfig() + logger.info("Quantizing model to {} bit with EETQ.".format(model_args.quantization_bit)) diff --git a/src/llamafactory/webui/chatter.py b/src/llamafactory/webui/chatter.py index 652c341c..8abef920 100644 --- a/src/llamafactory/webui/chatter.py +++ b/src/llamafactory/webui/chatter.py @@ -23,7 +23,7 @@ from ..data import Role from ..extras.constants import PEFT_METHODS from ..extras.misc import torch_gc from ..extras.packages import is_gradio_available -from .common import get_save_dir +from .common import QUANTIZATION_BITS, get_save_dir from .locales import ALERTS @@ -76,11 +76,17 @@ class WebChatModel(ChatModel): yield error return + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + yield ALERTS["info_loading"][lang] args = dict( model_name_or_path=model_path, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", use_unsloth=(get("top.booster") == "unsloth"), diff --git a/src/llamafactory/webui/common.py b/src/llamafactory/webui/common.py index 980428a4..bced18f0 100644 --- a/src/llamafactory/webui/common.py +++ b/src/llamafactory/webui/common.py @@ -47,6 +47,8 @@ DEFAULT_CONFIG_DIR = "config" DEFAULT_DATA_DIR = "data" DEFAULT_SAVE_DIR = "saves" USER_CONFIG = 
"user_config.yaml" +QUANTIZATION_BITS = ["8", "6", "5", "4", "3", "2", "1"] +GPTQ_BITS = ["8", "4", "3", "2"] def get_save_dir(*paths: str) -> os.PathLike: diff --git a/src/llamafactory/webui/components/export.py b/src/llamafactory/webui/components/export.py index 14257949..0a938f02 100644 --- a/src/llamafactory/webui/components/export.py +++ b/src/llamafactory/webui/components/export.py @@ -18,7 +18,7 @@ from ...extras.constants import PEFT_METHODS from ...extras.misc import torch_gc from ...extras.packages import is_gradio_available from ...train.tuner import export_model -from ..common import get_save_dir +from ..common import GPTQ_BITS, get_save_dir from ..locales import ALERTS @@ -32,9 +32,6 @@ if TYPE_CHECKING: from ..engine import Engine -GPTQ_BITS = ["8", "4", "3", "2"] - - def can_quantize(checkpoint_path: Union[str, List[str]]) -> "gr.Dropdown": if isinstance(checkpoint_path, list) and len(checkpoint_path) != 0: return gr.Dropdown(value="none", interactive=False) diff --git a/src/llamafactory/webui/components/top.py b/src/llamafactory/webui/components/top.py index 18b9a7d2..e331d5e4 100644 --- a/src/llamafactory/webui/components/top.py +++ b/src/llamafactory/webui/components/top.py @@ -18,7 +18,7 @@ from ...data import TEMPLATES from ...extras.constants import METHODS, SUPPORTED_MODELS from ...extras.packages import is_gradio_available from ..common import get_model_info, list_checkpoints, save_config -from ..utils import can_quantize +from ..utils import can_quantize, can_quantize_to if is_gradio_available(): @@ -43,10 +43,11 @@ def create_top() -> Dict[str, "Component"]: with gr.Accordion(open=False) as advanced_tab: with gr.Row(): - quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=2) - template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=2) - rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3) - booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=1) + quantization_method = gr.Dropdown(choices=["bitsandbytes", "hqq", "eetq"], value="bitsandbytes", scale=1) + template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=1) + rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=2) + booster = gr.Radio(choices=["auto", "flashattn2", "unsloth"], value="auto", scale=2) visual_inputs = gr.Checkbox(scale=1) model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False).then( @@ -58,6 +59,7 @@ def create_top() -> Dict[str, "Component"]: list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False ) checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) + quantization_method.change(can_quantize_to, [quantization_method], [quantization_bit], queue=False) return dict( lang=lang, @@ -67,6 +69,7 @@ def create_top() -> Dict[str, "Component"]: checkpoint_path=checkpoint_path, advanced_tab=advanced_tab, quantization_bit=quantization_bit, + quantization_method=quantization_method, template=template, rope_scaling=rope_scaling, booster=booster, diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index cd166584..435876e7 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -85,15 +85,29 @@ LOCALES = { "quantization_bit": { "en": { "label": "Quantization bit", - "info": "Enable 4/8-bit 
model quantization (QLoRA).", + "info": "Enable quantization (QLoRA).", }, "ru": { "label": "Уровень квантования", - "info": "Включить 4/8-битное квантование модели (QLoRA).", + "info": "Включить квантование (QLoRA).", }, "zh": { "label": "量化等级", - "info": "启用 4/8 比特模型量化(QLoRA)。", + "info": "启用量化(QLoRA)。", + }, + }, + "quantization_method": { + "en": { + "label": "Quantization method", + "info": "Quantization algorithm to use.", + }, + "ru": { + "label": "Метод квантования", + "info": "Алгоритм квантования, который следует использовать.", + }, + "zh": { + "label": "量化方法", + "info": "使用的量化算法。", }, }, "template": { diff --git a/src/llamafactory/webui/manager.py b/src/llamafactory/webui/manager.py index 7e9b801a..ebe9f1b9 100644 --- a/src/llamafactory/webui/manager.py +++ b/src/llamafactory/webui/manager.py @@ -71,6 +71,7 @@ class Manager: self._id_to_elem["top.finetuning_type"], self._id_to_elem["top.checkpoint_path"], self._id_to_elem["top.quantization_bit"], + self._id_to_elem["top.quantization_method"], self._id_to_elem["top.template"], self._id_to_elem["top.rope_scaling"], self._id_to_elem["top.booster"], diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 549ec765..f7fbac30 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -22,7 +22,7 @@ from transformers.trainer import TRAINING_ARGS_NAME from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES from ..extras.misc import is_gpu_or_npu_available, torch_gc from ..extras.packages import is_gradio_available -from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir, load_config +from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, QUANTIZATION_BITS, get_save_dir, load_config from .locales import ALERTS, LOCALES from .utils import abort_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd @@ -104,6 +104,11 @@ class Runner: model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") user_config = load_config() + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + args = dict( stage=TRAINING_STAGES[get("train.training_stage")], do_train=True, @@ -111,7 +116,8 @@ class Runner: cache_dir=user_config.get("cache_dir", None), preprocessing_num_workers=16, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", @@ -234,13 +240,19 @@ class Runner: model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") user_config = load_config() + if get("top.quantization_bit") in QUANTIZATION_BITS: + quantization_bit = int(get("top.quantization_bit")) + else: + quantization_bit = None + args = dict( stage="sft", model_name_or_path=get("top.model_path"), cache_dir=user_config.get("cache_dir", None), preprocessing_num_workers=16, finetuning_type=finetuning_type, - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + quantization_bit=quantization_bit, + quantization_method=get("top.quantization_method"), template=get("top.template"), rope_scaling=get("top.rope_scaling") 
if get("top.rope_scaling") in ["linear", "dynamic"] else None, flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py index a616bcba..4f313e4e 100644 --- a/src/llamafactory/webui/utils.py +++ b/src/llamafactory/webui/utils.py @@ -25,6 +25,7 @@ from yaml import safe_dump, safe_load from ..extras.constants import PEFT_METHODS, RUNNING_LOG, TRAINER_LOG, TRAINING_ARGS, TRAINING_STAGES from ..extras.packages import is_gradio_available, is_matplotlib_available from ..extras.ploting import gen_loss_plot +from ..model import QuantizationMethod from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir from .locales import ALERTS @@ -55,6 +56,18 @@ def can_quantize(finetuning_type: str) -> "gr.Dropdown": return gr.Dropdown(interactive=True) +def can_quantize_to(quantization_method: str) -> "gr.Dropdown": + r""" + Returns the available quantization bits. + """ + if quantization_method == QuantizationMethod.BITS_AND_BYTES.value: + return gr.Dropdown(choices=["none", "8", "4"]) + elif quantization_method == QuantizationMethod.HQQ.value: + return gr.Dropdown(choices=["none", "8", "6", "5", "4", "3", "2", "1"]) + elif quantization_method == QuantizationMethod.EETQ.value: + return gr.Dropdown(choices=["none", "8"]) + + def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: r""" Modifys states after changing the training stage.