diff --git a/assets/wechat.jpg b/assets/wechat.jpg
index d3d6c314..96b59791 100644
Binary files a/assets/wechat.jpg and b/assets/wechat.jpg differ
diff --git a/src/llmtuner/tuner/core/utils.py b/src/llmtuner/tuner/core/utils.py
index 3ab02246..9e757718 100644
--- a/src/llmtuner/tuner/core/utils.py
+++ b/src/llmtuner/tuner/core/utils.py
@@ -3,12 +3,16 @@ from types import MethodType
 from typing import TYPE_CHECKING, List, Optional
 
 from llmtuner.extras.constants import LAYERNORM_NAMES
+from llmtuner.extras.logging import get_logger
 
 if TYPE_CHECKING:
     from transformers.modeling_utils import PreTrainedModel
     from llmtuner.hparams import FinetuningArguments
 
 
+logger = get_logger(__name__)
+
+
 def find_all_linear_modules(
     model: "PreTrainedModel",
     quantization_bit: Optional[int] = None,
@@ -49,6 +53,7 @@ def prepare_model_for_training(
         for name, param in model.named_parameters():
             if param.ndim == 1 and any(ln_name in name for ln_name in layernorm_names):
                 param.data = param.data.to(torch.float32)
+        logger.info("Upcasting layernorm weights to float32.")
 
     if finetuning_args.neft_alpha > 1e-6:
         input_embed: torch.nn.Embedding = model.get_input_embeddings()
@@ -62,6 +67,7 @@
             return embeddings
 
         input_embed.forward = MethodType(noisy_forward, input_embed)
+        logger.info("Using noisy embedding with alpha={:.2f}".format(finetuning_args.neft_alpha))
 
     if use_gradient_checkpointing:
         if hasattr(model, "enable_input_require_grads"):
@@ -73,6 +79,7 @@
 
         model.gradient_checkpointing_enable()
         model.config.use_cache = False # turn off when gradient checkpointing is enabled
+        logger.info("Gradient checkpointing enabled.")
 
     if finetuning_args.finetuning_type != "full" and hasattr(model, output_layer_name):
         output_layer: torch.nn.Linear = getattr(model, output_layer_name)
diff --git a/src/llmtuner/webui/components/top.py b/src/llmtuner/webui/components/top.py
index 8fad3cc9..c6299cab 100644
--- a/src/llmtuner/webui/components/top.py
+++ b/src/llmtuner/webui/components/top.py
@@ -31,9 +31,10 @@ def create_top() -> Dict[str, "Component"]:
 
     with gr.Accordion(label="Model config (LLaMA only)", open=False) as llama_tab:
         with gr.Row():
-            flash_attn = gr.Checkbox(value=False)
-            shift_attn = gr.Checkbox(value=False)
-            rope_scaling = gr.Dropdown(choices=["none", "linear", "dynamic"], value="none")
+            with gr.Column():
+                flash_attn = gr.Checkbox(value=False)
+                shift_attn = gr.Checkbox(value=False)
+            rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none")
 
     model_name.change(
         list_checkpoint, [model_name, finetuning_type], [checkpoints], queue=False
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 8e6dc683..20974d18 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -79,26 +79,30 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5)
             save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10)
             warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1)
+            neft_alpha = gr.Slider(value=0, minimum=0, maximum=10, step=0.1)
 
-    input_elems.update({logging_steps, save_steps, warmup_steps})
+            with gr.Column():
+                train_on_prompt = gr.Checkbox(value=False)
+                upcast_layernorm = gr.Checkbox(value=False)
+
+    input_elems.update({logging_steps, save_steps, warmup_steps, neft_alpha, train_on_prompt, upcast_layernorm})
     elem_dict.update(dict(
-        advanced_tab=advanced_tab, logging_steps=logging_steps, save_steps=save_steps, warmup_steps=warmup_steps
+        advanced_tab=advanced_tab, logging_steps=logging_steps, save_steps=save_steps, warmup_steps=warmup_steps,
+        neft_alpha=neft_alpha, train_on_prompt=train_on_prompt, upcast_layernorm=upcast_layernorm
     ))
 
     with gr.Accordion(label="LoRA config", open=False) as lora_tab:
         with gr.Row():
             lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1, scale=1)
             lora_dropout = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1)
-            lora_target = gr.Textbox(scale=2)
+            lora_target = gr.Textbox(scale=1)
+            additional_target = gr.Textbox(scale=1)
             resume_lora_training = gr.Checkbox(value=True, scale=1)
 
-    input_elems.update({lora_rank, lora_dropout, lora_target, resume_lora_training})
+    input_elems.update({lora_rank, lora_dropout, lora_target, additional_target, resume_lora_training})
     elem_dict.update(dict(
-        lora_tab=lora_tab,
-        lora_rank=lora_rank,
-        lora_dropout=lora_dropout,
-        lora_target=lora_target,
-        resume_lora_training=resume_lora_training,
+        lora_tab=lora_tab, lora_rank=lora_rank, lora_dropout=lora_dropout, lora_target=lora_target,
+        additional_target=additional_target, resume_lora_training=resume_lora_training,
     ))
 
     with gr.Accordion(label="RLHF config", open=False) as rlhf_tab:
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 831a3eff..780e616f 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -309,6 +309,36 @@
             "info": "学习率预热采用的步数。"
         }
     },
+    "neft_alpha": {
+        "en": {
+            "label": "NEFTune Alpha",
+            "info": "Magnitude of the noise added to embedding vectors."
+        },
+        "zh": {
+            "label": "NEFTune 噪声参数",
+            "info": "嵌入向量所添加的噪声大小。"
+        }
+    },
+    "train_on_prompt": {
+        "en": {
+            "label": "Train on prompt",
+            "info": "Compute loss on the prompt tokens in supervised fine-tuning."
+        },
+        "zh": {
+            "label": "计算输入损失",
+            "info": "在监督微调时候计算输入序列的损失。"
+        }
+    },
+    "upcast_layernorm": {
+        "en": {
+            "label": "Upcast LayerNorm",
+            "info": "Upcast LayerNorm weights to float32."
+        },
+        "zh": {
+            "label": "缩放归一化层",
+            "info": "将归一化层权重缩放至 32 位浮点数。"
+        }
+    },
     "lora_tab": {
         "en": {
             "label": "LoRA configurations"
@@ -340,11 +370,21 @@
     "lora_target": {
         "en": {
             "label": "LoRA modules (optional)",
-            "info": "The name(s) of target modules to apply LoRA. Use commas to separate multiple modules."
+            "info": "Name(s) of target modules to apply LoRA. Use commas to separate multiple modules."
         },
         "zh": {
-            "label": "LoRA 作用层(非必填)",
-            "info": "应用 LoRA 的线性层名称。使用英文逗号分隔多个名称。"
+            "label": "LoRA 作用模块(非必填)",
+            "info": "应用 LoRA 的目标模块名称。使用英文逗号分隔多个名称。"
         }
     },
+    "additional_target": {
+        "en": {
+            "label": "Additional modules (optional)",
+            "info": "Name(s) of modules apart from LoRA layers to be set as trainable. Use commas to separate multiple modules."
+        },
+        "zh": {
+            "label": "附加模块(非必填)",
+            "info": "除 LoRA 层以外的可训练模块名称。使用英文逗号分隔多个名称。"
+        }
+    },
     "resume_lora_training": {
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index f951fe2c..e31d4fe0 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -89,7 +89,6 @@ class Runner:
             stage=TRAINING_STAGES[get("train.training_stage")],
             model_name_or_path=get("top.model_path"),
             do_train=True,
-            overwrite_cache=False,
             cache_dir=user_config.get("cache_dir", None),
             checkpoint_dir=checkpoint_dir,
             finetuning_type=get("top.finetuning_type"),
@@ -112,9 +111,13 @@ class Runner:
             logging_steps=get("train.logging_steps"),
             save_steps=get("train.save_steps"),
             warmup_steps=get("train.warmup_steps"),
+            neft_alpha=get("train.neft_alpha"),
+            train_on_prompt=get("train.train_on_prompt"),
+            upcast_layernorm=get("train.upcast_layernorm"),
             lora_rank=get("train.lora_rank"),
             lora_dropout=get("train.lora_dropout"),
             lora_target=get("train.lora_target") or get_module(get("top.model_name")),
+            additional_target=get("train.additional_target"),
             resume_lora_training=get("train.resume_lora_training"),
             output_dir=output_dir
         )
@@ -160,7 +163,6 @@ class Runner:
             stage="sft",
             model_name_or_path=get("top.model_path"),
             do_eval=True,
-            overwrite_cache=False,
             predict_with_generate=True,
             cache_dir=user_config.get("cache_dir", None),
             checkpoint_dir=checkpoint_dir,
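A note on the neft_alpha branch patched in src/llmtuner/tuner/core/utils.py above: NEFTune injects uniform noise into the token embeddings during training, and the diff wires it in by replacing the embedding layer's forward with noisy_forward via MethodType and logging the alpha in use. Below is a minimal sketch of the underlying noise rule, following the NEFTune paper's scaling of alpha / sqrt(L * d); the helper name add_neftune_noise and the tensor shapes are illustrative assumptions, not the repository's exact implementation, and the repo's normalization constant may differ.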
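```python
import torch


def add_neftune_noise(embeddings: torch.Tensor, neft_alpha: float) -> torch.Tensor:
    """Add NEFTune-style uniform noise to a batch of token embeddings.

    Noise is drawn from U(-mag, mag) with mag = alpha / sqrt(L * d), where L is
    the sequence length and d the embedding dimension (the NEFTune paper's scaling).
    """
    seq_len, embed_dim = embeddings.size(1), embeddings.size(2)
    mag_norm = neft_alpha / (seq_len * embed_dim) ** 0.5
    # zeros_like(...).uniform_ keeps dtype and device aligned with the embeddings.
    return embeddings + torch.zeros_like(embeddings).uniform_(-mag_norm, mag_norm)


# Illustrative usage: perturb embeddings only during training.
embeddings = torch.randn(2, 16, 32)  # (batch, seq_len, hidden)
noisy = add_neftune_noise(embeddings, neft_alpha=5.0)
```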
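Because the patch swaps noisy_forward onto the embedding instance rather than modifying the model class, the web UI only has to expose a single neft_alpha slider (components/train.py) and pass its value through runner.py alongside the other new options.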