From 92dab8a90bdd82a72a06559943467b56dde12c71 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Tue, 2 Apr 2024 20:07:43 +0800 Subject: [PATCH] simplify readme --- README.md | 394 ++----------------- README_zh.md | 390 ++---------------- examples/{ => extras}/fsdp_qlora/README.md | 0 examples/{ => extras}/fsdp_qlora/fsdp.sh | 5 +- examples/full_multi_gpu/multi_node.sh | 2 +- examples/full_multi_gpu/single_node.sh | 2 +- examples/inference/api_demo.sh | 7 + examples/inference/cli_demo.sh | 7 + examples/inference/evaluate.sh | 12 + examples/inference/web_demo.sh | 7 + examples/lora_multi_gpu/multi_node.sh | 2 +- examples/lora_multi_gpu/single_node.sh | 2 +- examples/lora_single_gpu/prepare.sh | 18 + examples/merge_lora/README.md | 9 + examples/merge_lora/merge.sh | 2 +- src/llmtuner/data/loader.py | 19 +- src/llmtuner/extras/misc.py | 12 + src/llmtuner/extras/patches/llama_patch.py | 2 +- src/llmtuner/extras/patches/mixtral_patch.py | 38 -- src/llmtuner/hparams/data_args.py | 4 +- src/llmtuner/model/patcher.py | 101 ++--- src/llmtuner/model/utils.py | 16 +- src/llmtuner/train/dpo/collator.py | 54 --- src/llmtuner/train/rm/collator.py | 29 -- 24 files changed, 244 insertions(+), 890 deletions(-) rename examples/{ => extras}/fsdp_qlora/README.md (100%) rename examples/{ => extras}/fsdp_qlora/fsdp.sh (87%) create mode 100644 examples/inference/api_demo.sh create mode 100644 examples/inference/cli_demo.sh create mode 100644 examples/inference/evaluate.sh create mode 100644 examples/inference/web_demo.sh create mode 100644 examples/lora_single_gpu/prepare.sh delete mode 100644 src/llmtuner/extras/patches/mixtral_patch.py delete mode 100644 src/llmtuner/train/dpo/collator.py delete mode 100644 src/llmtuner/train/rm/collator.py diff --git a/README.md b/README.md index c90392c8..6450e61e 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Choose your path: ## Benchmark -Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA-Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA-Factory's QLoRA further improves the efficiency regarding the GPU memory. +Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA Factory's QLoRA further improves the efficiency regarding the GPU memory. ![benchmark](assets/benchmark.svg) @@ -62,7 +62,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ - **Training Speed**: the number of training samples processed per second during the training. (bs=4, cutoff_len=1024) - **Rouge Score**: Rouge-2 score on the development set of the [advertising text generation](https://aclanthology.org/D19-1321.pdf) task. (bs=4, cutoff_len=1024) - **GPU Memory**: Peak GPU memory usage in 4-bit quantized training. (bs=1, cutoff_len=1024) -- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA-Factory's LoRA tuning. +- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA Factory's LoRA tuning. 
@@ -72,7 +72,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv! -[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/fsdp_qlora` for usage. +[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
Full Changelog @@ -168,9 +168,6 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ | DPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | ORPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -> [!NOTE] -> Use `--quantization_bit 4` argument to enable QLoRA. - ## Provided Datasets
Pre-training datasets @@ -263,7 +260,7 @@ huggingface-cli login | ------------ | ------- | --------- | | python | 3.8 | 3.10 | | torch | 1.13.1 | 2.2.0 | -| transformers | 4.37.2 | 4.39.2 | +| transformers | 4.37.2 | 4.39.3 | | datasets | 2.14.3 | 2.18.0 | | accelerate | 0.27.2 | 0.28.0 | | peft | 0.9.0 | 0.10.0 | @@ -293,23 +290,28 @@ huggingface-cli login ## Getting Started -### Data Preparation (optional) +### Data Preparation -Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset. +Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use datasets on HuggingFace / ModelScope hub or load the dataset in local disk. > [!NOTE] -> Please update `data/dataset_info.json` to use your custom dataset. About the format of this file, please refer to `data/README.md`. +> Please update `data/dataset_info.json` to use your custom dataset. -### Dependence Installation (optional) +### Dependence Installation ```bash git clone https://github.com/hiyouga/LLaMA-Factory.git conda create -n llama_factory python=3.10 conda activate llama_factory cd LLaMA-Factory -pip install -r requirements.txt +pip install -e .[metrics] ``` +> [!TIP] +> Extra dependencies available: deepspeed, metrics, unsloth, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality + +
For Windows users + If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you will be required to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version. ```bash @@ -318,352 +320,17 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl To enable FlashAttention-2 on the Windows platform, you need to install the precompiled `flash-attn` library, which supports CUDA 12.1 to 12.2. Please download the corresponding version from [flash-attention](https://github.com/bdashore3/flash-attention/releases) based on your requirements. -### Use ModelScope Hub (optional) +
-If you have trouble with downloading models and datasets from Hugging Face, you can use LLaMA-Factory together with ModelScope in the following manner. +### LLaMA Board GUI -```bash -export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows -``` - -Then you can train the corresponding model by specifying a model ID of the ModelScope Hub. (find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models)) - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --model_name_or_path modelscope/Llama-2-7b-ms \ - ... # arguments (same as below) -``` - -LLaMA Board also supports using the models and datasets on the ModelScope Hub. - -```bash -CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py -``` - -### Train on a single GPU - -> [!IMPORTANT] -> If you want to train models on multiple GPUs, please refer to [Distributed Training](#distributed-training). - - -#### LLaMA Board GUI +#### Use local environment ```bash CUDA_VISIBLE_DEVICES=0 python src/train_web.py +# or CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface ``` -#### Pre-Training - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage pt \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset wiki_demo \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_pt_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### Supervised Fine-Tuning - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset alpaca_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_sft_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### Reward Modeling - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage rm \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset comparison_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_rm_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -#### PPO Training - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage ppo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset alpaca_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --reward_model path_to_rm_checkpoint \ - --output_dir path_to_ppo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --top_k 0 \ - --top_p 0.9 \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> Use `--adapter_name_or_path 
path_to_sft_checkpoint,path_to_ppo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled. - -> [!WARNING] -> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 PPO training. - -#### DPO Training - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage dpo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset comparison_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_dpo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled. - -### Distributed Training - -#### Use Huggingface Accelerate - -```bash -accelerate launch --config_file config.yaml src/train_bash.py \ - --ddp_timeout 180000000 \ - ... # arguments (same as above) -``` - -
Example config.yaml for LoRA training - -```yaml -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: MULTI_GPU -downcast_bf16: 'no' -gpu_ids: all -machine_rank: 0 -main_training_function: main -mixed_precision: fp16 -num_machines: 1 -num_processes: 4 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -
- -> [!TIP] -> We commend using Accelerate for LoRA tuning. - -#### Use DeepSpeed - -```bash -deepspeed --num_gpus 8 src/train_bash.py \ - --deepspeed ds_config.json \ - --ddp_timeout 180000000 \ - ... # arguments (same as above) -``` - -
Example ds_config.json for full-parameter training with DeepSpeed ZeRO-2 - -```json -{ - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "zero_allow_untested_optimizer": true, - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "bf16": { - "enabled": "auto" - }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "round_robin_gradients": true - } -} -``` - -
- -> [!TIP] -> Refer to [examples](examples) for more training scripts. - -### Merge LoRA weights and export model - -```bash -CUDA_VISIBLE_DEVICES= python src/export_model.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora \ - --export_dir path_to_export \ - --export_size 2 \ - --export_legacy_format False -``` - -> [!WARNING] -> Merging LoRA weights into a quantized model is not supported. - -> [!TIP] -> Use `--model_name_or_path path_to_export` solely to use the exported model. -> -> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights. - -### Inference with OpenAI-style API - -```bash -CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -> [!TIP] -> Visit `http://localhost:8000/docs` for API documentation. - -### Inference with command line - -```bash -CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### Inference with web browser - -```bash -CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### Evaluation - -```bash -CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template vanilla \ - --finetuning_type lora \ - --task mmlu \ - --split test \ - --lang en \ - --n_shot 5 \ - --batch_size 4 -``` - -### Predict - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_predict \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --dataset alpaca_gpt4_en \ - --template default \ - --finetuning_type lora \ - --output_dir path_to_predict_result \ - --per_device_eval_batch_size 1 \ - --max_samples 100 \ - --predict_with_generate \ - --fp16 -``` - -> [!WARNING] -> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 predict. - -> [!TIP] -> We recommend using `--per_device_eval_batch_size=1` and `--max_target_length 128` at 4/8-bit predict. - -### Dockerize Training - #### Use Docker ```bash @@ -692,6 +359,27 @@ docker compose -f ./docker-compose.yml up -d > * data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI. > * output: Set export dir to this location so that the merged result can be accessed directly on the host machine. +> [!WARNING] +> LLaMA Board GUI does not yet support multi-GPUs training. + +### Command Line Interface + +See [examples](examples) for usage. + +> [!TIP] +> Use `python src/train_bash.py -h` to display arguments description. + +### Use ModelScope Hub + +If you have trouble with downloading models and datasets from Hugging Face, you can use ModelScope. + +```bash +export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows +``` + +> [!TIP] +> Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `modelscope/Llama-2-7b-ms`. 
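A minimal sketch of that setup for LoRA SFT, reusing flags from the example scripts (the dataset, output directory and hyperparameters are illustrative):

```bash
export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path modelscope/Llama-2-7b-ms \
    --dataset alpaca_gpt4_en \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir saves/LLaMA2-7B/lora/sft \
    --per_device_train_batch_size 2 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --fp16
```

LLaMA Board honors the same variable, e.g. `CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py`.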
+ ## Projects using LLaMA Factory 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223) @@ -738,7 +426,7 @@ If this work is helpful, please kindly cite as: ```bibtex @article{zheng2024llamafactory, - title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma}, journal={arXiv preprint arXiv:2403.13372}, year={2024}, @@ -748,7 +436,7 @@ If this work is helpful, please kindly cite as: ## Acknowledgement -This repo benefits from [PEFT](https://github.com/huggingface/peft), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful works. +This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https://github.com/huggingface/trl), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful works. ## Star History diff --git a/README_zh.md b/README_zh.md index d591e852..8b19f17f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -53,7 +53,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 性能指标 -与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比,LLaMA-Factory 的 LoRA 微调提供了 **3.7 倍**的加速比,同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术,LLaMA-Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。 +与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比,LLaMA Factory 的 LoRA 微调提供了 **3.7 倍**的加速比,同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术,LLaMA Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。 ![benchmark](assets/benchmark.svg) @@ -62,7 +62,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **Training Speed**: 训练阶段每秒处理的样本数量。(批处理大小=4,截断长度=1024) - **Rouge Score**: [广告文案生成](https://aclanthology.org/D19-1321.pdf)任务验证集上的 Rouge-2 分数。(批处理大小=4,截断长度=1024) - **GPU Memory**: 4 比特量化训练的 GPU 显存峰值。(批处理大小=1,截断长度=1024) -- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`,在 LLaMA-Factory 的 LoRA 微调中采用 `lora_rank=32`。 +- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`,在 LLaMA Factory 的 LoRA 微调中采用 `lora_rank=32`。
@@ -72,7 +72,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看! -[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/fsdp_qlora`。 +[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
展开日志 @@ -168,9 +168,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | DPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | ORPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -> [!NOTE] -> 请使用 `--quantization_bit 4` 参数来启用 QLoRA 训练。 - ## 数据集
预训练数据集 @@ -263,7 +260,7 @@ huggingface-cli login | ------------ | ------- | --------- | | python | 3.8 | 3.10 | | torch | 1.13.1 | 2.2.0 | -| transformers | 4.37.2 | 4.39.2 | +| transformers | 4.37.2 | 4.39.3 | | datasets | 2.14.3 | 2.18.0 | | accelerate | 0.27.2 | 0.28.0 | | peft | 0.9.0 | 0.10.0 | @@ -293,23 +290,28 @@ huggingface-cli login ## 如何使用 -### 数据准备(可跳过) +### 数据准备 -关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。构建自定义数据集时,既可以使用单个 `.json` 文件,也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。 +关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。 > [!NOTE] -> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件,该文件的格式请参考 `data/README_zh.md`。 +> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件。 -### 环境搭建(可跳过) +### 安装依赖 ```bash git clone https://github.com/hiyouga/LLaMA-Factory.git conda create -n llama_factory python=3.10 conda activate llama_factory cd LLaMA-Factory -pip install -r requirements.txt +pip install -e .[metrics] ``` +> [!TIP] +> 可选的额外依赖项:deepspeed、metrics、unsloth、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality + +
Windows 用户指南 + 如果要在 Windows 平台上开启量化 LoRA(QLoRA),需要安装预编译的 `bitsandbytes` 库, 支持 CUDA 11.1 到 12.2, 请根据您的 CUDA 版本情况选择适合的[发布版本](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。 ```bash @@ -318,350 +320,17 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl 如果要在 Windows 平台上开启 FlashAttention-2,需要安装预编译的 `flash-attn` 库,支持 CUDA 12.1 到 12.2,请根据需求到 [flash-attention](https://github.com/bdashore3/flash-attention/releases) 下载对应版本安装。 -### 使用魔搭社区(可跳过) +
-如果您在 Hugging Face 模型和数据集的下载中遇到了问题,可以通过下述方法使用魔搭社区。 +### LLaMA Board 可视化界面 -```bash -export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` -``` - -接着即可通过指定模型名称来训练对应的模型。(在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型) - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --model_name_or_path modelscope/Llama-2-7b-ms \ - ... # 参数同下 -``` - -LLaMA Board 同样支持魔搭社区的模型和数据集下载。 - -```bash -CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py -``` - -### 单 GPU 训练 - -> [!IMPORTANT] -> 如果您使用多张 GPU 训练模型,请移步[多 GPU 分布式训练](#多-gpu-分布式训练)部分。 - -#### LLaMA Board GUI +#### 使用本地环境 ```bash CUDA_VISIBLE_DEVICES=0 python src/train_web.py +# 或 CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface ``` -#### 预训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage pt \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset wiki_demo \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_pt_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### 指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset alpaca_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_sft_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### 奖励模型训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage rm \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset comparison_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_rm_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -#### PPO 训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage ppo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset alpaca_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --reward_model path_to_rm_checkpoint \ - --output_dir path_to_ppo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --top_k 0 \ - --top_p 0.9 \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> 如果开启了 `--create_new_adapter`,则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` 来进行微调模型的推理。 - -> [!WARNING] -> 如果使用 fp16 精度进行 LLaMA-2 模型的 PPO 训练,请使用 `--per_device_train_batch_size=1`。 - -#### DPO 训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage dpo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - 
--dataset comparison_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_dpo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> 如果开启了 `--create_new_adapter`,则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` 来进行微调模型的推理。 - -### 多 GPU 分布式训练 - -#### 使用 Huggingface Accelerate - -```bash -accelerate launch --config_file config.yaml src/train_bash.py \ - --ddp_timeout 180000000 \ - ... # 参数同上 -``` - -
使用 Accelerate 进行 LoRA 训练的 config.yaml 示例 - -```yaml -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: MULTI_GPU -downcast_bf16: 'no' -gpu_ids: all -machine_rank: 0 -main_training_function: main -mixed_precision: fp16 -num_machines: 1 -num_processes: 4 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -
- -> [!TIP] -> 我们推荐使用 Accelerate 进行 LoRA 训练。 - -#### 使用 DeepSpeed - -```bash -deepspeed --num_gpus 8 src/train_bash.py \ - --deepspeed ds_config.json \ - --ddp_timeout 180000000 \ - ... # 参数同上 -``` - -
使用 DeepSpeed ZeRO-2 进行全参数训练的 ds_config.json 示例 - -```json -{ - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "zero_allow_untested_optimizer": true, - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "bf16": { - "enabled": "auto" - }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "round_robin_gradients": true - } -} -``` - -
- -> [!TIP] -> 更多训练脚本请查看 [examples](examples)。 - -### 合并 LoRA 权重并导出模型 - -```bash -CUDA_VISIBLE_DEVICES= python src/export_model.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora \ - --export_dir path_to_export \ - --export_size 2 \ - --export_legacy_format False -``` - -> [!WARNING] -> 尚不支持量化模型的 LoRA 权重合并及导出。 - -> [!TIP] -> 仅使用 `--model_name_or_path path_to_export` 来加载导出后的模型。 -> -> 合并 LoRA 权重之后可再次使用 `CUDA_VISIBLE_DEVICES=0`、`--export_quantization_bit 4` 和 `--export_quantization_dataset data/c4_demo.json` 基于 AutoGPTQ 量化模型。 - -### 使用 OpenAI 风格 API 推理 - -```bash -CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -> [!TIP] -> 关于 API 文档请见 `http://localhost:8000/docs`。 - -### 使用命令行推理 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### 使用浏览器推理 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### 模型评估 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template vanilla \ - --finetuning_type lora \ - --task ceval \ - --split validation \ - --lang zh \ - --n_shot 5 \ - --batch_size 4 -``` - -### 模型预测 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_predict \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --dataset alpaca_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --output_dir path_to_predict_result \ - --per_device_eval_batch_size 1 \ - --max_samples 100 \ - --predict_with_generate \ - --fp16 -``` - -> [!WARNING] -> 如果使用 fp16 精度进行 LLaMA-2 模型的预测,请使用 `--per_device_eval_batch_size=1`。 - -> [!TIP] -> 我们建议在量化模型的预测中使用 `--per_device_eval_batch_size=1` 和 `--max_target_length 128`。 - -### 使用容器 #### 使用 Docker @@ -691,6 +360,27 @@ docker compose -f ./docker-compose.yml up -d > * data:宿主机中存放数据集的文件夹路径。 > * output:将导出目录设置为该路径后,即可在宿主机中访问导出后的模型。 +> [!WARNING] +> LLaMA Board 可视化界面尚不支持多 GPU 训练。 + +### 命令行接口 + +使用方法请参考 [examples](examples) 文件夹。 + +> [!TIP] +> 使用 `python src/train_bash.py -h` 查看参数文档。 + +### 使用魔搭社区 + +如果您在 Hugging Face 模型和数据集的下载中遇到了问题,可以通过下述方法使用魔搭社区。 + +```bash +export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` +``` + +> [!TIP] +> 将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型,例如 `modelscope/Llama-2-7b-ms`。 + ## 使用了 LLaMA Factory 的项目 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. 
[[arxiv]](https://arxiv.org/abs/2308.02223) @@ -747,7 +437,7 @@ docker compose -f ./docker-compose.yml up -d ## 致谢 -本项目受益于 [PEFT](https://github.com/huggingface/peft)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat),感谢以上诸位作者的付出。 +本项目受益于 [PEFT](https://github.com/huggingface/peft)、[TRL](https://github.com/huggingface/trl)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat),感谢以上诸位作者的付出。 ## Star History diff --git a/examples/fsdp_qlora/README.md b/examples/extras/fsdp_qlora/README.md similarity index 100% rename from examples/fsdp_qlora/README.md rename to examples/extras/fsdp_qlora/README.md diff --git a/examples/fsdp_qlora/fsdp.sh b/examples/extras/fsdp_qlora/fsdp.sh similarity index 87% rename from examples/fsdp_qlora/fsdp.sh rename to examples/extras/fsdp_qlora/fsdp.sh index 2f7772a4..0fce3ecc 100644 --- a/examples/fsdp_qlora/fsdp.sh +++ b/examples/extras/fsdp_qlora/fsdp.sh @@ -15,11 +15,13 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ --overwrite_cache \ --overwrite_output_dir \ --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 4 \ --lr_scheduler_type cosine \ --logging_steps 10 \ + --warmup_steps 20 \ --save_steps 100 \ --eval_steps 100 \ --evaluation_strategy steps \ @@ -28,6 +30,7 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ + --ddp_timeout 180000000 \ --quantization_bit 4 \ --plot_loss \ --fp16 diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh index b463cc10..d1382bc2 100644 --- a/examples/full_multi_gpu/multi_node.sh +++ b/examples/full_multi_gpu/multi_node.sh @@ -33,6 +33,6 @@ python -m torch.distributed.run \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh index a54b81b3..ea4acf90 100644 --- a/examples/full_multi_gpu/single_node.sh +++ b/examples/full_multi_gpu/single_node.sh @@ -27,6 +27,6 @@ deepspeed --num_gpus 4 ../../src/train_bash.py \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh new file mode 100644 index 00000000..4a601bb6 --- /dev/null +++ b/examples/inference/api_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template default \ + --finetuning_type lora diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh new file mode 100644 index 00000000..fdeb01e6 --- /dev/null +++ b/examples/inference/cli_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template default \ + --finetuning_type lora diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh new file mode 100644 index 00000000..b3053662 --- /dev/null +++ b/examples/inference/evaluate.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \ + --model_name_or_path 
meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template vanilla \ + --finetuning_type lora \ + --task mmlu \ + --split test \ + --lang en \ + --n_shot 5 \ + --batch_size 4 diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh new file mode 100644 index 00000000..0f8307fb --- /dev/null +++ b/examples/inference/web_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template default \ + --finetuning_type lora diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh index 562364a7..5172b9a6 100644 --- a/examples/lora_multi_gpu/multi_node.sh +++ b/examples/lora_multi_gpu/multi_node.sh @@ -30,6 +30,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh index ddb9459d..269d76d7 100644 --- a/examples/lora_multi_gpu/single_node.sh +++ b/examples/lora_multi_gpu/single_node.sh @@ -30,6 +30,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh new file mode 100644 index 00000000..3652cea4 --- /dev/null +++ b/examples/lora_single_gpu/prepare.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES= python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ + --max_samples 3000 \ + --tokenized_path ../../saves/datasets/sft diff --git a/examples/merge_lora/README.md b/examples/merge_lora/README.md index c6c16071..a095f288 100644 --- a/examples/merge_lora/README.md +++ b/examples/merge_lora/README.md @@ -1,3 +1,12 @@ +> [!WARNING] +> Merging LoRA weights into a quantized model is not supported. + +> [!TIP] +> Use `--model_name_or_path path_to_model` solely to use the exported model or model fine-tuned in full/freeze mode. +> +> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights. 
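A minimal sketch of that post-merge quantization step, run from the repository root after `merge.sh` has written the merged weights (the merged model path and export directory are illustrative):

```bash
CUDA_VISIBLE_DEVICES=0 python src/export_model.py \
    --model_name_or_path path_to_merged_model \
    --template default \
    --export_dir path_to_gptq_model \
    --export_quantization_bit 4 \
    --export_quantization_dataset data/c4_demo.json \
    --export_size 2 \
    --export_legacy_format False
```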
+ + Usage: - `merge.sh`: merge the lora weights diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh index 42b9fcdd..bd2babb8 100644 --- a/examples/merge_lora/merge.sh +++ b/examples/merge_lora/merge.sh @@ -1,6 +1,6 @@ #!/bin/bash -CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \ +CUDA_VISIBLE_DEVICES= python ../../src/export_model.py \ --model_name_or_path meta-llama/Llama-2-7b-hf \ --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ --template default \ diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 0ab734e0..c22f9a77 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -6,6 +6,7 @@ from datasets import load_dataset, load_from_disk from ..extras.constants import FILEEXT2TYPE from ..extras.logging import get_logger +from ..extras.misc import is_path_available from .aligner import align_dataset from .parser import get_dataset_list from .preprocess import get_preprocess_and_print_func @@ -122,11 +123,12 @@ def get_dataset( if data_args.train_on_prompt and template.efficient_eos: raise ValueError("Current template does not support `train_on_prompt`.") - # Load from cache - if data_args.cache_path is not None: - if os.path.exists(data_args.cache_path): + # Load tokenized dataset + if data_args.tokenized_path is not None: + if not is_path_available(data_args.tokenized_path): logger.warning("Loading dataset from disk will ignore other data arguments.") - dataset = load_from_disk(data_args.cache_path) + dataset = load_from_disk(data_args.tokenized_path) + logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) if data_args.streaming: dataset = dataset.to_iterable_dataset() return dataset @@ -158,10 +160,13 @@ def get_dataset( dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) - if data_args.cache_path is not None and not os.path.exists(data_args.cache_path): + if data_args.tokenized_path is not None: if training_args.should_save: - dataset.save_to_disk(data_args.cache_path) - logger.info("Dataset cache saved at {}.".format(data_args.cache_path)) + dataset.save_to_disk(data_args.tokenized_path) + logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path)) + logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path)) + + exit(0) if training_args.should_log: try: diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py index 60cf153b..a696b315 100644 --- a/src/llmtuner/extras/misc.py +++ b/src/llmtuner/extras/misc.py @@ -193,6 +193,18 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype: return torch.float32 +def is_path_available(path: os.PathLike) -> bool: + r""" + Checks if the path is empty or not exist. + """ + if not os.path.exists(path): + return True + elif os.path.isdir(path) and not os.listdir(path): + return True + else: + return False + + def torch_gc() -> None: r""" Collects GPU memory. 
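The new `is_path_available` helper is what lets the updated data loader pick between the two halves of the `--tokenized_path` flow: a missing or empty path triggers tokenization (the run saves the dataset and exits), while a non-empty path is loaded directly and the other data arguments are ignored. A sketch of that flow, reusing arguments from `examples/lora_single_gpu/prepare.sh` (paths relative to the repository root are illustrative):

```bash
# Step 1: tokenize without a GPU; the dataset is saved to --tokenized_path and the run exits.
CUDA_VISIBLE_DEVICES= python src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir saves/LLaMA2-7B/lora/sft \
    --tokenized_path saves/datasets/sft

# Step 2: rerun with a GPU; the now non-empty path is loaded from disk and training proceeds.
CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir saves/LLaMA2-7B/lora/sft \
    --tokenized_path saves/datasets/sft
```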
diff --git a/src/llmtuner/extras/patches/llama_patch.py b/src/llmtuner/extras/patches/llama_patch.py index f0b65d65..6a90c41a 100644 --- a/src/llmtuner/extras/patches/llama_patch.py +++ b/src/llmtuner/extras/patches/llama_patch.py @@ -193,6 +193,6 @@ def llama_flash_attn_forward( def apply_llama_patch() -> None: - require_version("transformers==4.39.2", "To fix: pip install transformers==4.39.2") + require_version("transformers==4.39.3", "To fix: pip install transformers==4.39.3") LlamaAttention.forward = llama_torch_attn_forward LlamaFlashAttention2.forward = llama_flash_attn_forward diff --git a/src/llmtuner/extras/patches/mixtral_patch.py b/src/llmtuner/extras/patches/mixtral_patch.py deleted file mode 100644 index 382492e0..00000000 --- a/src/llmtuner/extras/patches/mixtral_patch.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import torch.nn.functional as F -from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock - - -def mlp_forward(self: "MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor: - current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states) - current_hidden_states = self.w2(current_hidden_states) - return current_hidden_states - - -# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py -def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> torch.Tensor: - batch_size, sequence_length, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (batch * sequence_length, n_experts) - router_logits = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False) - topk_weight /= topk_weight.sum(dim=-1, keepdim=True) - # we cast back to the input dtype - topk_weight = topk_weight.to(hidden_states.dtype) - - hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0) - y = torch.empty_like(hidden_states) - flat_topk_idx = topk_idx.view(-1) - for i in range(self.num_experts): - expert = self.experts[i] - y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i]) - y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1) - final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim) - return final_hidden_states, router_logits - - -def patch_mixtral_replace_moe_impl() -> None: - MixtralBLockSparseTop2MLP.forward = mlp_forward - MixtralSparseMoeBlock.forward = moe_forward diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index 76e6d6da..f5f75c77 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -84,9 +84,9 @@ class DataArguments: "help": "Whether or not to pack the sequences in training. Will automatically enable in pre-training." 
}, ) - cache_path: Optional[str] = field( + tokenized_path: Optional[str] = field( default=None, - metadata={"help": "Path to save or load the pre-processed datasets."}, + metadata={"help": "Path to save or load the tokenized datasets."}, ) def __post_init__(self): diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py index 7132470a..434a3a84 100644 --- a/src/llmtuner/model/patcher.py +++ b/src/llmtuner/model/patcher.py @@ -17,8 +17,7 @@ from ..extras.logging import get_logger from ..extras.misc import get_current_device, infer_optim_dtype from ..extras.packages import is_flash_attn2_available from ..extras.patches.llama_patch import apply_llama_patch -from ..extras.patches.mixtral_patch import patch_mixtral_replace_moe_impl -from .utils import QuantizationMethod +from .utils import QuantizationMethod, add_z3_leaf_module if TYPE_CHECKING: @@ -32,47 +31,6 @@ logger = get_logger(__name__) SUPPORTED_CLASS_FOR_S2ATTN = ["llama"] -def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int): - embedding_dim = embed_weight.size(1) - avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) - noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) - noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) - embed_weight[-num_new_tokens:] = avg_weight + noise_weight - - -def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None: - r""" - Resize token embeddings. - """ - if is_deepspeed_zero3_enabled(): - import deepspeed # type: ignore - - params = [model.get_input_embeddings().weight] - if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings: - params.append(model.get_output_embeddings().weight) - - context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) - else: - context_maybe_zero3 = nullcontext() - - with context_maybe_zero3: - current_embedding_size = model.get_input_embeddings().weight.size(0) - - if len(tokenizer) > current_embedding_size: - if not isinstance(model.get_output_embeddings(), torch.nn.Linear): - logger.warning("Current model does not support resizing token embeddings.") - return - - model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64) - with context_maybe_zero3: - new_embedding_size = model.get_input_embeddings().weight.size(0) - num_new_tokens = new_embedding_size - current_embedding_size - _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens) - _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens) - - logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size)) - - def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: r""" Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133 @@ -180,8 +138,12 @@ def _configure_quantization( quant_method = quantization_config.get("quant_method", "") if quant_method == QuantizationMethod.GPTQ: + require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") quantization_config["use_exllama"] = False # disable exllama + if quant_method == QuantizationMethod.AWQ: + require_version("autoawq", "To fix: pip install autoawq") + if quant_method == QuantizationMethod.AQLM: require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0") require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0") @@ -235,6 +197,47 @@ def 
_configure_quantization( logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) +def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int): + embedding_dim = embed_weight.size(1) + avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) + noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) + noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) + embed_weight[-num_new_tokens:] = avg_weight + noise_weight + + +def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None: + r""" + Resize token embeddings. + """ + if is_deepspeed_zero3_enabled(): + import deepspeed # type: ignore + + params = [model.get_input_embeddings().weight] + if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings: + params.append(model.get_output_embeddings().weight) + + context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) + else: + context_maybe_zero3 = nullcontext() + + with context_maybe_zero3: + current_embedding_size = model.get_input_embeddings().weight.size(0) + + if len(tokenizer) > current_embedding_size: + if not isinstance(model.get_output_embeddings(), torch.nn.Linear): + logger.warning("Current model does not support resizing token embeddings.") + return + + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64) + with context_maybe_zero3: + new_embedding_size = model.get_input_embeddings().weight.size(0) + num_new_tokens = new_embedding_size - current_embedding_size + _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens) + _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens) + + logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size)) + + def _fp32_forward_post_hook( module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor" ) -> "torch.Tensor": @@ -348,15 +351,15 @@ def patch_model( if is_trainable: _prepare_model_for_training(model, model_args) - if getattr(model.config, "model_type", None) == "mixtral" and is_deepspeed_zero3_enabled(): - require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") - from deepspeed.utils import set_z3_leaf_modules # type: ignore + if getattr(model.config, "model_type", None) == "mixtral": from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock - set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) + add_z3_leaf_module(model, MixtralSparseMoeBlock) - if is_trainable: - patch_mixtral_replace_moe_impl() + if getattr(model.config, "model_type", None) == "qwen2moe": + from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock + + add_z3_leaf_module(model, Qwen2MoeSparseMoeBlock) try: model.add_model_tags(["llama-factory"]) diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py index 1b96a9dd..771e6112 100644 --- a/src/llmtuner/model/utils.py +++ b/src/llmtuner/model/utils.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING, Dict, List import torch from transformers import PreTrainedModel +from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils import cached_file +from transformers.utils.versions import require_version from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from ..extras.logging import get_logger @@ -28,11 +30,23 @@ class QuantizationMethod(str, Enum): GPTQ = "gptq" AWQ = "awq" AQLM = "aqlm" + QUANTO = 
"quanto" + + +def add_z3_leaf_module(model: "PreTrainedModel", module: "torch.nn.Module") -> None: + r""" + Sets module as a leaf module to skip partitioning in deepspeed zero3. + """ + if is_deepspeed_zero3_enabled(): + require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") + from deepspeed.utils import set_z3_leaf_modules # type: ignore + + set_z3_leaf_modules(model, [module]) def find_all_linear_modules(model: "PreTrainedModel") -> List[str]: r""" - Finds all available modules to apply lora. + Finds all available modules to apply lora or galore. """ quantization_method = getattr(model, "quantization_method", None) if quantization_method is None: diff --git a/src/llmtuner/train/dpo/collator.py b/src/llmtuner/train/dpo/collator.py deleted file mode 100644 index 7e8ba1c5..00000000 --- a/src/llmtuner/train/dpo/collator.py +++ /dev/null @@ -1,54 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, List, Sequence, Tuple - -import torch -from transformers import DataCollatorForSeq2Seq - - -@dataclass -class DPODataCollatorWithPadding(DataCollatorForSeq2Seq): - r""" - Data collator for pairwise data. - """ - - def _pad_labels(self, batch: torch.Tensor, positions: List[Tuple[int, int]]) -> torch.Tensor: - padded_labels = [] - for feature, (prompt_len, answer_len) in zip(batch, positions): - if self.tokenizer.padding_side == "left": - start, end = feature.size(0) - answer_len, feature.size(0) - else: - start, end = prompt_len, prompt_len + answer_len - padded_tensor = self.label_pad_token_id * torch.ones_like(feature) - padded_tensor[start:end] = feature[start:end] - padded_labels.append(padded_tensor) - return torch.stack(padded_labels, dim=0).contiguous() # in contiguous memory - - def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: - r""" - Pads batched data to the longest sequence in the batch. - - We generate 2 * n examples where the first n examples represent chosen examples and - the last n examples represent rejected examples. - """ - concatenated_features = [] - label_positions = [] - for key in ("chosen_ids", "rejected_ids"): - for feature in features: - prompt_len, answer_len = len(feature["prompt_ids"]), len(feature[key]) - concatenated_features.append( - { - "input_ids": feature["prompt_ids"] + feature[key], - "attention_mask": [1] * (prompt_len + answer_len), - } - ) - label_positions.append((prompt_len, answer_len)) - - batch = self.tokenizer.pad( - concatenated_features, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch["labels"] = self._pad_labels(batch["input_ids"], label_positions) - return batch diff --git a/src/llmtuner/train/rm/collator.py b/src/llmtuner/train/rm/collator.py deleted file mode 100644 index 8d5d4ada..00000000 --- a/src/llmtuner/train/rm/collator.py +++ /dev/null @@ -1,29 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, Sequence - -import torch -from transformers import DataCollatorWithPadding - - -@dataclass -class PairwiseDataCollatorWithPadding(DataCollatorWithPadding): - r""" - Data collator for pairwise data. - """ - - def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: - r""" - Pads batched data to the longest sequence in the batch. - - We generate 2 * n examples where the first n examples represent chosen examples and - the last n examples represent rejected examples. 
- """ - features = [ - { - "input_ids": feature["prompt_ids"] + feature[key], - "attention_mask": [1] * (len(feature["prompt_ids"]) + len(feature[key])), - } - for key in ("chosen_ids", "rejected_ids") - for feature in features - ] - return super().__call__(features)
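Both deleted collators implement the layout their docstrings describe: a batch of n preference pairs is flattened into 2 * n examples, with all chosen responses first and all rejected responses second. This patch removes the files without showing a replacement; the following is only a self-contained sketch of the flattening convention they implemented:

```python
from typing import Any, Dict, List, Sequence


def flatten_pairwise(features: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn n preference pairs into 2 * n examples: all chosen first, then all rejected."""
    flattened = []
    for key in ("chosen_ids", "rejected_ids"):  # chosen block, then rejected block
        for feature in features:
            input_ids = feature["prompt_ids"] + feature[key]
            flattened.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
    return flattened


# Two pairs become four examples: indices 0-1 are chosen, 2-3 are rejected.
pairs = [
    {"prompt_ids": [1, 2], "chosen_ids": [3], "rejected_ids": [4, 5]},
    {"prompt_ids": [6], "chosen_ids": [7, 8], "rejected_ids": [9]},
]
print(flatten_pairwise(pairs))
```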