From 92dab8a90bdd82a72a06559943467b56dde12c71 Mon Sep 17 00:00:00 2001 From: hiyouga Date: Tue, 2 Apr 2024 20:07:43 +0800 Subject: [PATCH] simplify readme --- README.md | 394 ++----------------- README_zh.md | 390 ++---------------- examples/{ => extras}/fsdp_qlora/README.md | 0 examples/{ => extras}/fsdp_qlora/fsdp.sh | 5 +- examples/full_multi_gpu/multi_node.sh | 2 +- examples/full_multi_gpu/single_node.sh | 2 +- examples/inference/api_demo.sh | 7 + examples/inference/cli_demo.sh | 7 + examples/inference/evaluate.sh | 12 + examples/inference/web_demo.sh | 7 + examples/lora_multi_gpu/multi_node.sh | 2 +- examples/lora_multi_gpu/single_node.sh | 2 +- examples/lora_single_gpu/prepare.sh | 18 + examples/merge_lora/README.md | 9 + examples/merge_lora/merge.sh | 2 +- src/llmtuner/data/loader.py | 19 +- src/llmtuner/extras/misc.py | 12 + src/llmtuner/extras/patches/llama_patch.py | 2 +- src/llmtuner/extras/patches/mixtral_patch.py | 38 -- src/llmtuner/hparams/data_args.py | 4 +- src/llmtuner/model/patcher.py | 101 ++--- src/llmtuner/model/utils.py | 16 +- src/llmtuner/train/dpo/collator.py | 54 --- src/llmtuner/train/rm/collator.py | 29 -- 24 files changed, 244 insertions(+), 890 deletions(-) rename examples/{ => extras}/fsdp_qlora/README.md (100%) rename examples/{ => extras}/fsdp_qlora/fsdp.sh (87%) create mode 100644 examples/inference/api_demo.sh create mode 100644 examples/inference/cli_demo.sh create mode 100644 examples/inference/evaluate.sh create mode 100644 examples/inference/web_demo.sh create mode 100644 examples/lora_single_gpu/prepare.sh delete mode 100644 src/llmtuner/extras/patches/mixtral_patch.py delete mode 100644 src/llmtuner/train/dpo/collator.py delete mode 100644 src/llmtuner/train/rm/collator.py diff --git a/README.md b/README.md index c90392c8..6450e61e 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Choose your path: ## Benchmark -Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA-Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA-Factory's QLoRA further improves the efficiency regarding the GPU memory. +Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning), LLaMA Factory's LoRA tuning offers up to **3.7 times faster** training speed with a better Rouge score on the advertising text generation task. By leveraging 4-bit quantization technique, LLaMA Factory's QLoRA further improves the efficiency regarding the GPU memory. ![benchmark](assets/benchmark.svg) @@ -62,7 +62,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ - **Training Speed**: the number of training samples processed per second during the training. (bs=4, cutoff_len=1024) - **Rouge Score**: Rouge-2 score on the development set of the [advertising text generation](https://aclanthology.org/D19-1321.pdf) task. (bs=4, cutoff_len=1024) - **GPU Memory**: Peak GPU memory usage in 4-bit quantized training. (bs=1, cutoff_len=1024) -- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA-Factory's LoRA tuning. +- We adopt `pre_seq_len=128` for ChatGLM's P-Tuning and `lora_rank=32` for LLaMA Factory's LoRA tuning. 
@@ -72,7 +72,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ [24/03/21] Our paper "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" is available at arXiv! -[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/fsdp_qlora` for usage. +[24/03/20] We supported **FSDP+QLoRA** that fine-tunes a 70B model on 2x24GB GPUs. See `examples/extras/fsdp_qlora` for usage.
Full Changelog @@ -168,9 +168,6 @@ You also can add a custom chat template to [template.py](src/llmtuner/data/templ | DPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | ORPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -> [!NOTE] -> Use `--quantization_bit 4` argument to enable QLoRA. - ## Provided Datasets
Pre-training datasets @@ -263,7 +260,7 @@ huggingface-cli login | ------------ | ------- | --------- | | python | 3.8 | 3.10 | | torch | 1.13.1 | 2.2.0 | -| transformers | 4.37.2 | 4.39.2 | +| transformers | 4.37.2 | 4.39.3 | | datasets | 2.14.3 | 2.18.0 | | accelerate | 0.27.2 | 0.28.0 | | peft | 0.9.0 | 0.10.0 | @@ -293,23 +290,28 @@ huggingface-cli login ## Getting Started -### Data Preparation (optional) +### Data Preparation -Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use a single `.json` file or a [dataset loading script](https://huggingface.co/docs/datasets/dataset_script) with multiple files to create a custom dataset. +Please refer to [data/README.md](data/README.md) for checking the details about the format of dataset files. You can either use datasets on HuggingFace / ModelScope hub or load the dataset in local disk. > [!NOTE] -> Please update `data/dataset_info.json` to use your custom dataset. About the format of this file, please refer to `data/README.md`. +> Please update `data/dataset_info.json` to use your custom dataset. -### Dependence Installation (optional) +### Dependence Installation ```bash git clone https://github.com/hiyouga/LLaMA-Factory.git conda create -n llama_factory python=3.10 conda activate llama_factory cd LLaMA-Factory -pip install -r requirements.txt +pip install -e .[metrics] ``` +> [!TIP] +> Extra dependencies available: deepspeed, metrics, unsloth, vllm, bitsandbytes, gptq, awq, aqlm, qwen, quality + +
For Windows users + If you want to enable the quantized LoRA (QLoRA) on the Windows platform, you will be required to install a pre-built version of `bitsandbytes` library, which supports CUDA 11.1 to 12.2, please select the appropriate [release version](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels) based on your CUDA version. ```bash @@ -318,352 +320,17 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl To enable FlashAttention-2 on the Windows platform, you need to install the precompiled `flash-attn` library, which supports CUDA 12.1 to 12.2. Please download the corresponding version from [flash-attention](https://github.com/bdashore3/flash-attention/releases) based on your requirements. -### Use ModelScope Hub (optional) +
-If you have trouble with downloading models and datasets from Hugging Face, you can use LLaMA-Factory together with ModelScope in the following manner. +### LLaMA Board GUI -```bash -export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows -``` - -Then you can train the corresponding model by specifying a model ID of the ModelScope Hub. (find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models)) - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --model_name_or_path modelscope/Llama-2-7b-ms \ - ... # arguments (same as below) -``` - -LLaMA Board also supports using the models and datasets on the ModelScope Hub. - -```bash -CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py -``` - -### Train on a single GPU - -> [!IMPORTANT] -> If you want to train models on multiple GPUs, please refer to [Distributed Training](#distributed-training). - - -#### LLaMA Board GUI +#### Use local environment ```bash CUDA_VISIBLE_DEVICES=0 python src/train_web.py +# or CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface ``` -#### Pre-Training - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage pt \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset wiki_demo \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_pt_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### Supervised Fine-Tuning - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset alpaca_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_sft_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### Reward Modeling - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage rm \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset comparison_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_rm_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -#### PPO Training - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage ppo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset alpaca_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --reward_model path_to_rm_checkpoint \ - --output_dir path_to_ppo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --top_k 0 \ - --top_p 0.9 \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> Use `--adapter_name_or_path 
path_to_sft_checkpoint,path_to_ppo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled. - -> [!WARNING] -> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 PPO training. - -#### DPO Training - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage dpo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset comparison_gpt4_en \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_dpo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> Use `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` to infer the fine-tuned model if `--create_new_adapter` was enabled. - -### Distributed Training - -#### Use Huggingface Accelerate - -```bash -accelerate launch --config_file config.yaml src/train_bash.py \ - --ddp_timeout 180000000 \ - ... # arguments (same as above) -``` - -
Example config.yaml for LoRA training - -```yaml -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: MULTI_GPU -downcast_bf16: 'no' -gpu_ids: all -machine_rank: 0 -main_training_function: main -mixed_precision: fp16 -num_machines: 1 -num_processes: 4 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -
- -> [!TIP] -> We commend using Accelerate for LoRA tuning. - -#### Use DeepSpeed - -```bash -deepspeed --num_gpus 8 src/train_bash.py \ - --deepspeed ds_config.json \ - --ddp_timeout 180000000 \ - ... # arguments (same as above) -``` - -
Example ds_config.json for full-parameter training with DeepSpeed ZeRO-2 - -```json -{ - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "zero_allow_untested_optimizer": true, - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "bf16": { - "enabled": "auto" - }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "round_robin_gradients": true - } -} -``` - -
- -> [!TIP] -> Refer to [examples](examples) for more training scripts. - -### Merge LoRA weights and export model - -```bash -CUDA_VISIBLE_DEVICES= python src/export_model.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora \ - --export_dir path_to_export \ - --export_size 2 \ - --export_legacy_format False -``` - -> [!WARNING] -> Merging LoRA weights into a quantized model is not supported. - -> [!TIP] -> Use `--model_name_or_path path_to_export` solely to use the exported model. -> -> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights. - -### Inference with OpenAI-style API - -```bash -CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -> [!TIP] -> Visit `http://localhost:8000/docs` for API documentation. - -### Inference with command line - -```bash -CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### Inference with web browser - -```bash -CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### Evaluation - -```bash -CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template vanilla \ - --finetuning_type lora \ - --task mmlu \ - --split test \ - --lang en \ - --n_shot 5 \ - --batch_size 4 -``` - -### Predict - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_predict \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --dataset alpaca_gpt4_en \ - --template default \ - --finetuning_type lora \ - --output_dir path_to_predict_result \ - --per_device_eval_batch_size 1 \ - --max_samples 100 \ - --predict_with_generate \ - --fp16 -``` - -> [!WARNING] -> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 predict. - -> [!TIP] -> We recommend using `--per_device_eval_batch_size=1` and `--max_target_length 128` at 4/8-bit predict. - -### Dockerize Training - #### Use Docker ```bash @@ -692,6 +359,27 @@ docker compose -f ./docker-compose.yml up -d > * data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI. > * output: Set export dir to this location so that the merged result can be accessed directly on the host machine. +> [!WARNING] +> LLaMA Board GUI does not yet support multi-GPUs training. + +### Command Line Interface + +See [examples](examples) for usage. + +> [!TIP] +> Use `python src/train_bash.py -h` to display arguments description. + +### Use ModelScope Hub + +If you have trouble with downloading models and datasets from Hugging Face, you can use ModelScope. + +```bash +export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows +``` + +> [!TIP] +> Train the model by specifying a model ID of the ModelScope Hub as the `--model_name_or_path`. You can find a full list of model IDs at [ModelScope Hub](https://modelscope.cn/models), e.g., `modelscope/Llama-2-7b-ms`. 
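A minimal sketch of that setup for LoRA SFT, reusing flags from the example scripts (the dataset, output directory and hyperparameters are illustrative):

```bash
export USE_MODELSCOPE_HUB=1 # `set USE_MODELSCOPE_HUB=1` for Windows
CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path modelscope/Llama-2-7b-ms \
    --dataset alpaca_gpt4_en \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir saves/LLaMA2-7B/lora/sft \
    --per_device_train_batch_size 2 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --fp16
```

LLaMA Board honors the same variable, e.g. `CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py`.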
+ ## Projects using LLaMA Factory 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. [[arxiv]](https://arxiv.org/abs/2308.02223) @@ -738,7 +426,7 @@ If this work is helpful, please kindly cite as: ```bibtex @article{zheng2024llamafactory, - title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Yongqiang Ma}, journal={arXiv preprint arXiv:2403.13372}, year={2024}, @@ -748,7 +436,7 @@ If this work is helpful, please kindly cite as: ## Acknowledgement -This repo benefits from [PEFT](https://github.com/huggingface/peft), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful works. +This repo benefits from [PEFT](https://github.com/huggingface/peft), [TRL](https://github.com/huggingface/trl), [QLoRA](https://github.com/artidoro/qlora) and [FastChat](https://github.com/lm-sys/FastChat). Thanks for their wonderful works. ## Star History diff --git a/README_zh.md b/README_zh.md index d591e852..8b19f17f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -53,7 +53,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 性能指标 -与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比,LLaMA-Factory 的 LoRA 微调提供了 **3.7 倍**的加速比,同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术,LLaMA-Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。 +与 ChatGLM 官方的 [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ptuning) 微调相比,LLaMA Factory 的 LoRA 微调提供了 **3.7 倍**的加速比,同时在广告文案生成任务上取得了更高的 Rouge 分数。结合 4 比特量化技术,LLaMA Factory 的 QLoRA 微调进一步降低了 GPU 显存消耗。 ![benchmark](assets/benchmark.svg) @@ -62,7 +62,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd - **Training Speed**: 训练阶段每秒处理的样本数量。(批处理大小=4,截断长度=1024) - **Rouge Score**: [广告文案生成](https://aclanthology.org/D19-1321.pdf)任务验证集上的 Rouge-2 分数。(批处理大小=4,截断长度=1024) - **GPU Memory**: 4 比特量化训练的 GPU 显存峰值。(批处理大小=1,截断长度=1024) -- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`,在 LLaMA-Factory 的 LoRA 微调中采用 `lora_rank=32`。 +- 我们在 ChatGLM 的 P-Tuning 中采用 `pre_seq_len=128`,在 LLaMA Factory 的 LoRA 微调中采用 `lora_rank=32`。
@@ -72,7 +72,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd [24/03/21] 我们的论文 "[LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models](https://arxiv.org/abs/2403.13372)" 可在 arXiv 上查看! -[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/fsdp_qlora`。 +[24/03/20] 我们支持了能在 2x24GB GPU 上微调 70B 模型的 **FSDP+QLoRA**。详细用法请参照 `examples/extras/fsdp_qlora`。
展开日志 @@ -168,9 +168,6 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | DPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | ORPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | -> [!NOTE] -> 请使用 `--quantization_bit 4` 参数来启用 QLoRA 训练。 - ## 数据集
预训练数据集 @@ -263,7 +260,7 @@ huggingface-cli login | ------------ | ------- | --------- | | python | 3.8 | 3.10 | | torch | 1.13.1 | 2.2.0 | -| transformers | 4.37.2 | 4.39.2 | +| transformers | 4.37.2 | 4.39.3 | | datasets | 2.14.3 | 2.18.0 | | accelerate | 0.27.2 | 0.28.0 | | peft | 0.9.0 | 0.10.0 | @@ -293,23 +290,28 @@ huggingface-cli login ## 如何使用 -### 数据准备(可跳过) +### 数据准备 -关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。构建自定义数据集时,既可以使用单个 `.json` 文件,也可以使用一个[数据加载脚本](https://huggingface.co/docs/datasets/dataset_script)和多个文件。 +关于数据集文件的格式,请参考 [data/README_zh.md](data/README_zh.md) 的内容。你可以使用 HuggingFace / ModelScope 上的数据集或加载本地数据集。 > [!NOTE] -> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件,该文件的格式请参考 `data/README_zh.md`。 +> 使用自定义数据集时,请更新 `data/dataset_info.json` 文件。 -### 环境搭建(可跳过) +### 安装依赖 ```bash git clone https://github.com/hiyouga/LLaMA-Factory.git conda create -n llama_factory python=3.10 conda activate llama_factory cd LLaMA-Factory -pip install -r requirements.txt +pip install -e .[metrics] ``` +> [!TIP] +> 可选的额外依赖项:deepspeed、metrics、unsloth、vllm、bitsandbytes、gptq、awq、aqlm、qwen、quality + +
Windows 用户指南 + 如果要在 Windows 平台上开启量化 LoRA(QLoRA),需要安装预编译的 `bitsandbytes` 库, 支持 CUDA 11.1 到 12.2, 请根据您的 CUDA 版本情况选择适合的[发布版本](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。 ```bash @@ -318,350 +320,17 @@ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/downl 如果要在 Windows 平台上开启 FlashAttention-2,需要安装预编译的 `flash-attn` 库,支持 CUDA 12.1 到 12.2,请根据需求到 [flash-attention](https://github.com/bdashore3/flash-attention/releases) 下载对应版本安装。 -### 使用魔搭社区(可跳过) +
-如果您在 Hugging Face 模型和数据集的下载中遇到了问题,可以通过下述方法使用魔搭社区。 +### LLaMA Board 可视化界面 -```bash -export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` -``` - -接着即可通过指定模型名称来训练对应的模型。(在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型) - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --model_name_or_path modelscope/Llama-2-7b-ms \ - ... # 参数同下 -``` - -LLaMA Board 同样支持魔搭社区的模型和数据集下载。 - -```bash -CUDA_VISIBLE_DEVICES=0 USE_MODELSCOPE_HUB=1 python src/train_web.py -``` - -### 单 GPU 训练 - -> [!IMPORTANT] -> 如果您使用多张 GPU 训练模型,请移步[多 GPU 分布式训练](#多-gpu-分布式训练)部分。 - -#### LLaMA Board GUI +#### 使用本地环境 ```bash CUDA_VISIBLE_DEVICES=0 python src/train_web.py +# 或 CUDA_VISIBLE_DEVICES=0 python -m llmtuner.webui.interface ``` -#### 预训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage pt \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset wiki_demo \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_pt_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### 指令监督微调 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --dataset alpaca_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_sft_checkpoint \ - --overwrite_cache \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 5e-5 \ - --num_train_epochs 3.0 \ - --plot_loss \ - --fp16 -``` - -#### 奖励模型训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage rm \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset comparison_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_rm_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -#### PPO 训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage ppo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - --dataset alpaca_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --reward_model path_to_rm_checkpoint \ - --output_dir path_to_ppo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --top_k 0 \ - --top_p 0.9 \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> 如果开启了 `--create_new_adapter`,则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_ppo_checkpoint` 来进行微调模型的推理。 - -> [!WARNING] -> 如果使用 fp16 精度进行 LLaMA-2 模型的 PPO 训练,请使用 `--per_device_train_batch_size=1`。 - -#### DPO 训练 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage dpo \ - --do_train \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_sft_checkpoint \ - --create_new_adapter \ - 
--dataset comparison_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --lora_target q_proj,v_proj \ - --output_dir path_to_dpo_checkpoint \ - --per_device_train_batch_size 2 \ - --gradient_accumulation_steps 4 \ - --lr_scheduler_type cosine \ - --logging_steps 10 \ - --save_steps 1000 \ - --learning_rate 1e-5 \ - --num_train_epochs 1.0 \ - --plot_loss \ - --fp16 -``` - -> [!TIP] -> 如果开启了 `--create_new_adapter`,则使用 `--adapter_name_or_path path_to_sft_checkpoint,path_to_dpo_checkpoint` 来进行微调模型的推理。 - -### 多 GPU 分布式训练 - -#### 使用 Huggingface Accelerate - -```bash -accelerate launch --config_file config.yaml src/train_bash.py \ - --ddp_timeout 180000000 \ - ... # 参数同上 -``` - -
使用 Accelerate 进行 LoRA 训练的 config.yaml 示例 - -```yaml -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: MULTI_GPU -downcast_bf16: 'no' -gpu_ids: all -machine_rank: 0 -main_training_function: main -mixed_precision: fp16 -num_machines: 1 -num_processes: 4 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -
- -> [!TIP] -> 我们推荐使用 Accelerate 进行 LoRA 训练。 - -#### 使用 DeepSpeed - -```bash -deepspeed --num_gpus 8 src/train_bash.py \ - --deepspeed ds_config.json \ - --ddp_timeout 180000000 \ - ... # 参数同上 -``` - -
使用 DeepSpeed ZeRO-2 进行全参数训练的 ds_config.json 示例 - -```json -{ - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "gradient_clipping": "auto", - "zero_allow_untested_optimizer": true, - "fp16": { - "enabled": "auto", - "loss_scale": 0, - "loss_scale_window": 1000, - "initial_scale_power": 16, - "hysteresis": 2, - "min_loss_scale": 1 - }, - "bf16": { - "enabled": "auto" - }, - "zero_optimization": { - "stage": 2, - "allgather_partitions": true, - "allgather_bucket_size": 5e8, - "overlap_comm": true, - "reduce_scatter": true, - "reduce_bucket_size": 5e8, - "contiguous_gradients": true, - "round_robin_gradients": true - } -} -``` - -
- -> [!TIP] -> 更多训练脚本请查看 [examples](examples)。 - -### 合并 LoRA 权重并导出模型 - -```bash -CUDA_VISIBLE_DEVICES= python src/export_model.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora \ - --export_dir path_to_export \ - --export_size 2 \ - --export_legacy_format False -``` - -> [!WARNING] -> 尚不支持量化模型的 LoRA 权重合并及导出。 - -> [!TIP] -> 仅使用 `--model_name_or_path path_to_export` 来加载导出后的模型。 -> -> 合并 LoRA 权重之后可再次使用 `CUDA_VISIBLE_DEVICES=0`、`--export_quantization_bit 4` 和 `--export_quantization_dataset data/c4_demo.json` 基于 AutoGPTQ 量化模型。 - -### 使用 OpenAI 风格 API 推理 - -```bash -CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -> [!TIP] -> 关于 API 文档请见 `http://localhost:8000/docs`。 - -### 使用命令行推理 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### 使用浏览器推理 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template default \ - --finetuning_type lora -``` - -### 模型评估 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --template vanilla \ - --finetuning_type lora \ - --task ceval \ - --split validation \ - --lang zh \ - --n_shot 5 \ - --batch_size 4 -``` - -### 模型预测 - -```bash -CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \ - --stage sft \ - --do_predict \ - --model_name_or_path path_to_llama_model \ - --adapter_name_or_path path_to_checkpoint \ - --dataset alpaca_gpt4_zh \ - --template default \ - --finetuning_type lora \ - --output_dir path_to_predict_result \ - --per_device_eval_batch_size 1 \ - --max_samples 100 \ - --predict_with_generate \ - --fp16 -``` - -> [!WARNING] -> 如果使用 fp16 精度进行 LLaMA-2 模型的预测,请使用 `--per_device_eval_batch_size=1`。 - -> [!TIP] -> 我们建议在量化模型的预测中使用 `--per_device_eval_batch_size=1` 和 `--max_target_length 128`。 - -### 使用容器 #### 使用 Docker @@ -691,6 +360,27 @@ docker compose -f ./docker-compose.yml up -d > * data:宿主机中存放数据集的文件夹路径。 > * output:将导出目录设置为该路径后,即可在宿主机中访问导出后的模型。 +> [!WARNING] +> LLaMA Board 可视化界面尚不支持多 GPU 训练。 + +### 命令行接口 + +使用方法请参考 [examples](examples) 文件夹。 + +> [!TIP] +> 使用 `python src/train_bash.py -h` 查看参数文档。 + +### 使用魔搭社区 + +如果您在 Hugging Face 模型和数据集的下载中遇到了问题,可以通过下述方法使用魔搭社区。 + +```bash +export USE_MODELSCOPE_HUB=1 # Windows 使用 `set USE_MODELSCOPE_HUB=1` +``` + +> [!TIP] +> 将 `--model_name_or_path` 设置为模型 ID 来加载对应的模型。在[魔搭社区](https://modelscope.cn/models)查看所有可用的模型,例如 `modelscope/Llama-2-7b-ms`。 + ## 使用了 LLaMA Factory 的项目 1. Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. 
[[arxiv]](https://arxiv.org/abs/2308.02223) @@ -747,7 +437,7 @@ docker compose -f ./docker-compose.yml up -d ## 致谢 -本项目受益于 [PEFT](https://github.com/huggingface/peft)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat),感谢以上诸位作者的付出。 +本项目受益于 [PEFT](https://github.com/huggingface/peft)、[TRL](https://github.com/huggingface/trl)、[QLoRA](https://github.com/artidoro/qlora) 和 [FastChat](https://github.com/lm-sys/FastChat),感谢以上诸位作者的付出。 ## Star History diff --git a/examples/fsdp_qlora/README.md b/examples/extras/fsdp_qlora/README.md similarity index 100% rename from examples/fsdp_qlora/README.md rename to examples/extras/fsdp_qlora/README.md diff --git a/examples/fsdp_qlora/fsdp.sh b/examples/extras/fsdp_qlora/fsdp.sh similarity index 87% rename from examples/fsdp_qlora/fsdp.sh rename to examples/extras/fsdp_qlora/fsdp.sh index 2f7772a4..0fce3ecc 100644 --- a/examples/fsdp_qlora/fsdp.sh +++ b/examples/extras/fsdp_qlora/fsdp.sh @@ -15,11 +15,13 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ --overwrite_cache \ --overwrite_output_dir \ --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 8 \ + --gradient_accumulation_steps 4 \ --lr_scheduler_type cosine \ --logging_steps 10 \ + --warmup_steps 20 \ --save_steps 100 \ --eval_steps 100 \ --evaluation_strategy steps \ @@ -28,6 +30,7 @@ CUDA_VISIBLE_DEVICES=0,1 accelerate launch \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ + --ddp_timeout 180000000 \ --quantization_bit 4 \ --plot_loss \ --fp16 diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh index b463cc10..d1382bc2 100644 --- a/examples/full_multi_gpu/multi_node.sh +++ b/examples/full_multi_gpu/multi_node.sh @@ -33,6 +33,6 @@ python -m torch.distributed.run \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh index a54b81b3..ea4acf90 100644 --- a/examples/full_multi_gpu/single_node.sh +++ b/examples/full_multi_gpu/single_node.sh @@ -27,6 +27,6 @@ deepspeed --num_gpus 4 ../../src/train_bash.py \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/inference/api_demo.sh b/examples/inference/api_demo.sh new file mode 100644 index 00000000..4a601bb6 --- /dev/null +++ b/examples/inference/api_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 API_PORT=8000 python src/api_demo.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template default \ + --finetuning_type lora diff --git a/examples/inference/cli_demo.sh b/examples/inference/cli_demo.sh new file mode 100644 index 00000000..fdeb01e6 --- /dev/null +++ b/examples/inference/cli_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/cli_demo.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template default \ + --finetuning_type lora diff --git a/examples/inference/evaluate.sh b/examples/inference/evaluate.sh new file mode 100644 index 00000000..b3053662 --- /dev/null +++ b/examples/inference/evaluate.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/evaluate.py \ + --model_name_or_path 
meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template vanilla \ + --finetuning_type lora \ + --task mmlu \ + --split test \ + --lang en \ + --n_shot 5 \ + --batch_size 4 diff --git a/examples/inference/web_demo.sh b/examples/inference/web_demo.sh new file mode 100644 index 00000000..0f8307fb --- /dev/null +++ b/examples/inference/web_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES=0 python src/web_demo.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ + --template default \ + --finetuning_type lora diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh index 562364a7..5172b9a6 100644 --- a/examples/lora_multi_gpu/multi_node.sh +++ b/examples/lora_multi_gpu/multi_node.sh @@ -30,6 +30,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh index ddb9459d..269d76d7 100644 --- a/examples/lora_multi_gpu/single_node.sh +++ b/examples/lora_multi_gpu/single_node.sh @@ -30,6 +30,6 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \ --num_train_epochs 3.0 \ --max_samples 3000 \ --val_size 0.1 \ - --ddp_timeout 1800000 \ + --ddp_timeout 180000000 \ --plot_loss \ --fp16 diff --git a/examples/lora_single_gpu/prepare.sh b/examples/lora_single_gpu/prepare.sh new file mode 100644 index 00000000..3652cea4 --- /dev/null +++ b/examples/lora_single_gpu/prepare.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +CUDA_VISIBLE_DEVICES= python ../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../data \ + --template default \ + --finetuning_type lora \ + --lora_target q_proj,v_proj \ + --output_dir ../../saves/LLaMA2-7B/lora/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ + --max_samples 3000 \ + --tokenized_path ../../saves/datasets/sft diff --git a/examples/merge_lora/README.md b/examples/merge_lora/README.md index c6c16071..a095f288 100644 --- a/examples/merge_lora/README.md +++ b/examples/merge_lora/README.md @@ -1,3 +1,12 @@ +> [!WARNING] +> Merging LoRA weights into a quantized model is not supported. + +> [!TIP] +> Use `--model_name_or_path path_to_model` solely to use the exported model or model fine-tuned in full/freeze mode. +> +> Use `CUDA_VISIBLE_DEVICES=0`, `--export_quantization_bit 4` and `--export_quantization_dataset data/c4_demo.json` to quantize the model with AutoGPTQ after merging the LoRA weights. 
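A minimal sketch of that post-merge quantization step, run from the repository root after `merge.sh` has written the merged weights (the merged model path and export directory are illustrative):

```bash
CUDA_VISIBLE_DEVICES=0 python src/export_model.py \
    --model_name_or_path path_to_merged_model \
    --template default \
    --export_dir path_to_gptq_model \
    --export_quantization_bit 4 \
    --export_quantization_dataset data/c4_demo.json \
    --export_size 2 \
    --export_legacy_format False
```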
+ + Usage: - `merge.sh`: merge the lora weights diff --git a/examples/merge_lora/merge.sh b/examples/merge_lora/merge.sh index 42b9fcdd..bd2babb8 100644 --- a/examples/merge_lora/merge.sh +++ b/examples/merge_lora/merge.sh @@ -1,6 +1,6 @@ #!/bin/bash -CUDA_VISIBLE_DEVICES=0 python ../../src/export_model.py \ +CUDA_VISIBLE_DEVICES= python ../../src/export_model.py \ --model_name_or_path meta-llama/Llama-2-7b-hf \ --adapter_name_or_path ../../saves/LLaMA2-7B/lora/sft \ --template default \ diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 0ab734e0..c22f9a77 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -6,6 +6,7 @@ from datasets import load_dataset, load_from_disk from ..extras.constants import FILEEXT2TYPE from ..extras.logging import get_logger +from ..extras.misc import is_path_available from .aligner import align_dataset from .parser import get_dataset_list from .preprocess import get_preprocess_and_print_func @@ -122,11 +123,12 @@ def get_dataset( if data_args.train_on_prompt and template.efficient_eos: raise ValueError("Current template does not support `train_on_prompt`.") - # Load from cache - if data_args.cache_path is not None: - if os.path.exists(data_args.cache_path): + # Load tokenized dataset + if data_args.tokenized_path is not None: + if not is_path_available(data_args.tokenized_path): logger.warning("Loading dataset from disk will ignore other data arguments.") - dataset = load_from_disk(data_args.cache_path) + dataset = load_from_disk(data_args.tokenized_path) + logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) if data_args.streaming: dataset = dataset.to_iterable_dataset() return dataset @@ -158,10 +160,13 @@ def get_dataset( dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) - if data_args.cache_path is not None and not os.path.exists(data_args.cache_path): + if data_args.tokenized_path is not None: if training_args.should_save: - dataset.save_to_disk(data_args.cache_path) - logger.info("Dataset cache saved at {}.".format(data_args.cache_path)) + dataset.save_to_disk(data_args.tokenized_path) + logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path)) + logger.info("Please restart the training with `--tokenized_path {}`.".format(data_args.tokenized_path)) + + exit(0) if training_args.should_log: try: diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py index 60cf153b..a696b315 100644 --- a/src/llmtuner/extras/misc.py +++ b/src/llmtuner/extras/misc.py @@ -193,6 +193,18 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype: return torch.float32 +def is_path_available(path: os.PathLike) -> bool: + r""" + Checks if the path is empty or not exist. + """ + if not os.path.exists(path): + return True + elif os.path.isdir(path) and not os.listdir(path): + return True + else: + return False + + def torch_gc() -> None: r""" Collects GPU memory. 
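The new `is_path_available` helper is what lets the updated data loader pick between the two halves of the `--tokenized_path` flow: a missing or empty path triggers tokenization (the run saves the dataset and exits), while a non-empty path is loaded directly and the other data arguments are ignored. A sketch of that flow, reusing arguments from `examples/lora_single_gpu/prepare.sh` (paths relative to the repository root are illustrative):

```bash
# Step 1: tokenize without a GPU; the dataset is saved to --tokenized_path and the run exits.
CUDA_VISIBLE_DEVICES= python src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir saves/LLaMA2-7B/lora/sft \
    --tokenized_path saves/datasets/sft

# Step 2: rerun with a GPU; the now non-empty path is loaded from disk and training proceeds.
CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir saves/LLaMA2-7B/lora/sft \
    --tokenized_path saves/datasets/sft
```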
diff --git a/src/llmtuner/extras/patches/llama_patch.py b/src/llmtuner/extras/patches/llama_patch.py index f0b65d65..6a90c41a 100644 --- a/src/llmtuner/extras/patches/llama_patch.py +++ b/src/llmtuner/extras/patches/llama_patch.py @@ -193,6 +193,6 @@ def llama_flash_attn_forward( def apply_llama_patch() -> None: - require_version("transformers==4.39.2", "To fix: pip install transformers==4.39.2") + require_version("transformers==4.39.3", "To fix: pip install transformers==4.39.3") LlamaAttention.forward = llama_torch_attn_forward LlamaFlashAttention2.forward = llama_flash_attn_forward diff --git a/src/llmtuner/extras/patches/mixtral_patch.py b/src/llmtuner/extras/patches/mixtral_patch.py deleted file mode 100644 index 382492e0..00000000 --- a/src/llmtuner/extras/patches/mixtral_patch.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import torch.nn.functional as F -from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock - - -def mlp_forward(self: "MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor: - current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states) - current_hidden_states = self.w2(current_hidden_states) - return current_hidden_states - - -# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py -def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> torch.Tensor: - batch_size, sequence_length, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (batch * sequence_length, n_experts) - router_logits = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False) - topk_weight /= topk_weight.sum(dim=-1, keepdim=True) - # we cast back to the input dtype - topk_weight = topk_weight.to(hidden_states.dtype) - - hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0) - y = torch.empty_like(hidden_states) - flat_topk_idx = topk_idx.view(-1) - for i in range(self.num_experts): - expert = self.experts[i] - y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i]) - y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1) - final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim) - return final_hidden_states, router_logits - - -def patch_mixtral_replace_moe_impl() -> None: - MixtralBLockSparseTop2MLP.forward = mlp_forward - MixtralSparseMoeBlock.forward = moe_forward diff --git a/src/llmtuner/hparams/data_args.py b/src/llmtuner/hparams/data_args.py index 76e6d6da..f5f75c77 100644 --- a/src/llmtuner/hparams/data_args.py +++ b/src/llmtuner/hparams/data_args.py @@ -84,9 +84,9 @@ class DataArguments: "help": "Whether or not to pack the sequences in training. Will automatically enable in pre-training." 
}, ) - cache_path: Optional[str] = field( + tokenized_path: Optional[str] = field( default=None, - metadata={"help": "Path to save or load the pre-processed datasets."}, + metadata={"help": "Path to save or load the tokenized datasets."}, ) def __post_init__(self): diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py index 7132470a..434a3a84 100644 --- a/src/llmtuner/model/patcher.py +++ b/src/llmtuner/model/patcher.py @@ -17,8 +17,7 @@ from ..extras.logging import get_logger from ..extras.misc import get_current_device, infer_optim_dtype from ..extras.packages import is_flash_attn2_available from ..extras.patches.llama_patch import apply_llama_patch -from ..extras.patches.mixtral_patch import patch_mixtral_replace_moe_impl -from .utils import QuantizationMethod +from .utils import QuantizationMethod, add_z3_leaf_module if TYPE_CHECKING: @@ -32,47 +31,6 @@ logger = get_logger(__name__) SUPPORTED_CLASS_FOR_S2ATTN = ["llama"] -def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int): - embedding_dim = embed_weight.size(1) - avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) - noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) - noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) - embed_weight[-num_new_tokens:] = avg_weight + noise_weight - - -def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None: - r""" - Resize token embeddings. - """ - if is_deepspeed_zero3_enabled(): - import deepspeed # type: ignore - - params = [model.get_input_embeddings().weight] - if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings: - params.append(model.get_output_embeddings().weight) - - context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) - else: - context_maybe_zero3 = nullcontext() - - with context_maybe_zero3: - current_embedding_size = model.get_input_embeddings().weight.size(0) - - if len(tokenizer) > current_embedding_size: - if not isinstance(model.get_output_embeddings(), torch.nn.Linear): - logger.warning("Current model does not support resizing token embeddings.") - return - - model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64) - with context_maybe_zero3: - new_embedding_size = model.get_input_embeddings().weight.size(0) - num_new_tokens = new_embedding_size - current_embedding_size - _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens) - _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens) - - logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size)) - - def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: r""" Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133 @@ -180,8 +138,12 @@ def _configure_quantization( quant_method = quantization_config.get("quant_method", "") if quant_method == QuantizationMethod.GPTQ: + require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") quantization_config["use_exllama"] = False # disable exllama + if quant_method == QuantizationMethod.AWQ: + require_version("autoawq", "To fix: pip install autoawq") + if quant_method == QuantizationMethod.AQLM: require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0") require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0") @@ -235,6 +197,47 @@ def 
_configure_quantization( logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) +def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int): + embedding_dim = embed_weight.size(1) + avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) + noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) + noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) + embed_weight[-num_new_tokens:] = avg_weight + noise_weight + + +def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None: + r""" + Resize token embeddings. + """ + if is_deepspeed_zero3_enabled(): + import deepspeed # type: ignore + + params = [model.get_input_embeddings().weight] + if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings: + params.append(model.get_output_embeddings().weight) + + context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) + else: + context_maybe_zero3 = nullcontext() + + with context_maybe_zero3: + current_embedding_size = model.get_input_embeddings().weight.size(0) + + if len(tokenizer) > current_embedding_size: + if not isinstance(model.get_output_embeddings(), torch.nn.Linear): + logger.warning("Current model does not support resizing token embeddings.") + return + + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64) + with context_maybe_zero3: + new_embedding_size = model.get_input_embeddings().weight.size(0) + num_new_tokens = new_embedding_size - current_embedding_size + _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens) + _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens) + + logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size)) + + def _fp32_forward_post_hook( module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor" ) -> "torch.Tensor": @@ -348,15 +351,15 @@ def patch_model( if is_trainable: _prepare_model_for_training(model, model_args) - if getattr(model.config, "model_type", None) == "mixtral" and is_deepspeed_zero3_enabled(): - require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") - from deepspeed.utils import set_z3_leaf_modules # type: ignore + if getattr(model.config, "model_type", None) == "mixtral": from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock - set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) + add_z3_leaf_module(model, MixtralSparseMoeBlock) - if is_trainable: - patch_mixtral_replace_moe_impl() + if getattr(model.config, "model_type", None) == "qwen2moe": + from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock + + add_z3_leaf_module(model, Qwen2MoeSparseMoeBlock) try: model.add_model_tags(["llama-factory"]) diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py index 1b96a9dd..771e6112 100644 --- a/src/llmtuner/model/utils.py +++ b/src/llmtuner/model/utils.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING, Dict, List import torch from transformers import PreTrainedModel +from transformers.integrations import is_deepspeed_zero3_enabled from transformers.utils import cached_file +from transformers.utils.versions import require_version from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME from ..extras.logging import get_logger @@ -28,11 +30,23 @@ class QuantizationMethod(str, Enum): GPTQ = "gptq" AWQ = "awq" AQLM = "aqlm" + QUANTO = 
"quanto" + + +def add_z3_leaf_module(model: "PreTrainedModel", module: "torch.nn.Module") -> None: + r""" + Sets module as a leaf module to skip partitioning in deepspeed zero3. + """ + if is_deepspeed_zero3_enabled(): + require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") + from deepspeed.utils import set_z3_leaf_modules # type: ignore + + set_z3_leaf_modules(model, [module]) def find_all_linear_modules(model: "PreTrainedModel") -> List[str]: r""" - Finds all available modules to apply lora. + Finds all available modules to apply lora or galore. """ quantization_method = getattr(model, "quantization_method", None) if quantization_method is None: diff --git a/src/llmtuner/train/dpo/collator.py b/src/llmtuner/train/dpo/collator.py deleted file mode 100644 index 7e8ba1c5..00000000 --- a/src/llmtuner/train/dpo/collator.py +++ /dev/null @@ -1,54 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, List, Sequence, Tuple - -import torch -from transformers import DataCollatorForSeq2Seq - - -@dataclass -class DPODataCollatorWithPadding(DataCollatorForSeq2Seq): - r""" - Data collator for pairwise data. - """ - - def _pad_labels(self, batch: torch.Tensor, positions: List[Tuple[int, int]]) -> torch.Tensor: - padded_labels = [] - for feature, (prompt_len, answer_len) in zip(batch, positions): - if self.tokenizer.padding_side == "left": - start, end = feature.size(0) - answer_len, feature.size(0) - else: - start, end = prompt_len, prompt_len + answer_len - padded_tensor = self.label_pad_token_id * torch.ones_like(feature) - padded_tensor[start:end] = feature[start:end] - padded_labels.append(padded_tensor) - return torch.stack(padded_labels, dim=0).contiguous() # in contiguous memory - - def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: - r""" - Pads batched data to the longest sequence in the batch. - - We generate 2 * n examples where the first n examples represent chosen examples and - the last n examples represent rejected examples. - """ - concatenated_features = [] - label_positions = [] - for key in ("chosen_ids", "rejected_ids"): - for feature in features: - prompt_len, answer_len = len(feature["prompt_ids"]), len(feature[key]) - concatenated_features.append( - { - "input_ids": feature["prompt_ids"] + feature[key], - "attention_mask": [1] * (prompt_len + answer_len), - } - ) - label_positions.append((prompt_len, answer_len)) - - batch = self.tokenizer.pad( - concatenated_features, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch["labels"] = self._pad_labels(batch["input_ids"], label_positions) - return batch diff --git a/src/llmtuner/train/rm/collator.py b/src/llmtuner/train/rm/collator.py deleted file mode 100644 index 8d5d4ada..00000000 --- a/src/llmtuner/train/rm/collator.py +++ /dev/null @@ -1,29 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, Sequence - -import torch -from transformers import DataCollatorWithPadding - - -@dataclass -class PairwiseDataCollatorWithPadding(DataCollatorWithPadding): - r""" - Data collator for pairwise data. - """ - - def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: - r""" - Pads batched data to the longest sequence in the batch. - - We generate 2 * n examples where the first n examples represent chosen examples and - the last n examples represent rejected examples. 
- """ - features = [ - { - "input_ids": feature["prompt_ids"] + feature[key], - "attention_mask": [1] * (len(feature["prompt_ids"]) + len(feature[key])), - } - for key in ("chosen_ids", "rejected_ids") - for feature in features - ] - return super().__call__(features)
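Both deleted collators implement the layout their docstrings describe: a batch of n preference pairs is flattened into 2 * n examples, with all chosen responses first and all rejected responses second. This patch removes the files without showing a replacement; the following is only a self-contained sketch of the flattening convention they implemented:

```python
from typing import Any, Dict, List, Sequence


def flatten_pairwise(features: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn n preference pairs into 2 * n examples: all chosen first, then all rejected."""
    flattened = []
    for key in ("chosen_ids", "rejected_ids"):  # chosen block, then rejected block
        for feature in features:
            input_ids = feature["prompt_ids"] + feature[key]
            flattened.append({"input_ids": input_ids, "attention_mask": [1] * len(input_ids)})
    return flattened


# Two pairs become four examples: indices 0-1 are chosen, 2-3 are rejected.
pairs = [
    {"prompt_ids": [1, 2], "chosen_ids": [3], "rejected_ids": [4, 5]},
    {"prompt_ids": [6], "chosen_ids": [7, 8], "rejected_ids": [9]},
]
print(flatten_pairwise(pairs))
```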