fix #3602
commit b0888262e3
parent 6159acbaa0
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### Merge LoRA Adapters
 
+Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
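Since the new note rules out `quantization_bit` during merging, a quick pre-flight check on the export config can catch a stale setting before running the command above; a minimal sketch using the same config path (the grep pattern is illustrative):

```bash
# Pre-flight check (illustrative): merging LoRA adapters requires an unquantized base model,
# so the export config must not set quantization_bit.
if grep -q "^quantization_bit" examples/merge_lora/llama3_lora_sft.yaml; then
    echo "Remove quantization_bit from the config before merging LoRA adapters." >&2
    exit 1
fi
CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
```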
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### 合并 LoRA 适配器
 
+注：请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```

@@ -1,6 +1,12 @@
 #!/bin/bash
 
-python -m torch.distributed.run \
+NPROC_PER_NODE=4
+NNODES=2
+RANK=0
+MASTER_ADDR=192.168.0.1
+MASTER_PORT=29500
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
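The rewritten multi-node launcher above hard-codes RANK=0, i.e. it is the command for the master node. On the second of the two nodes, the same launch would be repeated with only the rank changed; a sketch, assuming the unshown tail of the command passes the master address/port and the training config (those lines fall outside the hunk):

```bash
# Worker node (rank 1 of 2): identical launcher, only RANK differs from the master node.
NPROC_PER_NODE=4
NNODES=2
RANK=1                     # the master node above uses RANK=0
MASTER_ADDR=192.168.0.1    # address of the master node, reachable from this machine
MASTER_PORT=29500

CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes $NNODES \
    --node_rank $RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT \
    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml   # config path assumed, not shown in the hunk
```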
@@ -1,4 +1,9 @@
 #!/bin/bash
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
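On a single node, `--standalone` tells the launcher to set up a local rendezvous on its own, so no master address or port needs to be exported. `torchrun` is the console-script form of `python -m torch.distributed.run`, so the same script can be written equivalently as:

```bash
# Equivalent single-node launch via the torchrun entry point (same semantics as above).
NPROC_PER_NODE=4

CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes 1 \
    --standalone \
    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
```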
@@ -1,5 +1,9 @@
 #!/bin/bash
-# ZeRO-3 enables weight sharding on multiple GPUs
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
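As in the script above, `NPROC_PER_NODE` and `CUDA_VISIBLE_DEVICES` have to stay consistent: the launcher starts one worker process per requested rank, and each worker expects its own GPU. To run the same ZeRO-3 recipe on two GPUs, for example:

```bash
# Same ZeRO-3 launch restricted to two GPUs: one process per visible device, config unchanged.
NPROC_PER_NODE=2

CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.run \
    --nproc_per_node $NPROC_PER_NODE \
    --nnodes 1 \
    --standalone \
    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
```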
@@ -1,4 +1,4 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora weights
+# Note: DO NOT use quantized model or quantization_bit when merging lora adapters
 
 # model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct

@@ -0,0 +1,19 @@
+import os
+
+import uvicorn
+
+from llmtuner.api.app import create_app
+from llmtuner.chat import ChatModel
+
+
+def main():
+    chat_model = ChatModel()
+    app = create_app(chat_model)
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print("Visit http://localhost:{}/docs for API document.".format(api_port))
+    uvicorn.run(app, host=api_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()
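The new launcher reads its bind address and port from `API_HOST` and `API_PORT`, wraps a `ChatModel` in the ASGI app returned by `create_app`, and serves it with uvicorn. A usage sketch; the file name and the inference config path are assumptions (neither is shown in the hunk), and `ChatModel()` is expected to pick up model/adapter settings from the command line as the other llmtuner entry points do:

```bash
# File name assumed to be src/api.py -- the new-file path is not part of the hunk above.
# The yaml argument is purely illustrative; point ChatModel() at whatever inference
# settings you normally use.
API_HOST=0.0.0.0 API_PORT=8000 CUDA_VISIBLE_DEVICES=0 \
    python src/api.py examples/inference/llama3_lora_sft.yaml

# The script prints the docs URL; the interactive API documentation lives at:
curl http://localhost:8000/docs
```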