# Patch summary (text ahead of the first "diff --git" header is skipped by
# `git apply` and `patch`):
#   * examples/README.md, examples/README_zh.md: add a note not to use a
#     quantized model or `quantization_bit` when merging LoRA adapters.
#   * examples/full_multi_gpu/multi_node.sh: assign NPROC_PER_NODE, NNODES,
#     RANK, MASTER_ADDR and MASTER_PORT in the script and prefix the
#     torch.distributed.run launch with CUDA_VISIBLE_DEVICES=0,1,2,3.
#   * examples/full_multi_gpu/single_node.sh, examples/lora_multi_gpu/ds_zero3.sh:
#     replace the `deepspeed --include "localhost:0,1,2,3"` launcher with
#     `python -m torch.distributed.run --nnodes 1 --standalone`.
#   * examples/merge_lora/llama3_lora_sft.yaml: reword the header comment
#     ("lora weights" -> "lora adapters").
#   * src/api.py: new entry point that builds a ChatModel, wraps it with
#     create_app and serves it with uvicorn on API_HOST / API_PORT
#     (defaults "0.0.0.0" and "8000").
# NOTE(review): leading indentation of continuation lines and Python bodies
# was reconstructed as 4 spaces — confirm against the target tree.
diff --git a/examples/README.md b/examples/README.md
index ba993b99..ce19f9d1 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### Merge LoRA Adapters
 
+Note: DO NOT use quantized model or `quantization_bit` when merging LoRA adapters.
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 491ec688..91bdcda9 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -148,6 +148,8 @@ bash examples/full_multi_gpu/predict.sh
 
 #### 合并 LoRA 适配器
 
+注:请勿使用量化后的模型或 `quantization_bit` 参数来合并 LoRA 适配器。
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
 ```
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index 9c2508b6..962409a1 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 
-python -m torch.distributed.run \
+NPROC_PER_NODE=4
+NNODES=2
+RANK=0
+MASTER_ADDR=192.168.0.1
+MASTER_PORT=29500
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index f391166a..97f7af64 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index 304f3780..b8fd2640 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
-# ZeRO-3 enables weight sharding on multiple GPUs
 
-deepspeed --include "localhost:0,1,2,3" \
+NPROC_PER_NODE=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes 1 \
+    --standalone \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/merge_lora/llama3_lora_sft.yaml b/examples/merge_lora/llama3_lora_sft.yaml
index 508a0b8c..de41d48b 100644
--- a/examples/merge_lora/llama3_lora_sft.yaml
+++ b/examples/merge_lora/llama3_lora_sft.yaml
@@ -1,4 +1,4 @@
-# Note: DO NOT use quantized model or quantization_bit when merging lora weights
+# Note: DO NOT use quantized model or quantization_bit when merging lora adapters
 
 # model
 model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
diff --git a/src/api.py b/src/api.py
new file mode 100644
index 00000000..277920ac
--- /dev/null
+++ b/src/api.py
@@ -0,0 +1,19 @@
+import os
+
+import uvicorn
+
+from llmtuner.api.app import create_app
+from llmtuner.chat import ChatModel
+
+
+def main():
+    chat_model = ChatModel()
+    app = create_app(chat_model)
+    api_host = os.environ.get("API_HOST", "0.0.0.0")
+    api_port = int(os.environ.get("API_PORT", "8000"))
+    print("Visit http://localhost:{}/docs for API document.".format(api_port))
+    uvicorn.run(app, host=api_host, port=api_port)
+
+
+if __name__ == "__main__":
+    main()