diff --git a/README.md b/README.md index 0ef472b3..52b28e43 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,8 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca) - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca) -- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) -- [Self-cognition (zh)](data/self_cognition.json) +- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) +- [Self Cognition (zh)](data/self_cognition.json) - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1) - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection) - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) @@ -176,6 +176,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list - [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca) - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct) - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) +- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa) - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa) - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn) - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) @@ -190,14 +191,14 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2) - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) -- [FreedomIntelligence Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de) -- 
[LeoLM/OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de) -- [FreedomIntelligence/evol-instruct-deutsch (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de) -- [wiki_qa (de)](https://huggingface.co/datasets/wiki_qa) -- [cognitivecomputations/dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de) -- [booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) -- [jondurbin/airoboros-3.0 (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) -- [stingning/ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) +- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de) +- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de) +- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de) +- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de) +- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) +- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) +- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) +
Preference datasets @@ -206,7 +207,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1) - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) -- [Intel/orca_dpo_pairs (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de) +- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
diff --git a/README_zh.md b/README_zh.md index 623d799b..5e81ffcf 100644 --- a/README_zh.md +++ b/README_zh.md @@ -157,8 +157,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca) - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca) -- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) -- [Self-cognition (zh)](data/self_cognition.json) +- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) +- [Self Cognition (zh)](data/self_cognition.json) - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1) - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection) - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset) @@ -176,6 +176,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca) - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct) - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M) +- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa) - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa) - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn) - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) @@ -188,6 +189,15 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m) - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k) - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2) +- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de) +- 
[Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de) +- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de) +- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de) +- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de) +- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de) +- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de) +- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de) +- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de) @@ -197,6 +207,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1) - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM) - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar) +- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de) diff --git a/assets/wechat.jpg b/assets/wechat.jpg index 28e59a16..b1744466 100644 Binary files a/assets/wechat.jpg and b/assets/wechat.jpg differ diff --git a/src/llmtuner/chat/chat_model.py b/src/llmtuner/chat/chat_model.py index f8a2b14f..0d9c6395 100644 --- a/src/llmtuner/chat/chat_model.py +++ b/src/llmtuner/chat/chat_model.py @@ -28,7 +28,7 @@ class ChatModel: ) self.tokenizer.padding_side = "left" if self.can_generate else "right" self.model = dispatch_model(self.model) - self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer) + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) def _process_args( self, diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 20a5ec0d..8cae318e 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -142,7 +142,7 @@ def get_dataset( stage: 
Literal["pt", "sft", "rm", "ppo"], # split: Optional[str] = "train", # TODO: add split ) -> Union["Dataset", "IterableDataset"]: - template = get_template_and_fix_tokenizer(data_args.template, tokenizer) + template = get_template_and_fix_tokenizer(tokenizer, data_args.template) if data_args.train_on_prompt and template.efficient_eos: raise ValueError("Current template does not support `train_on_prompt`.") diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py index 00a005dc..41ad306c 100644 --- a/src/llmtuner/data/template.py +++ b/src/llmtuner/data/template.py @@ -198,7 +198,7 @@ class Llama2Template(Template): templates: Dict[str, Template] = {} -def register_template( +def _register_template( name: str, format_user: Optional["Formatter"] = None, format_assistant: Optional["Formatter"] = None, @@ -236,29 +236,45 @@ def register_template( ) -def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer") -> Template: - if tokenizer.eos_token_id is None: - tokenizer.eos_token = "<|endoftext|>" +def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None: + is_added = tokenizer.eos_token_id is None + is_oov = eos_token not in tokenizer.get_vocab() + tokenizer.add_special_tokens({"eos_token": eos_token}) + + if is_added: logger.info("Add eos token: {}".format(tokenizer.eos_token)) + else: + logger.info("Replace eos token: {}".format(tokenizer.eos_token)) - if tokenizer.pad_token_id is None: - tokenizer.pad_token = tokenizer.eos_token - logger.info("Add pad token: {}".format(tokenizer.pad_token)) + if is_oov: + logger.warning("New token is added, you must enable `resize_vocab` to activate it.") - if name is None: # for pre-training - return None - template = templates.get(name, None) - assert template is not None, "Template {} does not exist.".format(name) +def get_template_and_fix_tokenizer( + tokenizer: "PreTrainedTokenizer", + name: Optional[str] = None, +) -> Template: + if name is None: + template = 
templates["vanilla"] # placeholder +    else: +        template = templates.get(name, None) +        if template is None: +            raise ValueError("Template {} does not exist.".format(name))      stop_words = template.stop_words     if template.replace_eos:         if not stop_words:             raise ValueError("Stop words are required to replace the EOS token.") -        tokenizer.eos_token = stop_words[0] +        _add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])         stop_words = stop_words[1:] -        logger.info("Replace eos token: {}".format(tokenizer.eos_token)) +     +    if tokenizer.eos_token_id is None: +        _add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>") +     +    if tokenizer.pad_token_id is None: +        tokenizer.pad_token = tokenizer.eos_token +        logger.info("Add pad token: {}".format(tokenizer.pad_token))      if stop_words:         tokenizer.add_special_tokens( @@ -269,7 +285,7 @@ def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer")      return template  -register_template( +_register_template(     name="alpaca",     format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),     format_separator=EmptyFormatter(slots=["\n\n"]), @@ -279,7 +295,7 @@ register_template( ) -register_template( +_register_template(     name="aquila",     format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),     format_separator=EmptyFormatter(slots=["###"]), @@ -292,21 +308,21 @@ register_template( ) -register_template( +_register_template(     name="baichuan",     format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]),     efficient_eos=True, ) -register_template( +_register_template(     name="baichuan2",     format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]),     efficient_eos=True, ) -register_template( +_register_template(     name="belle",     format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), @@ -315,13 +331,13 @@ register_template( ) -register_template( +_register_template(     name="bluelm",
format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]), ) -register_template( +_register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), @@ -331,7 +347,7 @@ register_template( ) -register_template( +_register_template( name="chatglm3", format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), @@ -351,14 +367,25 @@ register_template( ) -register_template( +_register_template( + name="chatml_de", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.", + stop_words=["<|im_end|>"], + replace_eos=True, +) + + +_register_template( name="codegeex2", format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]), force_system=True, ) -register_template( +_register_template( name="cpm", format_user=StringFormatter(slots=["<用户>{{content}}"]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), @@ -366,7 +393,7 @@ register_template( ) -register_template( +_register_template( name="deepseek", format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), @@ -374,7 +401,7 @@ register_template( ) -register_template( +_register_template( name="deepseekcoder", format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]), format_assistant=StringFormatter(slots=["\n", "{{content}}"]), @@ -390,14 +417,14 @@ register_template( ) -register_template( +_register_template( 
name="default", format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]), format_separator=EmptyFormatter(slots=["\n"]), ) -register_template( +_register_template( name="falcon", format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]), format_separator=EmptyFormatter(slots=["\n"]), @@ -405,7 +432,7 @@ register_template( ) -register_template( +_register_template( name="intern", format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), format_separator=EmptyFormatter(slots=[{"token": ""}, "\n"]), @@ -414,7 +441,7 @@ register_template( ) -register_template( +_register_template( name="intern2", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]), @@ -431,7 +458,7 @@ register_template( ) -register_template( +_register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]), @@ -448,7 +475,7 @@ register_template( ) -register_template( +_register_template( name="llama2_zh", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]), @@ -456,7 +483,7 @@ register_template( ) -register_template( +_register_template( name="mistral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), @@ -464,7 +491,7 @@ register_template( ) -register_template( +_register_template( name="openchat", format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), format_assistant=StringFormatter(slots=["{{content}}"]), @@ -473,7 +500,7 @@ register_template( ) -register_template( +_register_template( name="orion", format_user=StringFormatter(slots=["Human: 
{{content}}\n\nAssistant: ", {"eos_token"}]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), @@ -481,7 +508,7 @@ register_template( ) -register_template( +_register_template( name="qwen", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), @@ -492,7 +519,7 @@ register_template( ) -register_template( +_register_template( name="solar", format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]), format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]), @@ -500,7 +527,7 @@ register_template( ) -register_template( +_register_template( name="starchat", format_user=StringFormatter( slots=[{"token": "<|user|>"}, "\n{{content}}", {"token": "<|end|>"}, "\n", {"token": "<|assistant|>"}] @@ -513,20 +540,12 @@ register_template( ) -register_template(name="vanilla") - -register_template( - name="chatml_de", - format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), - format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), - format_separator=EmptyFormatter(slots=["\n"]), - default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.", - stop_words=["<|im_end|>"], - replace_eos=True, +_register_template( + name="vanilla", ) -register_template( +_register_template( name="vicuna", format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]), default_system=( @@ -536,7 +555,7 @@ register_template( ) -register_template( +_register_template( name="xuanyuan", format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]), default_system=( @@ -547,10 +566,13 @@ register_template( ) -register_template(name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "])) +_register_template( + name="xverse", + format_user=StringFormatter(slots=["Human: 
{{content}}\n\nAssistant: "]), +) -register_template( +_register_template( name="yayi", format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]), format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]), @@ -570,7 +592,7 @@ register_template( ) -register_template( +_register_template( name="yi", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), @@ -579,7 +601,7 @@ register_template( ) -register_template( +_register_template( name="yuan", format_user=StringFormatter(slots=["{{content}}", {"token": ""}]), format_separator=EmptyFormatter(slots=["\n"]), @@ -588,7 +610,7 @@ register_template( ) -register_template( +_register_template( name="zephyr", format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]), format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]), @@ -596,7 +618,7 @@ register_template( ) -register_template( +_register_template( name="ziya", format_user=StringFormatter(slots=[{"token": ""}, ":{{content}}\n", {"token": ""}, ":"]), format_separator=EmptyFormatter(slots=["\n"]), diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py index d0c949ee..7e8b064a 100644 --- a/src/llmtuner/eval/evaluator.py +++ b/src/llmtuner/eval/evaluator.py @@ -24,7 +24,7 @@ class Evaluator: self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args) self.tokenizer.padding_side = "right" # avoid overflow issue in batched inference for llama2 self.model = dispatch_model(self.model) - self.template = get_template_and_fix_tokenizer(self.data_args.template, self.tokenizer) + self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template) self.eval_template = get_eval_template(self.eval_args.lang) self.choice_inputs = [ self.tokenizer.encode(self.eval_template.prefix + ch, 
add_special_tokens=False)[-1] for ch in CHOICES