diff --git a/README.md b/README.md
index 0ef472b3..52b28e43 100644
--- a/README.md
+++ b/README.md
@@ -157,8 +157,8 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Self-cognition (zh)](data/self_cognition.json)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self Cognition (zh)](data/self_cognition.json)
- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
@@ -176,6 +176,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
@@ -190,14 +191,14 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
- [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
-- [FreedomIntelligence Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
-- [LeoLM/OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
-- [FreedomIntelligence/evol-instruct-deutsch (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
-- [wiki_qa (de)](https://huggingface.co/datasets/wiki_qa)
-- [cognitivecomputations/dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
-- [booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
-- [jondurbin/airoboros-3.0 (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
-- [stingning/ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+
Preference datasets
@@ -206,7 +207,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
-- [Intel/orca_dpo_pairs (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
diff --git a/README_zh.md b/README_zh.md
index 623d799b..5e81ffcf 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -157,8 +157,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
- [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
- [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Self-cognition (zh)](data/self_cognition.json)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self Cognition (zh)](data/self_cognition.json)
- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
- [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
@@ -176,6 +176,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
- [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
- [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
- [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
- [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
- [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
@@ -188,6 +189,15 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
- [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
- [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
- [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
+- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
+- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
@@ -197,6 +207,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
- [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
diff --git a/assets/wechat.jpg b/assets/wechat.jpg
index 28e59a16..b1744466 100644
Binary files a/assets/wechat.jpg and b/assets/wechat.jpg differ
diff --git a/src/llmtuner/chat/chat_model.py b/src/llmtuner/chat/chat_model.py
index f8a2b14f..0d9c6395 100644
--- a/src/llmtuner/chat/chat_model.py
+++ b/src/llmtuner/chat/chat_model.py
@@ -28,7 +28,7 @@ class ChatModel:
)
self.tokenizer.padding_side = "left" if self.can_generate else "right"
self.model = dispatch_model(self.model)
- self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer)
+ self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
def _process_args(
self,
diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py
index 20a5ec0d..8cae318e 100644
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@@ -142,7 +142,7 @@ def get_dataset(
stage: Literal["pt", "sft", "rm", "ppo"],
# split: Optional[str] = "train", # TODO: add split
) -> Union["Dataset", "IterableDataset"]:
- template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
+ template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
if data_args.train_on_prompt and template.efficient_eos:
raise ValueError("Current template does not support `train_on_prompt`.")
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index 00a005dc..41ad306c 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -198,7 +198,7 @@ class Llama2Template(Template):
templates: Dict[str, Template] = {}
-def register_template(
+def _register_template(
name: str,
format_user: Optional["Formatter"] = None,
format_assistant: Optional["Formatter"] = None,
@@ -236,29 +236,45 @@ def register_template(
)
-def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer") -> Template:
- if tokenizer.eos_token_id is None:
- tokenizer.eos_token = "<|endoftext|>"
+def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
+ is_added = tokenizer.eos_token_id is None
+ is_oov = eos_token not in tokenizer.get_vocab()
+ tokenizer.add_special_tokens({"eos_token": eos_token})
+
+ if is_added:
logger.info("Add eos token: {}".format(tokenizer.eos_token))
+ else:
+ logger.info("Replace eos token: {}".format(tokenizer.eos_token))
- if tokenizer.pad_token_id is None:
- tokenizer.pad_token = tokenizer.eos_token
- logger.info("Add pad token: {}".format(tokenizer.pad_token))
+ if is_oov:
+ logger.warning("New token is added, you must enable `resize_vocab` to activate it.")
- if name is None: # for pre-training
- return None
- template = templates.get(name, None)
- assert template is not None, "Template {} does not exist.".format(name)
+def get_template_and_fix_tokenizer(
+ tokenizer: "PreTrainedTokenizer",
+ name: Optional[str] = None,
+) -> Template:
+ if name is None:
+ template = templates["vanilla"] # placeholder
+ else:
+ template = templates.get(name, None)
+        if template is None:
+ raise ValueError("Template {} does not exist.".format(name))
stop_words = template.stop_words
if template.replace_eos:
if not stop_words:
raise ValueError("Stop words are required to replace the EOS token.")
- tokenizer.eos_token = stop_words[0]
+ _add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
stop_words = stop_words[1:]
- logger.info("Replace eos token: {}".format(tokenizer.eos_token))
+
+ if tokenizer.eos_token_id is None:
+ _add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
+
+ if tokenizer.pad_token_id is None:
+ tokenizer.pad_token = tokenizer.eos_token
+ logger.info("Add pad token: {}".format(tokenizer.pad_token))
if stop_words:
tokenizer.add_special_tokens(
@@ -269,7 +285,7 @@ def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer")
return template
-register_template(
+_register_template(
name="alpaca",
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
format_separator=EmptyFormatter(slots=["\n\n"]),
@@ -279,7 +295,7 @@ register_template(
)
-register_template(
+_register_template(
name="aquila",
format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
format_separator=EmptyFormatter(slots=["###"]),
@@ -292,21 +308,21 @@ register_template(
)
-register_template(
+_register_template(
name="baichuan",
format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]),
efficient_eos=True,
)
-register_template(
+_register_template(
name="baichuan2",
format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]),
efficient_eos=True,
)
-register_template(
+_register_template(
name="belle",
format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@@ -315,13 +331,13 @@ register_template(
)
-register_template(
+_register_template(
name="bluelm",
format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
)
-register_template(
+_register_template(
name="chatglm2",
format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]),
format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
@@ -331,7 +347,7 @@ register_template(
)
-register_template(
+_register_template(
name="chatglm3",
format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
@@ -351,14 +367,25 @@ register_template(
)
-register_template(
+_register_template(
+ name="chatml_de",
+ format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+ format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+ format_separator=EmptyFormatter(slots=["\n"]),
+ default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
+ stop_words=["<|im_end|>"],
+ replace_eos=True,
+)
+
+
+_register_template(
name="codegeex2",
format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
force_system=True,
)
-register_template(
+_register_template(
name="cpm",
format_user=StringFormatter(slots=["<用户>{{content}}"]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@@ -366,7 +393,7 @@ register_template(
)
-register_template(
+_register_template(
name="deepseek",
format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@@ -374,7 +401,7 @@ register_template(
)
-register_template(
+_register_template(
name="deepseekcoder",
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
@@ -390,14 +417,14 @@ register_template(
)
-register_template(
+_register_template(
name="default",
format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]),
format_separator=EmptyFormatter(slots=["\n"]),
)
-register_template(
+_register_template(
name="falcon",
format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
format_separator=EmptyFormatter(slots=["\n"]),
@@ -405,7 +432,7 @@ register_template(
)
-register_template(
+_register_template(
name="intern",
format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]),
format_separator=EmptyFormatter(slots=[{"token": ""}, "\n"]),
@@ -414,7 +441,7 @@ register_template(
)
-register_template(
+_register_template(
name="intern2",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]),
@@ -431,7 +458,7 @@ register_template(
)
-register_template(
+_register_template(
name="llama2",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]),
@@ -448,7 +475,7 @@ register_template(
)
-register_template(
+_register_template(
name="llama2_zh",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]),
@@ -456,7 +483,7 @@ register_template(
)
-register_template(
+_register_template(
name="mistral",
format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@@ -464,7 +491,7 @@ register_template(
)
-register_template(
+_register_template(
name="openchat",
format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
format_assistant=StringFormatter(slots=["{{content}}"]),
@@ -473,7 +500,7 @@ register_template(
)
-register_template(
+_register_template(
name="orion",
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@@ -481,7 +508,7 @@ register_template(
)
-register_template(
+_register_template(
name="qwen",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
@@ -492,7 +519,7 @@ register_template(
)
-register_template(
+_register_template(
name="solar",
format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
@@ -500,7 +527,7 @@ register_template(
)
-register_template(
+_register_template(
name="starchat",
format_user=StringFormatter(
slots=[{"token": "<|user|>"}, "\n{{content}}", {"token": "<|end|>"}, "\n", {"token": "<|assistant|>"}]
@@ -513,20 +540,12 @@ register_template(
)
-register_template(name="vanilla")
-
-register_template(
- name="chatml_de",
- format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
- format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
- format_separator=EmptyFormatter(slots=["\n"]),
- default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
- stop_words=["<|im_end|>"],
- replace_eos=True,
+_register_template(
+ name="vanilla",
)
-register_template(
+_register_template(
name="vicuna",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
@@ -536,7 +555,7 @@ register_template(
)
-register_template(
+_register_template(
name="xuanyuan",
format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
default_system=(
@@ -547,10 +566,13 @@ register_template(
)
-register_template(name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]))
+_register_template(
+ name="xverse",
+ format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
+)
-register_template(
+_register_template(
name="yayi",
format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),
@@ -570,7 +592,7 @@ register_template(
)
-register_template(
+_register_template(
name="yi",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
@@ -579,7 +601,7 @@ register_template(
)
-register_template(
+_register_template(
name="yuan",
format_user=StringFormatter(slots=["{{content}}", {"token": ""}]),
format_separator=EmptyFormatter(slots=["\n"]),
@@ -588,7 +610,7 @@ register_template(
)
-register_template(
+_register_template(
name="zephyr",
format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
@@ -596,7 +618,7 @@ register_template(
)
-register_template(
+_register_template(
name="ziya",
format_user=StringFormatter(slots=[{"token": ""}, ":{{content}}\n", {"token": ""}, ":"]),
format_separator=EmptyFormatter(slots=["\n"]),
diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py
index d0c949ee..7e8b064a 100644
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@@ -24,7 +24,7 @@ class Evaluator:
self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
self.tokenizer.padding_side = "right" # avoid overflow issue in batched inference for llama2
self.model = dispatch_model(self.model)
- self.template = get_template_and_fix_tokenizer(self.data_args.template, self.tokenizer)
+ self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
self.eval_template = get_eval_template(self.eval_args.lang)
self.choice_inputs = [
self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES