diff --git a/data/dataset_info.json b/data/dataset_info.json index 01f4db21..ceb81379 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -129,7 +129,6 @@ }, "firefly": { "hf_hub_url": "YeungNLP/firefly-train-1.1M", - "ms_hub_url": "AI-ModelScope/firefly-train-1.1M", "columns": { "prompt": "input", "response": "target" diff --git a/src/llmtuner/data/loader.py b/src/llmtuner/data/loader.py index 41c12422..7bd52caa 100644 --- a/src/llmtuner/data/loader.py +++ b/src/llmtuner/data/loader.py @@ -58,6 +58,11 @@ def get_dataset( dataset = MsDataset.load( dataset_name=data_path, subset_name=data_name, + split=data_args.split, + data_files=data_files, + cache_dir=model_args.cache_dir, + token=model_args.ms_hub_token, + streaming=(data_args.streaming and (dataset_attr.load_from != "file")), ).to_hf_dataset() else: dataset = load_dataset( diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index ebf6cafa..c5819cea 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -59,6 +59,10 @@ class ModelArguments: default=None, metadata={"help": "Auth token to log in with Hugging Face Hub."} ) + ms_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with ModelScope Hub."} + ) def __post_init__(self): self.compute_dtype = None