LLaMA-Factory/data/example_dataset/example_dataset.py

47 lines
1.4 KiB
Python
Raw Normal View History

2023-05-28 10:09:04 +00:00
import json
import datasets
2024-03-10 05:35:20 +00:00
from typing import Any, Dict, Generator, List, Tuple
2023-05-28 10:09:04 +00:00
2023-11-09 07:53:23 +00:00
# Human-readable summary passed as `description` to `datasets.DatasetInfo`
# by `ExampleDataset._info`.
_DESCRIPTION = "An example of dataset."
2023-05-28 10:09:04 +00:00
# Descriptive metadata forwarded to `datasets.DatasetInfo` by
# `ExampleDataset._info`; all three are empty strings for this dataset.
_CITATION, _HOMEPAGE, _LICENSE = "", "", ""

# Path of the JSON data file handed to the download manager in
# `ExampleDataset._split_generators`.
_URL = "examples.json"
class ExampleDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that serves the contents of one JSON file as a train split.

    The file referenced by ``_URL`` is resolved through the download manager,
    parsed in full with ``json.load``, and each item obtained by iterating the
    parsed value is emitted as one example.
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Describe the dataset schema and its descriptive metadata.

        Returns:
            A ``datasets.DatasetInfo`` declaring the string columns
            ``instruction``, ``input`` and ``output`` plus a ``history``
            column typed as a sequence of sequences of strings, together
            with the module-level description, homepage, license and
            citation values.
        """
        features = datasets.Features(
            {
                "instruction": datasets.Value("string"),
                "input": datasets.Value("string"),
                "output": datasets.Value("string"),
                "history": datasets.Sequence(
                    datasets.Sequence(datasets.Value("string"))
                ),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(
        self, dl_manager: datasets.DownloadManager
    ) -> List[datasets.SplitGenerator]:
        """Resolve the data file and declare the single ``train`` split.

        Args:
            dl_manager: Download manager used to resolve ``_URL``.

        Returns:
            A one-element list holding the train split generator; its
            ``gen_kwargs`` carry the resolved path as ``filepath`` for
            ``_generate_examples``.
        """
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": file_path},
            )
        ]

    def _generate_examples(
        self, filepath: str
    ) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
        """Yield ``(index, example)`` pairs read from a JSON file.

        Args:
            filepath: Path of the UTF-8 encoded JSON file to read.

        Yields:
            Tuples of a zero-based running index and the corresponding item
            of the parsed JSON value (presumably a list of record dicts
            matching the declared features; verify against the data file).
        """
        # Use a context manager so the handle is closed as soon as parsing
        # finishes instead of lingering until garbage collection.
        with open(filepath, "r", encoding="utf-8") as json_file:
            example_dataset = json.load(json_file)

        for key, example in enumerate(example_dataset):
            yield key, example