LLaMA-Factory/data/example_dataset/example_dataset.py

import json
from typing import Any, Dict, Iterator, List, Tuple

import datasets


# Dataset-card metadata; the builder in this file forwards these values to
# `datasets.DatasetInfo`. Empty strings mean "not provided".
_DESCRIPTION = "An example of dataset for LLaMA."
_HOMEPAGE = ""
_LICENSE = ""
_CITATION = ""

# Location of the JSON data file handed to the download manager.
# NOTE(review): presumably resolved relative to this script - confirm.
_URL = "examples.json"


class ExampleDataset(datasets.GeneratorBasedBuilder):
    """Dataset builder that exposes the records of one JSON file as a train split.

    Each record is declared with three string columns (``instruction``,
    ``input``, ``output``) and a ``history`` column holding a sequence of
    sequences of strings.
    """

    VERSION = datasets.Version("0.0.0")

    def _info(self) -> datasets.DatasetInfo:
        """Return the dataset metadata and the declared column schema."""
        features = datasets.Features({
            "instruction": datasets.Value("string"),
            "input": datasets.Value("string"),
            "output": datasets.Value("string"),
            # Nested list of strings.
            # NOTE(review): presumably earlier dialogue turns - confirm against the data file.
            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
        })
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Fetch the data file and declare the single TRAIN split.

        Args:
            dl_manager: Download manager used to obtain the file named by ``_URL``.

        Returns:
            A one-element list whose generator receives the downloaded path as
            the ``filepath`` keyword argument of ``_generate_examples``.
        """
        file_path = dl_manager.download(_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": file_path
                }
            )
        ]

    def _generate_examples(self, filepath: str) -> Iterator[Tuple[int, Dict[str, Any]]]:
        """Yield ``(key, example)`` pairs read from the JSON file at ``filepath``.

        Args:
            filepath: Path of a UTF-8 encoded JSON file.
                NOTE(review): assumed to hold a top-level array of record
                objects matching the declared features - confirm.

        Yields:
            The zero-based position of each top-level element together with
            the element itself.
        """
        # Read inside a context manager so the file handle is closed as soon
        # as parsing finishes instead of being left to the garbage collector.
        with open(filepath, "r", encoding="utf-8") as json_file:
            example_dataset = json.load(json_file)
        for key, example in enumerate(example_dataset):
            yield key, example
Initial commit 2023-05-28 10:09:04 +00:00			`import json`
			`import datasets`
			`from typing import Any, Dict, List`


			`_DESCRIPTION = "An example of dataset for LLaMA."`
			`_CITATION = ""`
			`_HOMEPAGE = ""`
			`_LICENSE = ""`
			`_URL = "examples.json"`


			`class ExampleDataset(datasets.GeneratorBasedBuilder):`

			`VERSION = datasets.Version("0.0.0")`

			`def _info(self) -> datasets.DatasetInfo:`
			`features = datasets.Features({`
			`"instruction": datasets.Value("string"),`
			`"input": datasets.Value("string"),`
			`"output": datasets.Value("string"),`
			`"history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))`
			`})`
			`return datasets.DatasetInfo(`
			`description=_DESCRIPTION,`
			`features=features,`
			`homepage=_HOMEPAGE,`
			`license=_LICENSE,`
			`citation=_CITATION`
			`)`

			`def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:`
			`file_path = dl_manager.download(_URL)`
			`return [`
			`datasets.SplitGenerator(`
			`name=datasets.Split.TRAIN,`
			`gen_kwargs={`
			`"filepath": file_path`
			`}`
			`)`
			`]`

			`def _generate_examples(self, filepath: str) -> Dict[int, Dict[str, Any]]:`
			`example_dataset = json.load(open(filepath, "r", encoding="utf-8"))`
			`for key, example in enumerate(example_dataset):`
			`yield key, example`