diff --git a/datasets/the_pile/README.md b/datasets/the_pile/README.md index d630e4d05c1..d8cb777d04c 100644 --- a/datasets/the_pile/README.md +++ b/datasets/the_pile/README.md @@ -67,7 +67,7 @@ datasets combined together. ### Languages -[More Information Needed] +This dataset is in English (`EN`). ## Dataset Structure @@ -81,6 +81,18 @@ datasets combined together. } ``` +#### free_law +``` +{ + 'meta': { + 'case_jurisdiction': 'scotus.tar.gz', + 'case_ID': '110921.json', + 'date_created': '2010-04-28T17:12:49Z' + }, + 'text': '\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued...' +} +``` + #### pubmed_central ``` { @@ -93,9 +105,22 @@ datasets combined together. #### all +- `text` (str): Text. +- `meta` (dict): Metadata of the data instance, with keys: + - pile_set_name: Name of the subset. + +#### free_law + +- `text` (str): Text. - `meta` (dict): Metadata of the data instance, with keys: - - pile_set_name: Name of the subset. + - case_ID + - case_jurisdiction + - date_created + +#### pubmed_central + - `text` (str): Text. +- `id` (str): ID of the data instance. ### Data Splits diff --git a/datasets/the_pile/the_pile.py b/datasets/the_pile/the_pile.py index 839e375694e..978a2e00c5e 100644 --- a/datasets/the_pile/the_pile.py +++ b/datasets/the_pile/the_pile.py @@ -39,6 +39,7 @@ _LICENSES = { "all": "MIT License", + "free_law": "Unknown", "pubmed_central": "MIT License", } @@ -48,6 +49,7 @@ "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"], "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"], }, + "free_law": "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst", "pubmed_central": "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz", } @@ -58,10 +60,20 @@ "meta": {"pile_set_name": datasets.Value("string")}, } ), + "free_law": datasets.Features( + { + "text": datasets.Value("string"), + "meta": { + "case_ID": datasets.Value("string"), + "case_jurisdiction": datasets.Value("string"), + "date_created": datasets.Value("string"), + }, + } + ), "pubmed_central": datasets.Features( { - "id": datasets.Value("string"), "text": datasets.Value("string"), + "id": datasets.Value("string"), } ), } @@ -100,7 +112,7 @@ def _info(self): # This is the description that will appear on the datasets page. description=_DESCRIPTION, # This defines the different columns of the dataset and their types - features=_FEATURES[self.config.name], + features=_FEATURES.get(self.config.name), # If there's a common (input, target) tuple from the features, # specify them here. They'll be used if as_supervised=True in # builder.as_dataset. @@ -108,7 +120,7 @@ def _info(self): # Homepage of the dataset for documentation homepage=_HOMEPAGE, # License for the dataset if available - license=_LICENSES[self.config.name], + license=_LICENSES.get(self.config.name, "Multiple: see each subset license"), # Citation for the dataset citation=_CITATION, ) @@ -133,7 +145,12 @@ def _split_generators(self, dl_manager): datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ - "files": {subset: dl_manager.iter_archive(archive[subset]) for subset in self.config.subsets}, + "files": { + subset: dl_manager.iter_archive(archive[subset]) + if ".tar" in data_urls[subset] + else archive[subset] + for subset in self.config.subsets + }, }, ), ] @@ -152,7 +169,15 @@ def _generate_examples(self, files): key += 1 else: for subset in files: - if subset == "pubmed_central": + if subset == "free_law": + import zstandard as zstd + + with zstd.open(open(files[subset], "rb"), "rt", encoding="utf-8") as f: + for row in f: + data = json.loads(row) + yield key, data + key += 1 + elif subset == "pubmed_central": for path, file in files[subset]: id_ = path.split("/")[-1].split(".")[0] text = file.read().decode("utf-8")