Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions datasets/the_pile/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ datasets combined together.

### Languages

[More Information Needed]
This dataset is in English (`EN`).

## Dataset Structure

Expand All @@ -81,6 +81,18 @@ datasets combined together.
}
```

#### free_law
```
{
'meta': {
'case_jurisdiction': 'scotus.tar.gz',
'case_ID': '110921.json',
'date_created': '2010-04-28T17:12:49Z'
},
'text': '\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued...'
}
```

#### pubmed_central
```
{
Expand All @@ -93,9 +105,22 @@ datasets combined together.

#### all

- `text` (str): Text.
- `meta` (dict): Metadata of the data instance, with keys:
  - `pile_set_name` (str): Name of the subset the instance belongs to.

#### free_law

- `text` (str): Text.
- `meta` (dict): Metadata of the data instance, with keys:
  - `case_ID` (str): Identifier of the case file (e.g. `110921.json`).
  - `case_jurisdiction` (str): Jurisdiction archive the case comes from (e.g. `scotus.tar.gz`).
  - `date_created` (str): Creation timestamp of the record (ISO 8601, e.g. `2010-04-28T17:12:49Z`).

#### pubmed_central

- `text` (str): Text.
- `id` (str): ID of the data instance.

### Data Splits

Expand Down
35 changes: 30 additions & 5 deletions datasets/the_pile/the_pile.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

_LICENSES = {
"all": "MIT License",
"free_law": "Unknown",
"pubmed_central": "MIT License",
}

Expand All @@ -48,6 +49,7 @@
"validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"],
"test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"],
},
"free_law": "https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
"pubmed_central": "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz",
}

Expand All @@ -58,10 +60,20 @@
"meta": {"pile_set_name": datasets.Value("string")},
}
),
"free_law": datasets.Features(
{
"text": datasets.Value("string"),
"meta": {
"case_ID": datasets.Value("string"),
"case_jurisdiction": datasets.Value("string"),
"date_created": datasets.Value("string"),
},
}
),
"pubmed_central": datasets.Features(
{
"id": datasets.Value("string"),
"text": datasets.Value("string"),
"id": datasets.Value("string"),
}
),
}
Expand Down Expand Up @@ -100,15 +112,15 @@ def _info(self):
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# This defines the different columns of the dataset and their types
features=_FEATURES[self.config.name],
features=_FEATURES.get(self.config.name),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
# Homepage of the dataset for documentation
homepage=_HOMEPAGE,
# License for the dataset if available
license=_LICENSES[self.config.name],
license=_LICENSES.get(self.config.name, "Multiple: see each subset license"),
# Citation for the dataset
citation=_CITATION,
)
Expand All @@ -133,7 +145,12 @@ def _split_generators(self, dl_manager):
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"files": {subset: dl_manager.iter_archive(archive[subset]) for subset in self.config.subsets},
"files": {
subset: dl_manager.iter_archive(archive[subset])
if ".tar" in data_urls[subset]
else archive[subset]
for subset in self.config.subsets
},
},
),
]
Expand All @@ -152,7 +169,15 @@ def _generate_examples(self, files):
key += 1
else:
for subset in files:
if subset == "pubmed_central":
if subset == "free_law":
import zstandard as zstd

with zstd.open(open(files[subset], "rb"), "rt", encoding="utf-8") as f:
for row in f:
data = json.loads(row)
yield key, data
key += 1
elif subset == "pubmed_central":
for path, file in files[subset]:
id_ = path.split("/")[-1].split(".")[0]
text = file.read().decode("utf-8")
Expand Down