Skip to content

Commit 81d4b9c

Browse files
committed
more configs
1 parent 52bddcd commit 81d4b9c

30 files changed

+146
-144
lines changed

datasets/multi_woz_v22/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

datasets/multi_woz_v22/multi_woz_v22.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,15 +89,15 @@ class MultiWozV22(datasets.GeneratorBasedBuilder):
8989
VERSION = datasets.Version("2.2.0")
9090

9191
BUILDER_CONFIGS = [
92-
datasets.BuilderConfig(name="v2.2", version=datasets.Version("2.2.0"), description="MultiWOZ v2.2"),
92+
datasets.BuilderConfig(name="v2_2", version=datasets.Version("2.2.0"), description="MultiWOZ v2.2"),
9393
datasets.BuilderConfig(
94-
name="v2.2_active_only",
94+
name="v2_2_active_only",
9595
version=datasets.Version("2.2.0"),
9696
description="MultiWOZ v2.2, only keeps around frames with an active intent",
9797
),
9898
]
9999

100-
DEFAULT_CONFIG_NAME = "v2.2_active_only"
100+
DEFAULT_CONFIG_NAME = "v2_2_active_only"
101101

102102
def _info(self):
103103
features = datasets.Features(

datasets/newsgroup/README.md

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ does not include cross-posts and includes only the "From" and "Subject" headers.
6262

6363
### Data Instances
6464

65-
#### 18828_alt.atheism
65+
#### 18828_alt_atheism
6666

6767
- **Size of downloaded dataset files:** 13.99 MB
6868
- **Size of the generated dataset:** 1.59 MB
@@ -73,7 +73,7 @@ An example of 'train' looks as follows.
7373
7474
```
7575

76-
#### 18828_comp.graphics
76+
#### 18828_comp_graphics
7777

7878
- **Size of downloaded dataset files:** 13.99 MB
7979
- **Size of the generated dataset:** 1.58 MB
@@ -84,7 +84,7 @@ An example of 'train' looks as follows.
8484
8585
```
8686

87-
#### 18828_comp.os.ms-windows.misc
87+
#### 18828_comp_os_ms-windows_misc
8888

8989
- **Size of downloaded dataset files:** 13.99 MB
9090
- **Size of the generated dataset:** 2.27 MB
@@ -95,7 +95,7 @@ An example of 'train' looks as follows.
9595
9696
```
9797

98-
#### 18828_comp.sys.ibm.pc.hardware
98+
#### 18828_comp_sys_ibm_pc_hardware
9999

100100
- **Size of downloaded dataset files:** 13.99 MB
101101
- **Size of the generated dataset:** 1.13 MB
@@ -106,7 +106,7 @@ An example of 'train' looks as follows.
106106
107107
```
108108

109-
#### 18828_comp.sys.mac.hardware
109+
#### 18828_comp_sys_mac_hardware
110110

111111
- **Size of downloaded dataset files:** 13.99 MB
112112
- **Size of the generated dataset:** 1.01 MB
@@ -121,30 +121,30 @@ An example of 'train' looks as follows.
121121

122122
The data fields are the same among all splits.
123123

124-
#### 18828_alt.atheism
124+
#### 18828_alt_atheism
125125
- `text`: a `string` feature.
126126

127-
#### 18828_comp.graphics
127+
#### 18828_comp_graphics
128128
- `text`: a `string` feature.
129129

130-
#### 18828_comp.os.ms-windows.misc
130+
#### 18828_comp_os_ms-windows_misc
131131
- `text`: a `string` feature.
132132

133-
#### 18828_comp.sys.ibm.pc.hardware
133+
#### 18828_comp_sys_ibm_pc_hardware
134134
- `text`: a `string` feature.
135135

136-
#### 18828_comp.sys.mac.hardware
136+
#### 18828_comp_sys_mac_hardware
137137
- `text`: a `string` feature.
138138

139139
### Data Splits
140140

141141
| name |train|
142142
|------------------------------|----:|
143-
|18828_alt.atheism | 799|
144-
|18828_comp.graphics | 973|
145-
|18828_comp.os.ms-windows.misc | 985|
146-
|18828_comp.sys.ibm.pc.hardware| 982|
147-
|18828_comp.sys.mac.hardware | 961|
143+
|18828_alt_atheism | 799|
144+
|18828_comp_graphics | 973|
145+
|18828_comp_os_ms-windows_misc | 985|
146+
|18828_comp_sys_ibm_pc_hardware| 982|
147+
|18828_comp_sys_mac_hardware | 961|
148148

149149
## Dataset Creation
150150

datasets/newsgroup/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

datasets/newsgroup/newsgroup.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,26 +42,26 @@
4242
"18828": "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz",
4343
}
4444
_NEWS_GROUPS = [
45-
"comp.graphics",
46-
"comp.os.ms-windows.misc",
47-
"comp.sys.ibm.pc.hardware",
48-
"comp.sys.mac.hardware",
49-
"comp.windows.x",
50-
"rec.autos",
51-
"rec.motorcycles",
52-
"rec.sport.baseball",
53-
"rec.sport.hockey",
54-
"sci.crypt",
55-
"sci.electronics",
56-
"sci.med",
57-
"sci.space",
58-
"misc.forsale",
59-
"talk.politics.misc",
60-
"talk.politics.guns",
61-
"talk.politics.mideast",
62-
"talk.religion.misc",
63-
"alt.atheism",
64-
"soc.religion.christian",
45+
"comp_graphics",
46+
"comp_os_ms-windows_misc",
47+
"comp_sys_ibm_pc_hardware",
48+
"comp_sys_mac_hardware",
49+
"comp_windows_x",
50+
"rec_autos",
51+
"rec_motorcycles",
52+
"rec_sport_baseball",
53+
"rec_sport_hockey",
54+
"sci_crypt",
55+
"sci_electronics",
56+
"sci_med",
57+
"sci_space",
58+
"misc_forsale",
59+
"talk_politics_misc",
60+
"talk_politics_guns",
61+
"talk_politics_mideast",
62+
"talk_religion_misc",
63+
"alt_atheism",
64+
"soc_religion_christian",
6565
]
6666
_VERSIONS = {"19997": "1.0.0", "bydate": "2.0.0", "18828": "3.0.0"}
6767

@@ -99,7 +99,7 @@ class Newsgroups(datasets.GeneratorBasedBuilder):
9999
NewsgroupConfig(
100100
name=name,
101101
description=_DESC[name.split("_")[0]],
102-
sub_dir=name.split("_")[1],
102+
sub_dir=name.split("_", 1)[1].replace("_", "."),
103103
version=datasets.Version(_VERSIONS[name.split("_")[0]]),
104104
)
105105
for name in _CONFIG_NAMES

datasets/pn_summary/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
1+
{"1_0_0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1_0_0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}

datasets/pn_summary/pn_summary.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
_LICENSE = "MIT License"
4141

4242
_URLs = {
43-
"1.0.0": {
43+
"1_0_0": {
4444
"data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download",
4545
"features": [
4646
{"name": "id", "type": datasets.Value("string")},
@@ -97,7 +97,7 @@ class PnSummary(datasets.GeneratorBasedBuilder):
9797

9898
BUILDER_CONFIGS = [
9999
PnSummaryConfig(
100-
name="1.0.0", version=datasets.Version("1.0.0"), description="The first version of pn_summary"
100+
name="1_0_0", version=datasets.Version("1.0.0"), description="The first version of pn_summary"
101101
),
102102
]
103103

datasets/qa4mre/README.md

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ alzheimers data, and the other on entrance exams data.
6060

6161
### Data Instances
6262

63-
#### 2011.main.DE
63+
#### 2011_main_DE
6464

6565
- **Size of downloaded dataset files:** 0.21 MB
6666
- **Size of the generated dataset:** 1.67 MB
@@ -71,7 +71,7 @@ An example of 'train' looks as follows.
7171
7272
```
7373

74-
#### 2011.main.EN
74+
#### 2011_main_EN
7575

7676
- **Size of downloaded dataset files:** 0.19 MB
7777
- **Size of the generated dataset:** 1.50 MB
@@ -82,7 +82,7 @@ An example of 'train' looks as follows.
8282
8383
```
8484

85-
#### 2011.main.ES
85+
#### 2011_main_ES
8686

8787
- **Size of downloaded dataset files:** 0.21 MB
8888
- **Size of the generated dataset:** 1.62 MB
@@ -93,7 +93,7 @@ An example of 'train' looks as follows.
9393
9494
```
9595

96-
#### 2011.main.IT
96+
#### 2011_main_IT
9797

9898
- **Size of downloaded dataset files:** 0.20 MB
9999
- **Size of the generated dataset:** 1.59 MB
@@ -104,7 +104,7 @@ An example of 'train' looks as follows.
104104
105105
```
106106

107-
#### 2011.main.RO
107+
#### 2011_main_RO
108108

109109
- **Size of downloaded dataset files:** 0.21 MB
110110
- **Size of the generated dataset:** 1.66 MB
@@ -119,7 +119,7 @@ An example of 'train' looks as follows.
119119

120120
The data fields are the same among all splits.
121121

122-
#### 2011.main.DE
122+
#### 2011_main_DE
123123
- `topic_id`: a `string` feature.
124124
- `topic_name`: a `string` feature.
125125
- `test_id`: a `string` feature.
@@ -133,7 +133,7 @@ The data fields are the same among all splits.
133133
- `correct_answer_id`: a `string` feature.
134134
- `correct_answer_str`: a `string` feature.
135135

136-
#### 2011.main.EN
136+
#### 2011_main_EN
137137
- `topic_id`: a `string` feature.
138138
- `topic_name`: a `string` feature.
139139
- `test_id`: a `string` feature.
@@ -147,7 +147,7 @@ The data fields are the same among all splits.
147147
- `correct_answer_id`: a `string` feature.
148148
- `correct_answer_str`: a `string` feature.
149149

150-
#### 2011.main.ES
150+
#### 2011_main_ES
151151
- `topic_id`: a `string` feature.
152152
- `topic_name`: a `string` feature.
153153
- `test_id`: a `string` feature.
@@ -161,7 +161,7 @@ The data fields are the same among all splits.
161161
- `correct_answer_id`: a `string` feature.
162162
- `correct_answer_str`: a `string` feature.
163163

164-
#### 2011.main.IT
164+
#### 2011_main_IT
165165
- `topic_id`: a `string` feature.
166166
- `topic_name`: a `string` feature.
167167
- `test_id`: a `string` feature.
@@ -175,7 +175,7 @@ The data fields are the same among all splits.
175175
- `correct_answer_id`: a `string` feature.
176176
- `correct_answer_str`: a `string` feature.
177177

178-
#### 2011.main.RO
178+
#### 2011_main_RO
179179
- `topic_id`: a `string` feature.
180180
- `topic_name`: a `string` feature.
181181
- `test_id`: a `string` feature.
@@ -193,11 +193,11 @@ The data fields are the same among all splits.
193193

194194
| name |train|
195195
|------------|----:|
196-
|2011.main.DE| 120|
197-
|2011.main.EN| 120|
198-
|2011.main.ES| 120|
199-
|2011.main.IT| 120|
200-
|2011.main.RO| 120|
196+
|2011_main_DE| 120|
197+
|2011_main_EN| 120|
198+
|2011_main_ES| 120|
199+
|2011_main_IT| 120|
200+
|2011_main_RO| 120|
201201

202202
## Dataset Creation
203203

datasets/qa4mre/dataset_infos.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)