huggingface
diff --git a/datasets/multi_woz_v22/dataset_infos.json b/datasets/multi_woz_v22/dataset_infos.json
Lines changed: 1 addition & 1 deletion
diff --git a/datasets/multi_woz_v22/multi_woz_v22.py b/datasets/multi_woz_v22/multi_woz_v22.py
Lines changed: 3 additions & 3 deletions
diff --git a/datasets/newsgroup/README.md b/datasets/newsgroup/README.md
Lines changed: 15 additions & 15 deletions
diff --git a/datasets/newsgroup/dataset_infos.json b/datasets/newsgroup/dataset_infos.json
Lines changed: 1 addition & 1 deletion
diff --git a/datasets/newsgroup/newsgroup.py b/datasets/newsgroup/newsgroup.py
Lines changed: 21 additions & 21 deletions
diff --git a/datasets/pn_summary/dataset_infos.json b/datasets/pn_summary/dataset_infos.json
Lines changed: 1 addition & 1 deletion
diff --git a/datasets/pn_summary/pn_summary.py b/datasets/pn_summary/pn_summary.py
Lines changed: 2 additions & 2 deletions
diff --git a/datasets/qa4mre/README.md b/datasets/qa4mre/README.md
Lines changed: 15 additions & 15 deletions
diff --git a/datasets/qa4mre/dataset_infos.json b/datasets/qa4mre/dataset_infos.json
Lines changed: 1 addition & 1 deletion
@@ -89,15 +89,15 @@ class MultiWozV22(datasets.GeneratorBasedBuilder):
     VERSION = datasets.Version("2.2.0")
 
     BUILDER_CONFIGS = [
-        datasets.BuilderConfig(name="v2.2", version=datasets.Version("2.2.0"), description="MultiWOZ v2.2"),
+        datasets.BuilderConfig(name="v2_2", version=datasets.Version("2.2.0"), description="MultiWOZ v2.2"),
         datasets.BuilderConfig(
-            name="v2.2_active_only",
+            name="v2_2_active_only",
             version=datasets.Version("2.2.0"),
             description="MultiWOZ v2.2, only keeps around frames with an active intent",
         ),
     ]
 
-    DEFAULT_CONFIG_NAME = "v2.2_active_only"
+    DEFAULT_CONFIG_NAME = "v2_2_active_only"
 
     def _info(self):
         features = datasets.Features(
 
@@ -62,7 +62,7 @@ does not include cross-posts and includes only the "From" and "Subject" headers.
 
 ### Data Instances
 
-#### 18828_alt.atheism
+#### 18828_alt_atheism
 
 - **Size of downloaded dataset files:** 13.99 MB
 - **Size of the generated dataset:** 1.59 MB
@@ -73,7 +73,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 18828_comp.graphics
+#### 18828_comp_graphics
 
 - **Size of downloaded dataset files:** 13.99 MB
 - **Size of the generated dataset:** 1.58 MB
@@ -84,7 +84,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 18828_comp.os.ms-windows.misc
+#### 18828_comp_os_ms-windows_misc
 
 - **Size of downloaded dataset files:** 13.99 MB
 - **Size of the generated dataset:** 2.27 MB
@@ -95,7 +95,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 18828_comp.sys.ibm.pc.hardware
+#### 18828_comp_sys_ibm_pc_hardware
 
 - **Size of downloaded dataset files:** 13.99 MB
 - **Size of the generated dataset:** 1.13 MB
@@ -106,7 +106,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 18828_comp.sys.mac.hardware
+#### 18828_comp_sys_mac_hardware
 
 - **Size of downloaded dataset files:** 13.99 MB
 - **Size of the generated dataset:** 1.01 MB
@@ -121,30 +121,30 @@ An example of 'train' looks as follows.
 
 The data fields are the same among all splits.
 
-#### 18828_alt.atheism
+#### 18828_alt_atheism
 - `text`: a `string` feature.
 
-#### 18828_comp.graphics
+#### 18828_comp_graphics
 - `text`: a `string` feature.
 
-#### 18828_comp.os.ms-windows.misc
+#### 18828_comp_os_ms-windows_misc
 - `text`: a `string` feature.
 
-#### 18828_comp.sys.ibm.pc.hardware
+#### 18828_comp_sys_ibm_pc_hardware
 - `text`: a `string` feature.
 
-#### 18828_comp.sys.mac.hardware
+#### 18828_comp_sys_mac_hardware
 - `text`: a `string` feature.
 
 ### Data Splits
 
 |             name             |train|
 |------------------------------|----:|
-|18828_alt.atheism             |  799|
-|18828_comp.graphics           |  973|
-|18828_comp.os.ms-windows.misc |  985|
-|18828_comp.sys.ibm.pc.hardware|  982|
-|18828_comp.sys.mac.hardware   |  961|
+|18828_alt_atheism             |  799|
+|18828_comp_graphics           |  973|
+|18828_comp_os_ms-windows_misc |  985|
+|18828_comp_sys_ibm_pc_hardware|  982|
+|18828_comp_sys_mac_hardware   |  961|
 
 ## Dataset Creation
 
 
@@ -42,26 +42,26 @@
     "18828": "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz",
 }
 _NEWS_GROUPS = [
-    "comp.graphics",
-    "comp.os.ms-windows.misc",
-    "comp.sys.ibm.pc.hardware",
-    "comp.sys.mac.hardware",
-    "comp.windows.x",
-    "rec.autos",
-    "rec.motorcycles",
-    "rec.sport.baseball",
-    "rec.sport.hockey",
-    "sci.crypt",
-    "sci.electronics",
-    "sci.med",
-    "sci.space",
-    "misc.forsale",
-    "talk.politics.misc",
-    "talk.politics.guns",
-    "talk.politics.mideast",
-    "talk.religion.misc",
-    "alt.atheism",
-    "soc.religion.christian",
+    "comp_graphics",
+    "comp_os_ms-windows_misc",
+    "comp_sys_ibm_pc_hardware",
+    "comp_sys_mac_hardware",
+    "comp_windows_x",
+    "rec_autos",
+    "rec_motorcycles",
+    "rec_sport_baseball",
+    "rec_sport_hockey",
+    "sci_crypt",
+    "sci_electronics",
+    "sci_med",
+    "sci_space",
+    "misc_forsale",
+    "talk_politics_misc",
+    "talk_politics_guns",
+    "talk_politics_mideast",
+    "talk_religion_misc",
+    "alt_atheism",
+    "soc_religion_christian",
 ]
 _VERSIONS = {"19997": "1.0.0", "bydate": "2.0.0", "18828": "3.0.0"}
 
@@ -99,7 +99,7 @@ class Newsgroups(datasets.GeneratorBasedBuilder):
         NewsgroupConfig(
             name=name,
             description=_DESC[name.split("_")[0]],
-            sub_dir=name.split("_")[1],
+            sub_dir=name.split("_", 1)[1].replace("_", "."),
             version=datasets.Version(_VERSIONS[name.split("_")[0]]),
         )
         for name in _CONFIG_NAMES
 
@@ -1 +1 @@
-{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
+{"1_0_0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1_0_0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
@@ -40,7 +40,7 @@
 _LICENSE = "MIT License"
 
 _URLs = {
-    "1.0.0": {
+    "1_0_0": {
         "data": "https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download",
         "features": [
             {"name": "id", "type": datasets.Value("string")},
@@ -97,7 +97,7 @@ class PnSummary(datasets.GeneratorBasedBuilder):
 
     BUILDER_CONFIGS = [
         PnSummaryConfig(
-            name="1.0.0", version=datasets.Version("1.0.0"), description="The first version of pn_summary"
+            name="1_0_0", version=datasets.Version("1.0.0"), description="The first version of pn_summary"
         ),
     ]
 
 
@@ -60,7 +60,7 @@ alzheimers data, and the other on entrance exams data.
 
 ### Data Instances
 
-#### 2011.main.DE
+#### 2011_main_DE
 
 - **Size of downloaded dataset files:** 0.21 MB
 - **Size of the generated dataset:** 1.67 MB
@@ -71,7 +71,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 2011.main.EN
+#### 2011_main_EN
 
 - **Size of downloaded dataset files:** 0.19 MB
 - **Size of the generated dataset:** 1.50 MB
@@ -82,7 +82,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 2011.main.ES
+#### 2011_main_ES
 
 - **Size of downloaded dataset files:** 0.21 MB
 - **Size of the generated dataset:** 1.62 MB
@@ -93,7 +93,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 2011.main.IT
+#### 2011_main_IT
 
 - **Size of downloaded dataset files:** 0.20 MB
 - **Size of the generated dataset:** 1.59 MB
@@ -104,7 +104,7 @@ An example of 'train' looks as follows.
 
 ```
 
-#### 2011.main.RO
+#### 2011_main_RO
 
 - **Size of downloaded dataset files:** 0.21 MB
 - **Size of the generated dataset:** 1.66 MB
@@ -119,7 +119,7 @@ An example of 'train' looks as follows.
 
 The data fields are the same among all splits.
 
-#### 2011.main.DE
+#### 2011_main_DE
 - `topic_id`: a `string` feature.
 - `topic_name`: a `string` feature.
 - `test_id`: a `string` feature.
@@ -133,7 +133,7 @@ The data fields are the same among all splits.
 - `correct_answer_id`: a `string` feature.
 - `correct_answer_str`: a `string` feature.
 
-#### 2011.main.EN
+#### 2011_main_EN
 - `topic_id`: a `string` feature.
 - `topic_name`: a `string` feature.
 - `test_id`: a `string` feature.
@@ -147,7 +147,7 @@ The data fields are the same among all splits.
 - `correct_answer_id`: a `string` feature.
 - `correct_answer_str`: a `string` feature.
 
-#### 2011.main.ES
+#### 2011_main_ES
 - `topic_id`: a `string` feature.
 - `topic_name`: a `string` feature.
 - `test_id`: a `string` feature.
@@ -161,7 +161,7 @@ The data fields are the same among all splits.
 - `correct_answer_id`: a `string` feature.
 - `correct_answer_str`: a `string` feature.
 
-#### 2011.main.IT
+#### 2011_main_IT
 - `topic_id`: a `string` feature.
 - `topic_name`: a `string` feature.
 - `test_id`: a `string` feature.
@@ -175,7 +175,7 @@ The data fields are the same among all splits.
 - `correct_answer_id`: a `string` feature.
 - `correct_answer_str`: a `string` feature.
 
-#### 2011.main.RO
+#### 2011_main_RO
 - `topic_id`: a `string` feature.
 - `topic_name`: a `string` feature.
 - `test_id`: a `string` feature.
@@ -193,11 +193,11 @@ The data fields are the same among all splits.
 
 |    name    |train|
 |------------|----:|
-|2011.main.DE|  120|
-|2011.main.EN|  120|
-|2011.main.ES|  120|
-|2011.main.IT|  120|
-|2011.main.RO|  120|
+|2011_main_DE|  120|
+|2011_main_EN|  120|
+|2011_main_ES|  120|
+|2011_main_IT|  120|
+|2011_main_RO|  120|
 
 ## Dataset Creation
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"1.0.0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1.0.0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": 
{"name": "train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}
	`1`	+{"1_0_0": {"description": "A well-structured summarization dataset for the Persian language consists of 93,207 records. It is prepared for Abstractive/Extractive tasks (like cnn_dailymail for English). It can also be used in other scopes like Text Generation, Title Generation, and News Category Classification.\nIt is imperative to consider that the newlines were replaced with the `[n]` symbol. Please interpret them into normal newlines (for ex. `t.replace(\"[n]\", \"\n\")`) and then use them for your purposes.\n", "citation": "@article{pnSummary, title={Leveraging ParsBERT and Pretrained mT5 for Persian Abstractive Text Summarization},\nauthor={Mehrdad Farahani, Mohammad Gharachorloo, Mohammad Manthouri},\nyear={2020},\neprint={2012.11204},\narchivePrefix={arXiv},\nprimaryClass={cs.CL}\n}\n", "homepage": "https://github.com/hooshvare/pn-summary", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "summary": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"num_classes": 18, "names": ["Economy", "Roads-Urban", "Banking-Insurance", "Agriculture", "International", "Oil-Energy", "Industry", "Transportation", "Science-Technology", "Local", "Sports", "Politics", "Art-Culture", "Society", "Health", "Research", "Education-University", "Tourism"], "names_file": null, "id": null, "_type": "ClassLabel"}, "categories": {"dtype": "string", "id": null, "_type": "Value"}, "network": {"num_classes": 6, "names": ["Tahlilbazaar", "Imna", "Shana", "Mehr", "Irna", "Khabaronline"], "names_file": null, "id": null, "_type": "ClassLabel"}, "link": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "pn_summary", "config_name": "1_0_0", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": 
{"name": "train", "num_bytes": 309436709, "num_examples": 82022, "dataset_name": "pn_summary"}, "validation": {"name": "validation", "num_bytes": 21311841, "num_examples": 5592, "dataset_name": "pn_summary"}, "test": {"name": "test", "num_bytes": 20936844, "num_examples": 5593, "dataset_name": "pn_summary"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=16OgJ_OrfzUF_i3ftLjFn9kpcyoi7UJeO&export=download": {"num_bytes": 89591141, "checksum": "49aa6a5fdb11244714f9bbe69517f2079ab934c9c565e272a977fbd8d2d404f7"}}, "download_size": 89591141, "post_processing_size": null, "dataset_size": 351685394, "size_in_bytes": 441276535}}