Merged
8 changes: 4 additions & 4 deletions datasets/hendrycks_test/README.md
@@ -51,17 +51,17 @@ pretty_name: HendrycksTest

## Dataset Description

[Measuring Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300) by [Dan Hendrycks](https://people.eecs.berkeley.edu/~hendrycks/), [Collin Burns](http://collinpburns.com), [Steven Basart](https://stevenbas.art), Andy Zou, Mantas Mazeika, [Dawn Song](https://people.eecs.berkeley.edu/~dawnsong/), and [Jacob Steinhardt](https://www.stat.berkeley.edu/~jsteinhardt/) (ICLR 2021).

- **Repository**: https://github.com/hendrycks/test
- **Paper**: https://arxiv.org/abs/2009.03300

A complete list of tasks: ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']

### Dataset Summary

[Measuring Massive Multitask Language Understanding](https://arxiv.org/pdf/2009.03300) by [Dan Hendrycks](https://people.eecs.berkeley.edu/~hendrycks/), [Collin Burns](http://collinpburns.com), [Steven Basart](https://stevenbas.art), Andy Zou, Mantas Mazeika, [Dawn Song](https://people.eecs.berkeley.edu/~dawnsong/), and [Jacob Steinhardt](https://www.stat.berkeley.edu/~jsteinhardt/) (ICLR 2021).

This is a massive multitask test consisting of multiple-choice questions from various branches of knowledge. The test spans subjects in the humanities, social sciences, hard sciences, and other areas that are important for some people to learn. This covers 57 tasks including elementary mathematics, US history, computer science, law, and more. To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability.

A complete list of tasks: ['abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics', 'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics', 'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics', 'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics', 'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history', 'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning', 'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition', 'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology', 'world_religions']

### Supported Tasks and Leaderboards

| Model | Authors | Humanities | Social Science | STEM | Other | Average |
2 changes: 1 addition & 1 deletion datasets/hendrycks_test/dataset_infos.json


60 changes: 18 additions & 42 deletions datasets/hendrycks_test/hendrycks_test.py
@@ -15,7 +15,6 @@


 import csv
-import os
 
 import datasets
 
@@ -109,7 +108,6 @@ class HendrycksTest(datasets.GeneratorBasedBuilder):
     ]
 
     def _info(self):
-        # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
         features = datasets.Features(
             {
                 "question": datasets.Value("string"),
@@ -118,75 +116,53 @@ def _info(self):
             }
         )
         return datasets.DatasetInfo(
-            # This is the description that will appear on the datasets page.
             description=_DESCRIPTION,
-            # This defines the different columns of the dataset and their types
-            features=features,  # Here we define them above because they are different between the two configurations
-            # If there's a common (input, target) tuple from the features,
-            # specify them here. They'll be used if as_supervised=True in
-            # builder.as_dataset.
             supervised_keys=None,
-            # Homepage of the dataset for documentation
+            features=features,
             homepage=_HOMEPAGE,
-            # Citation for the dataset
             citation=_CITATION,
         )

     def _split_generators(self, dl_manager):
         """Returns SplitGenerators."""
-        data_dir = dl_manager.download_and_extract(_URL)
+        archive = dl_manager.download(_URL)
         return [
             datasets.SplitGenerator(
                 name=datasets.Split("auxiliary_train"),
-                # These kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "datadir": os.path.join(data_dir, "data", "auxiliary_train"),
+                    "iter_archive": dl_manager.iter_archive(archive),
                     "split": "auxiliary_train",
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
-                # These kwargs will be passed to _generate_examples
-                gen_kwargs={"datadir": os.path.join(data_dir, "data", "test"), "split": "test"},
+                gen_kwargs={"iter_archive": dl_manager.iter_archive(archive), "split": "test"},
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.VALIDATION,
-                # These kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "datadir": os.path.join(data_dir, "data", "val"),
+                    "iter_archive": dl_manager.iter_archive(archive),
                     "split": "val",
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split("dev"),
-                # These kwargs will be passed to _generate_examples
                 gen_kwargs={
-                    "datadir": os.path.join(data_dir, "data", "dev"),
+                    "iter_archive": dl_manager.iter_archive(archive),
                     "split": "dev",
                 },
             ),
         ]
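
The point of this hunk is that `dl_manager.iter_archive` yields `(path, file_object)` pairs directly from the tarball without extracting it to disk, which is what makes the builder usable in streaming mode. A minimal sketch of that contract with an in-memory tar (the `iter_archive` helper and file names here are illustrative stand-ins, not the real archive):

```python
import io
import tarfile

def iter_archive(tar_bytes):
    # Mimic dl_manager.iter_archive: yield (path, file_object) pairs
    # for each regular file in a tar archive, without extracting to disk.
    with tarfile.open(fileobj=io.BytesIO(tar_bytes)) as tar:
        for member in tar:
            if member.isfile():
                yield member.name, tar.extractfile(member)

# Build a tiny archive shaped like data/<split>/<config>_<split>.csv
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode="w") as tar:
    payload = b"What is 2+2?,1,2,3,4,D\n"
    info = tarfile.TarInfo("data/test/abstract_algebra_test.csv")
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))

paths = [path for path, f in iter_archive(buf.getvalue())]
# paths == ["data/test/abstract_algebra_test.csv"]
```

Because the archive is only ever read sequentially, the same generator works whether the bytes come from a local file or a remote HTTP stream.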

-    def _generate_examples(self, datadir, split):
+    def _generate_examples(self, iter_archive, split):
         """Yields examples as (key, example) tuples."""
-        # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
-        # The `key` is here for legacy reason (tfds) and is not important in itself.
-
-        id_ = 0
-        if split == "auxiliary_train":
-            for f in sorted(os.listdir(datadir)):
-                reader = csv.reader(
-                    open(os.path.join(datadir, f), "r", encoding="utf-8"), quotechar='"', delimiter=","
-                )
-                for data in reader:
-                    yield id_, {"question": data[0], "choices": data[1:5], "answer": data[5]}
-                    id_ += 1
-        else:
-            reader = csv.reader(
-                open(os.path.join(datadir, f"{self.config.name}_{split}.csv"), "r", encoding="utf-8"),
-                quotechar='"',
-                delimiter=",",
-            )
-            for data in reader:
-                yield id_, {"question": data[0], "choices": data[1:5], "answer": data[5]}
-                id_ += 1
+        n_yielded_files = 0
+        for id_file, (path, file) in enumerate(iter_archive):
+            if f"data/{split}/" in path:
+                if split == "auxiliary_train" or f"{self.config.name}_{split}.csv" in path:
+                    n_yielded_files += 1
+                    lines = (line.decode("utf-8") for line in file)
+                    reader = csv.reader(lines)
+                    for id_line, data in enumerate(reader):
+                        yield f"{id_file}_{id_line}", {"question": data[0], "choices": data[1:5], "answer": data[5]}
+                    if n_yielded_files == 8 or split != "auxiliary_train":
+                        break
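
The new `_generate_examples` no longer opens files by path; it filters the `(path, file_object)` stream by directory and config name, decodes the raw bytes line by line, and derives stable keys from the file and row indices. A self-contained sketch of that filter-and-parse logic on synthetic data (the `generate_examples` function and sample rows below are illustrative, not part of the dataset script):

```python
import csv
import io

def generate_examples(iter_archive, split, config_name):
    # Mirror of the streaming logic for a regular split: keep files under
    # data/<split>/ that match this config's CSV, then yield (key, example).
    for id_file, (path, file) in enumerate(iter_archive):
        if f"data/{split}/" in path and f"{config_name}_{split}.csv" in path:
            lines = (line.decode("utf-8") for line in file)
            for id_line, data in enumerate(csv.reader(lines)):
                yield f"{id_file}_{id_line}", {
                    "question": data[0],
                    "choices": data[1:5],
                    "answer": data[5],
                }

fake_archive = [
    ("data/test/anatomy_test.csv", io.BytesIO(b'"Q1?",A,B,C,D,B\n')),
    ("data/test/virology_test.csv", io.BytesIO(b'"Q2?",A,B,C,D,C\n')),
]
examples = dict(generate_examples(fake_archive, "test", "anatomy"))
# examples == {"0_0": {"question": "Q1?", "choices": ["A", "B", "C", "D"], "answer": "B"}}
```

The `n_yielded_files == 8` early exit in the real code is an optimization for `auxiliary_train`, whose eight CSVs all sit in one directory: once they have all been seen, the rest of the archive can be skipped.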