Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/banking77/banking77.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from datasets.tasks import TextClassification


_CITATION = """\
_CITATION = r"""\
@inproceedings{Casanueva2020,
author = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic},
title = {Efficient Intent Detection with Dual Sentence Encoders},
Expand Down
9 changes: 4 additions & 5 deletions datasets/universal_dependencies/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@ languages:
- pcm
- pl
- pt
- qhe
- qtd
- ro
- ru
- sa
Expand Down Expand Up @@ -117,10 +115,11 @@ size_categories:
source_datasets:
- original
task_categories:
- other
- token-classification
task_ids:
- constituency-parsing
- dependency-parsing
- parsing
- token-classification-other-constituency-parsing
- token-classification-other-dependency-parsing
paperswithcode_id: universal-dependencies
pretty_name: Universal Dependencies Treebank
configs:
Expand Down
2 changes: 1 addition & 1 deletion datasets/universal_dependencies/universal_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import datasets


_CITATION = """\
_CITATION = r"""\
@misc{11234/1-3424,
title = {Universal Dependencies 2.7},
author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\"e}mi and Aghaei, Hamid and Agi{\'c}, {\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\'o}ttir, {\t H}{\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Bjarnad{\'o}ttir, Krist{\'{\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\'a}rd and Fernanda, Mar{\'{\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdosov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\'{\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra,
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def _get_output_signature(
else:
np_arrays.append(np.array(array))

if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool:
if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.dtype("bool"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The warning message suggests to use bool instead.

Suggested change
if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.dtype("bool"):
if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, and for object as well

tf_dtype = tf.int64
np_dtype = np.int64
elif np.issubdtype(np_arrays[0].dtype, np.number):
Expand Down Expand Up @@ -3804,7 +3804,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature":
return _float_feature([values.item()])
elif np.issubdtype(values.dtype, np.integer):
return _int64_feature([values.item()])
elif np.issubdtype(values.dtype, np.str):
elif np.issubdtype(values.dtype, str):
return _bytes_feature([values.item().encode()])
else:
raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized")
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/commands/datasets_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datasets.commands.dummy_data import DummyDataCommand
from datasets.commands.env import EnvironmentCommand
from datasets.commands.run_beam import RunBeamCommand
from datasets.commands.test import TestCommand
from datasets.commands.test import CLITestCommand
Copy link
Member

@albertvillanova albertvillanova Jun 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are preceding with CLI only the TestCommand; i.e. the rest of the command names are not preceded by CLI.

IMHO, either all or none should be preceded by CLI.

On the other hand, from your commit message, I guess you introduce this modification to prevent pytest default test discovery to identify this class as a test. If this is the reason, I would strongly suggest using testpaths instead: https://docs.pytest.org/en/latest/reference/reference.html#confval-testpaths

  • We tell pytest to search for tests only inside the tests directory
  • Moreover, tests discovery/collection will be faster

I just realized pytest is called with arg ./tests... I did not consider imported classes as possible test candidates, thus I thought pytest was called without arguments and it was searching inside src to find TestCommand. Sorry for the confusion.

Copy link
Member

@albertvillanova albertvillanova Jun 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest suggests adding the class attribute __test__ = False to TestCommand, but I am not sure if this is better... :/

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went for __test__ = False since it's simpler

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please note there are still some CLI... leftovers you should fix before merging.

from datasets.utils.logging import set_verbosity_info


Expand All @@ -23,7 +23,7 @@ def main():
# Register commands
ConvertCommand.register_subcommand(commands_parser)
EnvironmentCommand.register_subcommand(commands_parser)
TestCommand.register_subcommand(commands_parser)
CLITestCommand.register_subcommand(commands_parser)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above...

RunBeamCommand.register_subcommand(commands_parser)
DummyDataCommand.register_subcommand(commands_parser)

Expand Down
8 changes: 4 additions & 4 deletions src/datasets/commands/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
logger = get_logger(__name__)


def test_command_factory(args):
return TestCommand(
def cli_test_command_factory(args):
return CLITestCommand(
args.dataset,
args.name,
args.cache_dir,
Expand All @@ -30,7 +30,7 @@ def test_command_factory(args):
)


class TestCommand(BaseDatasetsCLICommand):
class CLITestCommand(BaseDatasetsCLICommand):
@staticmethod
def register_subcommand(parser: ArgumentParser):
test_parser = parser.add_parser("test", help="Test dataset implementation.")
Expand Down Expand Up @@ -59,7 +59,7 @@ def register_subcommand(parser: ArgumentParser):
help="Remove downloaded files and cached datasets after each config test",
)
test_parser.add_argument("dataset", type=str, help="Name of the dataset to download")
test_parser.set_defaults(func=test_command_factory)
test_parser.set_defaults(func=cli_test_command_factory)

def __init__(
self,
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/formatting/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
if len(array) > 0:
if any(
(isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
(isinstance(x, np.ndarray) and (x.dtype == np.dtype("object") or x.shape != array[0].shape))
or (isinstance(x, float) and np.isnan(x))
for x in array
):
return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.dtype("object")})
return np.array(array, copy=False, **self.np_array_kwargs)


Expand Down
2 changes: 1 addition & 1 deletion src/datasets/formatting/jax_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
data_struct = np.array(data_struct, copy=False)
if data_struct.dtype == np.object: # jax arrays cannot be instantiated from an array of objects
if data_struct.dtype == np.dtype("object"): # jax arrays cannot be instantiated from an array of objects
return [self.recursive_tensorize(substruct) for substruct in data_struct]
return self._tensorize(data_struct)

Expand Down
4 changes: 2 additions & 2 deletions src/datasets/formatting/tf_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def _tensorize(self, value):
def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
if (
data_struct.dtype == np.object
if data_struct.dtype == np.dtype(
"object"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above.

Suggested change
if data_struct.dtype == np.dtype(
"object"
if (
data_struct.dtype == object

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not resolved yet.

): # tensorflow tensors can sometimes be instantiated from an array of objects
try:
return self._tensorize(data_struct)
Expand Down
4 changes: 3 additions & 1 deletion src/datasets/formatting/torch_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
data_struct = np.array(data_struct, copy=False)
if data_struct.dtype == np.object: # pytorch tensors cannot be instantied from an array of objects
if data_struct.dtype == np.dtype(
"object"
): # pytorch tensors cannot be instantiated from an array of objects
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same.

Suggested change
if data_struct.dtype == np.dtype(
"object"
): # pytorch tensors cannot be instantiated from an array of objects
if data_struct.dtype == object: # pytorch tensors cannot be instantiated from an array of objects

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not resolved yet.

return [self.recursive_tensorize(substruct) for substruct in data_struct]
return self._tensorize(data_struct)

Expand Down
22 changes: 21 additions & 1 deletion src/datasets/utils/resources/languages.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
"agr": "Aguaruna",
"aii": "Assyrian Neo-Aramaic",
"ajg": "Aja (Benin)",
"ajp": "South Levantine Arabic",
"ak": "Akan",
"akk": "Akkadian",
"ak-GH": "Akan (Ghana)",
"als": "Tosk Albanian",
"alt": "Southern Altai",
Expand All @@ -25,6 +27,8 @@
"amr": "Amarakaeri",
"an": "Aragonese",
"ang": "English, Old (ca.450-1100)",
"apu": "Apurinã",
"aqz": "Akuntsu",
"ar": "Arabic",
"ar-001": "Arabic (World)",
"ar-AE": "Arabic (United Arab Emirates)",
Expand Down Expand Up @@ -175,13 +179,15 @@
"ckb-IQ": "Central Kurdish (Iraq)",
"ckb-IR": "Central Kurdish (Iran)",
"ckb-Latn": "Central Kurdish (Latin)",
"ckt": "Chukot",
"cmn": "Mandarin Chinese",
"cnh": "Hakha Chin",
"cni": "Ashaninka",
"cnr": "Montenegrin",
"co": "Corsican",
"code": "Programming language (C++, Java, Javascript, Python, etc.)",
"cof": "Colorado",
"cop": "Coptic",
"cot": "Caquinte",
"cpu": "Pichis Ashéninka",
"cr": "Cree",
Expand Down Expand Up @@ -487,6 +493,7 @@
"fr-WF": "French (Wallis & Futuna)",
"fr-YT": "French (Mayotte)",
"frm": "Middle French (ca. 1400-1600)",
"fro": "Old French (842-ca. 1400)",
"frp": "Franco-Provençal",
"frr": "Northern Frisian",
"fuf": "Pular",
Expand Down Expand Up @@ -526,6 +533,7 @@
"gu-IN": "Gujarati (India)",
"guc": "Wayuu",
"gug": "Paraguayan Guaraní",
"gun": "Mbyá Guaraní",
"guu": "Yanomamö",
"guz": "Gusii",
"guz-KE": "Gusii (Kenya)",
Expand Down Expand Up @@ -626,6 +634,7 @@
"kea": "Kabuverdianu",
"kea-CV": "Kabuverdianu (Cape Verde)",
"kek": "Kekchí",
"kfm": "Khunsari",
"kg": "Kongo",
"kg-AO": "Kongo (Angola)",
"kha": "Khasi",
Expand Down Expand Up @@ -661,6 +670,7 @@
"kok": "Konkani",
"kok-IN": "Konkani (India)",
"koo": "Konzo",
"kpv": "Komi-Zyrian",
"kqn": "Kaonde",
"kqs": "Northern Kissi",
"kr": "Kanuri",
Expand Down Expand Up @@ -801,6 +811,7 @@
"my": "Burmese",
"my-MM": "Burmese (Myanmar (Burma))",
"my-x-zawgyi": "Myanmar (Zawgyi)",
"myu": "Mundurukú",
"myv": "Erzya",
"mzi": "Ixcatlán Mazatec",
"mzn": "Mazanderani",
Expand Down Expand Up @@ -861,6 +872,7 @@
"nym": "Nyamwezi",
"nyn": "Nyankole",
"nyn-UG": "Nyankole (Uganda)",
"nyq": "Nayini",
"nzi": "Nzima",
"oaa": "Orok",
"oc": "Occitan (post 1500)",
Expand All @@ -874,10 +886,12 @@
"or": "Odia",
"or-IN": "Odia (India)",
"orh": "Oroqen",
"orv": "Old Russian",
"os": "Ossetic",
"os-GE": "Ossetic (Georgia)",
"os-RU": "Ossetic (Russia)",
"ote": "Mezquital Otomi",
"otk": "Old Turkish",
"oto": "Otomi",
"pa": "Punjabi",
"pa-Arab": "Punjabi (Arabic)",
Expand Down Expand Up @@ -930,7 +944,7 @@
"pt-MZ": "Portuguese (Mozambique)",
"pt-PT": "Portuguese (Portugal)",
"pt-ST": "Portuguese (São Tomé & Príncipe)",
"pt-TL": "Portuguese (Timor-Leste)",
"pt-TL": "Portuguese (Timor-Leste)",
"qu": "Quechua",
"qu-BO": "Quechua (Bolivia)",
"qu-EC": "Quechua (Ecuador)",
Expand Down Expand Up @@ -1033,9 +1047,11 @@
"sl-SI": "Slovenian (Slovenia)",
"slr": "Salar",
"sm": "San Marino",
"sme": "Northern Sami",
"sml": "Central Sama",
"smn": "Inari Sami",
"smn-FI": "Inari Sami (Finland)",
"sms": "Skolt Sami",
"sn": "Shona",
"sn-ZW": "Shona (Zimbabwe)",
"snk": "Soninke",
Expand All @@ -1045,6 +1061,7 @@
"so-ET": "Somali (Ethiopia)",
"so-KE": "Somali (Kenya)",
"so-SO": "Somali (Somalia)",
"soj": "Soi",
"son": "Songhai languages",
"sq": "Albanian",
"sq-AL": "Albanian (Albania)",
Expand Down Expand Up @@ -1084,6 +1101,7 @@
"sw-UG": "Swahili (Uganda)",
"swb": "Maore Comorian",
"swh": "Swahili (individual language); Kiswahili",
"swl": "Swedish Sign Language",
"syc": "Classical Syriac",
"syr": "Syriac",
"szl": "Silesian",
Expand Down Expand Up @@ -1131,6 +1149,7 @@
"toj": "Tojolabal",
"top": "Papantla Totonac",
"tpi": "Tok Pisin",
"tpn": "Tupinambá",
"tr": "Turkish",
"tr-CY": "Turkish (Cyprus)",
"tr-TR": "Turkish (Turkey)",
Expand Down Expand Up @@ -1197,6 +1216,7 @@
"wae": "Walser",
"wae-CH": "Walser (Switzerland)",
"war": "Waray (Philippines)",
"wbp": "Warlpiri",
"wo": "Wolof",
"wo-SN": "Wolof (Senegal)",
"wuu": "Wu Chinese",
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/utils/stratify.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng):
need_to_add -= add_now
if need_to_add == 0:
break
return floored.astype(np.int)
return floored.astype(np.int64)


def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10):
Expand Down
12 changes: 6 additions & 6 deletions tests/commands/test_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from packaging import version

from datasets import config
from datasets.commands.test import TestCommand
from datasets.commands.test import CLITestCommand


if config.PY_VERSION >= version.parse("3.7"):
TestCommandArgs = namedtuple(
"TestCommandArgs",
_TestCommandArgs = namedtuple(
"_TestCommandArgs",
[
"dataset",
"name",
Expand All @@ -28,7 +28,7 @@
else:

@dataclass
class TestCommandArgs:
class _TestCommandArgs:
dataset: str
name: str = None
cache_dir: str = None
Expand All @@ -44,8 +44,8 @@ def __iter__(self):


def test_test_command(dataset_loading_script_dir):
args = TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True)
test_command = TestCommand(*args)
args = _TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True)
test_command = CLITestCommand(*args)
test_command.run()
dataset_infos_path = os.path.join(dataset_loading_script_dir, config.DATASETDICT_INFOS_FILENAME)
assert os.path.exists(dataset_infos_path)
Expand Down
2 changes: 1 addition & 1 deletion tests/features/test_array_xd.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def test_array_xd_with_none():
dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
arr = NumpyArrowExtractor().extract_column(dataset._data)
assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,)
assert isinstance(arr, np.ndarray) and arr.dtype == np.dtype("object") and arr.shape == (3,)
np.testing.assert_equal(arr[0], dummy_array)
np.testing.assert_equal(arr[2], dummy_array)
assert np.isnan(arr[1]) # a single np.nan value - np.all not needed
Expand Down
Loading