diff --git a/datasets/banking77/banking77.py b/datasets/banking77/banking77.py index 5ef1f984656..3ea7594c31b 100644 --- a/datasets/banking77/banking77.py +++ b/datasets/banking77/banking77.py @@ -21,7 +21,7 @@ from datasets.tasks import TextClassification -_CITATION = """\ +_CITATION = r"""\ @inproceedings{Casanueva2020, author = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic}, title = {Efficient Intent Detection with Dual Sentence Encoders}, diff --git a/datasets/universal_dependencies/README.md b/datasets/universal_dependencies/README.md index 45cb58a2b06..627edd2acd6 100644 --- a/datasets/universal_dependencies/README.md +++ b/datasets/universal_dependencies/README.md @@ -79,8 +79,6 @@ languages: - pcm - pl - pt -- qhe -- qtd - ro - ru - sa @@ -117,10 +115,11 @@ size_categories: source_datasets: - original task_categories: -- other +- token-classification task_ids: -- constituency-parsing -- dependency-parsing +- parsing +- token-classification-other-constituency-parsing +- token-classification-other-dependency-parsing paperswithcode_id: universal-dependencies pretty_name: Universal Dependencies Treebank configs: diff --git a/datasets/universal_dependencies/universal_dependencies.py b/datasets/universal_dependencies/universal_dependencies.py index 6466bbad2b6..6ee775790ca 100644 --- a/datasets/universal_dependencies/universal_dependencies.py +++ b/datasets/universal_dependencies/universal_dependencies.py @@ -3,7 +3,7 @@ import datasets -_CITATION = """\ +_CITATION = r"""\ @misc{11234/1-3424, title = {Universal Dependencies 2.7}, author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\"e}mi and Aghaei, Hamid and Agi{\'c}, {\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\'o}ttir, {\t H}{\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Bjarnad{\'o}ttir, Krist{\'{\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\'a}rd and Fernanda, Mar{\'{\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdosov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\'{\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra, diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index ea420babffd..39221a16c46 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -281,7 +281,7 @@ def _get_output_signature( else: np_arrays.append(np.array(array)) - if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool: + if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool: tf_dtype = tf.int64 np_dtype = np.int64 elif np.issubdtype(np_arrays[0].dtype, np.number): @@ -3804,7 +3804,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature": return _float_feature([values.item()]) elif np.issubdtype(values.dtype, np.integer): return _int64_feature([values.item()]) - elif np.issubdtype(values.dtype, np.str): + elif np.issubdtype(values.dtype, str): return _bytes_feature([values.item().encode()]) else: raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized") diff --git a/src/datasets/commands/dummy_data.py b/src/datasets/commands/dummy_data.py index 27beda390eb..cb779480f5e 100644 --- a/src/datasets/commands/dummy_data.py +++ b/src/datasets/commands/dummy_data.py @@ -23,7 +23,7 @@ DEFAULT_ENCODING = "utf-8" -def test_command_factory(args): +def dummy_data_command_factory(args): return DummyDataCommand( args.path_to_dataset, args.auto_generate, @@ -256,7 +256,7 @@ def register_subcommand(parser: ArgumentParser): help=f"Encoding to use when auto-generating dummy data. Defaults to {DEFAULT_ENCODING}", ) test_parser.add_argument("path_to_dataset", type=str, help="Path to the dataset (example: ./datasets/squad)") - test_parser.set_defaults(func=test_command_factory) + test_parser.set_defaults(func=dummy_data_command_factory) def __init__( self, diff --git a/src/datasets/commands/test.py b/src/datasets/commands/test.py index a9f82f68653..59fa9690e47 100644 --- a/src/datasets/commands/test.py +++ b/src/datasets/commands/test.py @@ -16,7 +16,7 @@ logger = get_logger(__name__) -def test_command_factory(args): +def _test_command_factory(args): return TestCommand( args.dataset, args.name, @@ -31,6 +31,8 @@ def test_command_factory(args): class TestCommand(BaseDatasetsCLICommand): + __test__ = False # to tell pytest it's not a test class + @staticmethod def register_subcommand(parser: ArgumentParser): test_parser = parser.add_parser("test", help="Test dataset implementation.") @@ -59,7 +61,7 @@ def register_subcommand(parser: ArgumentParser): help="Remove downloaded files and cached datasets after each config test", ) test_parser.add_argument("dataset", type=str, help="Name of the dataset to download") - test_parser.set_defaults(func=test_command_factory) + test_parser.set_defaults(func=_test_command_factory) def __init__( self, diff --git a/src/datasets/formatting/formatting.py b/src/datasets/formatting/formatting.py index f857a53a992..ecc5b5e6606 100644 --- a/src/datasets/formatting/formatting.py +++ b/src/datasets/formatting/formatting.py @@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray: array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist() if len(array) > 0: if any( - (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape)) + (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape)) or (isinstance(x, float) and np.isnan(x)) for x in array ): - return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object}) + return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object}) return np.array(array, copy=False, **self.np_array_kwargs) diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py index dffe37bc5f0..0a554203be5 100644 --- a/src/datasets/formatting/jax_formatter.py +++ b/src/datasets/formatting/jax_formatter.py @@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): data_struct = np.array(data_struct, copy=False) - if data_struct.dtype == np.object: # jax arrays cannot be instantied from an array of objects + if data_struct.dtype == object: # jax arrays cannot be instantied from an array of objects return [self.recursive_tensorize(substruct) for substruct in data_struct] return self._tensorize(data_struct) diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 7e835280c54..d07f1f636cc 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -65,9 +65,7 @@ def _tensorize(self, value): def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): - if ( - data_struct.dtype == np.object - ): # tensorflow tensors can sometimes be instantied from an array of objects + if data_struct.dtype == object: # tensorflow tensors can sometimes be instantied from an array of objects try: return self._tensorize(data_struct) except ValueError: diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py index 3106a024920..c5a7d3c214f 100644 --- a/src/datasets/formatting/torch_formatter.py +++ b/src/datasets/formatting/torch_formatter.py @@ -46,7 +46,7 @@ def _recursive_tensorize(self, data_struct: dict): # support for nested types like struct of list of struct if isinstance(data_struct, (list, np.ndarray)): data_struct = np.array(data_struct, copy=False) - if data_struct.dtype == np.object: # pytorch tensors cannot be instantied from an array of objects + if data_struct.dtype == object: # pytorch tensors cannot be instantied from an array of objects return [self.recursive_tensorize(substruct) for substruct in data_struct] return self._tensorize(data_struct) diff --git a/src/datasets/utils/resources/languages.json b/src/datasets/utils/resources/languages.json index c064df801ce..9665b8d50aa 100644 --- a/src/datasets/utils/resources/languages.json +++ b/src/datasets/utils/resources/languages.json @@ -13,7 +13,9 @@ "agr": "Aguaruna", "aii": "Assyrian Neo-Aramaic", "ajg": "Aja (Benin)", + "ajp": "South Levantine Arabic", "ak": "Akan", + "akk": "Akkadian", "ak-GH": "Akan (Ghana)", "als": "Tosk Albanian", "alt": "Southern Altai", @@ -25,6 +27,8 @@ "amr": "Amarakaeri", "an": "Aragonese", "ang": "English, Old (ca.450-1100)", + "apu": "Apurinã", + "aqz": "Akuntsu", "ar": "Arabic", "ar-001": "Arabic (World)", "ar-AE": "Arabic (United Arab Emirates)", @@ -175,6 +179,7 @@ "ckb-IQ": "Central Kurdish (Iraq)", "ckb-IR": "Central Kurdish (Iran)", "ckb-Latn": "Central Kurdish (Latin)", + "ckt": "Chukot", "cmn": "Mandarin Chinese", "cnh": "Hakha Chin", "cni": "Ashaninka", @@ -182,6 +187,7 @@ "co": "Corsican", "code": "Programming language (C++, Java, Javascript, Python, etc.)", "cof": "Colorado", + "cop": "Coptic", "cot": "Caquinte", "cpu": "Pichis Ashéninka", "cr": "Cree", @@ -487,6 +493,7 @@ "fr-WF": "French (Wallis & Futuna)", "fr-YT": "French (Mayotte)", "frm": "Middle French (ca. 1400-1600)", + "fro": "Old French (842-ca. 1400)", "frp": "Franco-Provençal", "frr": "Northern Frisian", "fuf": "Pular", @@ -526,6 +533,7 @@ "gu-IN": "Gujarati (India)", "guc": "Wayuu", "gug": "Paraguayan Guaraní", + "gun": "Mbyá Guaraní", "guu": "Yanomamö", "guz": "Gusii", "guz-KE": "Gusii (Kenya)", @@ -626,6 +634,7 @@ "kea": "Kabuverdianu", "kea-CV": "Kabuverdianu (Cape Verde)", "kek": "Kekchí", + "kfm": "Khunsari", "kg": "Kongo", "kg-AO": "Kongo (Angola)", "kha": "Khasi", @@ -661,6 +670,7 @@ "kok": "Konkani", "kok-IN": "Konkani (India)", "koo": "Konzo", + "kpv": "Komi-Zyrian", "kqn": "Kaonde", "kqs": "Northern Kissi", "kr": "Kanuri", @@ -801,6 +811,7 @@ "my": "Burmese", "my-MM": "Burmese (Myanmar (Burma))", "my-x-zawgyi": "Myanmar (Zawgyi)", + "myu": "Mundurukú", "myv": "Erzya", "mzi": "Ixcatlán Mazatec", "mzn": "Mazanderani", @@ -861,6 +872,7 @@ "nym": "Nyamwezi", "nyn": "Nyankole", "nyn-UG": "Nyankole (Uganda)", + "nyq": "Nayini", "nzi": "Nzima", "oaa": "Orok", "oc": "Occitan (post 1500)", @@ -874,10 +886,12 @@ "or": "Odia", "or-IN": "Odia (India)", "orh": "Oroqen", + "orv": "Old Russian", "os": "Ossetic", "os-GE": "Ossetic (Georgia)", "os-RU": "Ossetic (Russia)", "ote": "Mezquital Otomi", + "otk": "Old Turkish", "oto": "Otomi", "pa": "Punjabi", "pa-Arab": "Punjabi (Arabic)", @@ -930,7 +944,7 @@ "pt-MZ": "Portuguese (Mozambique)", "pt-PT": "Portuguese (Portugal)", "pt-ST": "Portuguese (São Tomé & Príncipe)", - "pt-TL": "Portuguese (Timor-Leste)", + "pt-TL": "Portuguese (Timor-Leste)", "qu": "Quechua", "qu-BO": "Quechua (Bolivia)", "qu-EC": "Quechua (Ecuador)", @@ -1033,9 +1047,11 @@ "sl-SI": "Slovenian (Slovenia)", "slr": "Salar", "sm": "San Marino", + "sme": "Northern Sami", "sml": "Central Sama", "smn": "Inari Sami", "smn-FI": "Inari Sami (Finland)", + "sms": "Skolt Sami", "sn": "Shona", "sn-ZW": "Shona (Zimbabwe)", "snk": "Soninke", @@ -1045,6 +1061,7 @@ "so-ET": "Somali (Ethiopia)", "so-KE": "Somali (Kenya)", "so-SO": "Somali (Somalia)", + "soj": "Soi", "son": "Songhai languages", "sq": "Albanian", "sq-AL": "Albanian (Albania)", @@ -1084,6 +1101,7 @@ "sw-UG": "Swahili (Uganda)", "swb": "Maore Comorian", "swh": "Swahili (individual language); Kiswahili", + "swl": "Swedish Sign Language", "syc": "Classical Syriac", "syr": "Syriac", "szl": "Silesian", @@ -1131,6 +1149,7 @@ "toj": "Tojolabal", "top": "Papantla Totonac", "tpi": "Tok Pisin", + "tpn": "Tupinambá", "tr": "Turkish", "tr-CY": "Turkish (Cyprus)", "tr-TR": "Turkish (Turkey)", @@ -1197,6 +1216,7 @@ "wae": "Walser", "wae-CH": "Walser (Switzerland)", "war": "Waray (Philippines)", + "wbp": "Warlpiri", "wo": "Wolof", "wo-SN": "Wolof (Senegal)", "wuu": "Wu Chinese", diff --git a/src/datasets/utils/stratify.py b/src/datasets/utils/stratify.py index 58ea04f2f85..d0967aa1abb 100644 --- a/src/datasets/utils/stratify.py +++ b/src/datasets/utils/stratify.py @@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng): need_to_add -= add_now if need_to_add == 0: break - return floored.astype(np.int) + return floored.astype(np.int64) def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10): diff --git a/tests/commands/test_test.py b/tests/commands/test_test.py index ccb9ca664ab..d169afc43dd 100644 --- a/tests/commands/test_test.py +++ b/tests/commands/test_test.py @@ -10,8 +10,8 @@ if config.PY_VERSION >= version.parse("3.7"): - TestCommandArgs = namedtuple( - "TestCommandArgs", + _TestCommandArgs = namedtuple( + "_TestCommandArgs", [ "dataset", "name", @@ -28,7 +28,7 @@ else: @dataclass - class TestCommandArgs: + class _TestCommandArgs: dataset: str name: str = None cache_dir: str = None @@ -44,7 +44,7 @@ def __iter__(self): def test_test_command(dataset_loading_script_dir): - args = TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True) + args = _TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True) test_command = TestCommand(*args) test_command.run() dataset_infos_path = os.path.join(dataset_loading_script_dir, config.DATASETDICT_INFOS_FILENAME) diff --git a/tests/features/test_array_xd.py b/tests/features/test_array_xd.py index ae007abbe00..2f6d9d94009 100644 --- a/tests/features/test_array_xd.py +++ b/tests/features/test_array_xd.py @@ -335,7 +335,7 @@ def test_array_xd_with_none(): dummy_array = np.array([[1, 2], [3, 4]], dtype="int32") dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features) arr = NumpyArrowExtractor().extract_column(dataset._data) - assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,) + assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (3,) np.testing.assert_equal(arr[0], dummy_array) np.testing.assert_equal(arr[2], dummy_array) assert np.isnan(arr[1]) # a single np.nan value - np.all not needed diff --git a/tests/hub_fixtures.py b/tests/hub_fixtures.py index 133d05223e8..6bbc4f27e18 100644 --- a/tests/hub_fixtures.py +++ b/tests/hub_fixtures.py @@ -5,6 +5,8 @@ import requests from huggingface_hub.hf_api import HfApi, HfFolder +from datasets.utils._hf_hub_fixes import create_repo, delete_repo + USER = "__DUMMY_TRANSFORMERS_USER__" FULL_NAME = "Dummy User" @@ -34,7 +36,7 @@ def hf_token(hf_api: HfApi): @pytest.fixture(scope="session") def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file): repo_name = f"repo_txt_data-{int(time.time() * 10e3)}" - hf_api.create_repo(token=hf_token, name=repo_name, repo_type="dataset", private=True) + create_repo(hf_api, repo_name, token=hf_token, organization=USER, repo_type="dataset", private=True) repo_id = f"{USER}/{repo_name}" hf_api.upload_file( token=hf_token, @@ -45,7 +47,7 @@ def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file): ) yield repo_id try: - hf_api.delete_repo(repo_name, token=hf_token, repo_type="dataset") + delete_repo(hf_api, repo_name, token=hf_token, organization=USER, repo_type="dataset") except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error pass @@ -60,7 +62,7 @@ def hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_): @pytest.fixture(scope="session") def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_path): repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e3)}" - hf_api.create_repo(token=hf_token, name=repo_name, repo_type="dataset", private=True) + create_repo(hf_api, repo_name, token=hf_token, organization=USER, repo_type="dataset", private=True) repo_id = f"{USER}/{repo_name}" hf_api.upload_file( token=hf_token, @@ -71,7 +73,7 @@ def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_pa ) yield repo_id try: - hf_api.delete_repo(repo_name, token=hf_token, repo_type="dataset") + delete_repo(hf_api, repo_name, token=hf_token, organization=USER, repo_type="dataset") except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error pass @@ -86,7 +88,7 @@ def hf_private_dataset_repo_zipped_txt_data(hf_private_dataset_repo_zipped_txt_d @pytest.fixture(scope="session") def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path): repo_name = f"repo_zipped_img_data-{int(time.time() * 10e3)}" - hf_api.create_repo(token=hf_token, name=repo_name, repo_type="dataset", private=True) + create_repo(hf_api, repo_name, token=hf_token, organization=USER, repo_type="dataset", private=True) repo_id = f"{USER}/{repo_name}" hf_api.upload_file( token=hf_token, @@ -97,7 +99,7 @@ def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_ ) yield repo_id try: - hf_api.delete_repo(repo_name, token=hf_token, repo_type="dataset") + delete_repo(hf_api, repo_name, token=hf_token, organization=USER, repo_type="dataset") except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error pass diff --git a/tests/test_builder.py b/tests/test_builder.py index 853859d40ca..c7e08e0f01b 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -837,8 +837,8 @@ def _generate_examples(self): "builder_class, kwargs", [ (DummyBuilderWithVersion, {}), - (DummyBuilderWithBuilderConfigs, {"name": "custom"}), - (DummyBuilderWithCustomBuilderConfigs, {"name": "20220501.en"}), + (DummyBuilderWithBuilderConfigs, {"config_name": "custom"}), + (DummyBuilderWithCustomBuilderConfigs, {"config_name": "20220501.en"}), (DummyBuilderWithCustomBuilderConfigs, {"date": "20220501", "language": "ca"}), ], )