2 changes: 1 addition & 1 deletion datasets/banking77/banking77.py
@@ -21,7 +21,7 @@
from datasets.tasks import TextClassification


_CITATION = """\
_CITATION = r"""\
@inproceedings{Casanueva2020,
author = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic},
title = {Efficient Intent Detection with Dual Sentence Encoders},
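Why both `_CITATION` strings in this PR gain the `r` prefix: BibTeX accent commands such as `{\v c}` collide with Python escape sequences, so in a plain string the backslash is silently consumed. A minimal sketch with an illustrative author name:

```python
plain = "Agi{\'c}, {\v Z}eljko"  # "\'" -> quote, "\v" -> vertical tab
raw = r"Agi{\'c}, {\v Z}eljko"   # raw string keeps every backslash verbatim

print("\\v" in plain)  # False - the BibTeX accent command was eaten
print("\\v" in raw)    # True  - the citation survives intact
```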
9 changes: 4 additions & 5 deletions datasets/universal_dependencies/README.md
@@ -79,8 +79,6 @@ languages:
- pcm
- pl
- pt
-- qhe
-- qtd
- ro
- ru
- sa
@@ -117,10 +115,11 @@ size_categories:
source_datasets:
- original
task_categories:
-- other
+- token-classification
task_ids:
+- constituency-parsing
+- dependency-parsing
+- parsing
-- token-classification-other-constituency-parsing
-- token-classification-other-dependency-parsing
paperswithcode_id: universal-dependencies
pretty_name: Universal Dependencies Treebank
configs:
2 changes: 1 addition & 1 deletion datasets/universal_dependencies/universal_dependencies.py
@@ -3,7 +3,7 @@
import datasets


_CITATION = """\
_CITATION = r"""\
@misc{11234/1-3424,
title = {Universal Dependencies 2.7},
author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\"e}mi and Aghaei, Hamid and Agi{\'c}, {\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\'o}ttir, {\t H}{\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Bjarnad{\'o}ttir, Krist{\'{\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\'a}rd and Fernanda, Mar{\'{\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdosov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\'{\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra,
4 changes: 2 additions & 2 deletions src/datasets/arrow_dataset.py
@@ -281,7 +281,7 @@ def _get_output_signature(
else:
np_arrays.append(np.array(array))

-if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool:
+if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool:
tf_dtype = tf.int64
np_dtype = np.int64
elif np.issubdtype(np_arrays[0].dtype, np.number):
@@ -3804,7 +3804,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature":
return _float_feature([values.item()])
elif np.issubdtype(values.dtype, np.integer):
return _int64_feature([values.item()])
-elif np.issubdtype(values.dtype, np.str):
+elif np.issubdtype(values.dtype, str):
return _bytes_feature([values.item().encode()])
else:
raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized")
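All the NumPy changes in this PR follow one pattern: `np.bool`, `np.int`, `np.object`, and `np.str` were plain aliases for the Python builtins, deprecated in NumPy 1.20 and later removed, so the builtins (or an explicit dtype) are drop-in replacements. A quick sketch:

```python
import numpy as np

assert np.array([True, False]).dtype == bool           # was: == np.bool
assert np.issubdtype(np.array([1, 2]).dtype, np.integer)
assert np.issubdtype(np.array(["a"]).dtype, str)       # str resolves to np.str_
```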
4 changes: 2 additions & 2 deletions src/datasets/commands/dummy_data.py
@@ -23,7 +23,7 @@
DEFAULT_ENCODING = "utf-8"


-def test_command_factory(args):
+def dummy_data_command_factory(args):
return DummyDataCommand(
args.path_to_dataset,
args.auto_generate,
@@ -256,7 +256,7 @@ def register_subcommand(parser: ArgumentParser):
help=f"Encoding to use when auto-generating dummy data. Defaults to {DEFAULT_ENCODING}",
)
test_parser.add_argument("path_to_dataset", type=str, help="Path to the dataset (example: ./datasets/squad)")
-test_parser.set_defaults(func=test_command_factory)
+test_parser.set_defaults(func=dummy_data_command_factory)

def __init__(
self,
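The rename only touches naming, not behavior: `set_defaults(func=...)` stores the factory on the parsed namespace, and the CLI entry point calls it to build the command. A minimal sketch of that dispatch pattern, with a stubbed command class:

```python
from argparse import ArgumentParser

class DummyDataCommand:
    def __init__(self, path_to_dataset):
        self.path_to_dataset = path_to_dataset

    def run(self):
        print(f"generating dummy data for {self.path_to_dataset}")

def dummy_data_command_factory(args):
    return DummyDataCommand(args.path_to_dataset)

parser = ArgumentParser("datasets-cli")
subparsers = parser.add_subparsers()
dummy_parser = subparsers.add_parser("dummy_data")
dummy_parser.add_argument("path_to_dataset")
dummy_parser.set_defaults(func=dummy_data_command_factory)

args = parser.parse_args(["dummy_data", "./datasets/squad"])
args.func(args).run()  # -> generating dummy data for ./datasets/squad
```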
6 changes: 4 additions & 2 deletions src/datasets/commands/test.py
@@ -16,7 +16,7 @@
logger = get_logger(__name__)


-def test_command_factory(args):
+def _test_command_factory(args):
return TestCommand(
args.dataset,
args.name,
@@ -31,6 +31,8 @@ def test_command_factory(args):


class TestCommand(BaseDatasetsCLICommand):
+__test__ = False # to tell pytest it's not a test class
+
@staticmethod
def register_subcommand(parser: ArgumentParser):
test_parser = parser.add_parser("test", help="Test dataset implementation.")
@@ -59,7 +61,7 @@ def register_subcommand(parser: ArgumentParser):
help="Remove downloaded files and cached datasets after each config test",
)
test_parser.add_argument("dataset", type=str, help="Name of the dataset to download")
-test_parser.set_defaults(func=test_command_factory)
+test_parser.set_defaults(func=_test_command_factory)

def __init__(
self,
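Both edits here work around pytest's default collection rules: any `Test*`-named class (and any `test_*` function) found in a collected module is picked up as a test. `__test__ = False` and the underscore prefix both opt out. A sketch:

```python
class TestCommand:
    __test__ = False  # pytest skips this class despite the Test* name

def _test_command_factory(args):  # leading underscore: never collected
    return TestCommand()
```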
4 changes: 2 additions & 2 deletions src/datasets/formatting/formatting.py
@@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
if len(array) > 0:
if any(
-(isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
+(isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape))
or (isinstance(x, float) and np.isnan(x))
for x in array
):
-return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
+return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
return np.array(array, copy=False, **self.np_array_kwargs)


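For context, the `object` dtype is what lets a single 1-D NumPy array hold ragged rows (arrays of differing shapes) or missing values, which is exactly what this branch guards. A small sketch:

```python
import numpy as np

ragged = [np.array([1, 2, 3]), np.array([1, 2])]
col = np.array(ragged, dtype=object)  # 1-D array holding two ndarrays
assert col.dtype == object and col.shape == (2,)
# Without dtype=object, modern NumPy rejects ragged input with a ValueError.
```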
2 changes: 1 addition & 1 deletion src/datasets/formatting/jax_formatter.py
@@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
data_struct = np.array(data_struct, copy=False)
-if data_struct.dtype == np.object: # jax arrays cannot be instantiated from an array of objects
+if data_struct.dtype == object: # jax arrays cannot be instantiated from an array of objects
return [self.recursive_tensorize(substruct) for substruct in data_struct]
return self._tensorize(data_struct)

4 changes: 1 addition & 3 deletions src/datasets/formatting/tf_formatter.py
@@ -65,9 +65,7 @@ def _tensorize(self, value):
def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
-if (
-data_struct.dtype == np.object
-): # tensorflow tensors can sometimes be instantiated from an array of objects
+if data_struct.dtype == object: # tensorflow tensors can sometimes be instantiated from an array of objects
try:
return self._tensorize(data_struct)
except ValueError:
2 changes: 1 addition & 1 deletion src/datasets/formatting/torch_formatter.py
@@ -46,7 +46,7 @@ def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
data_struct = np.array(data_struct, copy=False)
-if data_struct.dtype == np.object: # pytorch tensors cannot be instantiated from an array of objects
+if data_struct.dtype == object: # pytorch tensors cannot be instantiated from an array of objects
return [self.recursive_tensorize(substruct) for substruct in data_struct]
return self._tensorize(data_struct)

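The jax, tf, and torch formatters all share this shape: a framework cannot build a tensor from an object-dtype array, so the code falls back to tensorizing element by element. A runnable sketch of the shared pattern, shown with torch:

```python
import numpy as np
import torch

def recursive_tensorize(data_struct):
    # object dtype signals ragged or nested data: recurse per element
    # instead of tensorizing the whole array at once.
    if isinstance(data_struct, (list, np.ndarray)):
        try:
            data_struct = np.asarray(data_struct)
        except ValueError:  # NumPy >= 1.24 rejects ragged input outright
            data_struct = np.asarray(data_struct, dtype=object)
        if data_struct.dtype == object:
            return [recursive_tensorize(sub) for sub in data_struct]
    return torch.tensor(data_struct)

print(recursive_tensorize([np.array([1, 2, 3]), np.array([1, 2])]))
# -> [tensor([1, 2, 3]), tensor([1, 2])]
```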
22 changes: 21 additions & 1 deletion src/datasets/utils/resources/languages.json
@@ -13,7 +13,9 @@
"agr": "Aguaruna",
"aii": "Assyrian Neo-Aramaic",
"ajg": "Aja (Benin)",
"ajp": "South Levantine Arabic",
"ak": "Akan",
"akk": "Akkadian",
"ak-GH": "Akan (Ghana)",
"als": "Tosk Albanian",
"alt": "Southern Altai",
@@ -25,6 +27,8 @@
"amr": "Amarakaeri",
"an": "Aragonese",
"ang": "English, Old (ca.450-1100)",
"apu": "Apurinã",
"aqz": "Akuntsu",
"ar": "Arabic",
"ar-001": "Arabic (World)",
"ar-AE": "Arabic (United Arab Emirates)",
@@ -175,13 +179,15 @@
"ckb-IQ": "Central Kurdish (Iraq)",
"ckb-IR": "Central Kurdish (Iran)",
"ckb-Latn": "Central Kurdish (Latin)",
"ckt": "Chukot",
"cmn": "Mandarin Chinese",
"cnh": "Hakha Chin",
"cni": "Ashaninka",
"cnr": "Montenegrin",
"co": "Corsican",
"code": "Programming language (C++, Java, Javascript, Python, etc.)",
"cof": "Colorado",
"cop": "Coptic",
"cot": "Caquinte",
"cpu": "Pichis Ashéninka",
"cr": "Cree",
@@ -487,6 +493,7 @@
"fr-WF": "French (Wallis & Futuna)",
"fr-YT": "French (Mayotte)",
"frm": "Middle French (ca. 1400-1600)",
"fro": "Old French (842-ca. 1400)",
"frp": "Franco-Provençal",
"frr": "Northern Frisian",
"fuf": "Pular",
@@ -526,6 +533,7 @@
"gu-IN": "Gujarati (India)",
"guc": "Wayuu",
"gug": "Paraguayan Guaraní",
"gun": "Mbyá Guaraní",
"guu": "Yanomamö",
"guz": "Gusii",
"guz-KE": "Gusii (Kenya)",
@@ -626,6 +634,7 @@
"kea": "Kabuverdianu",
"kea-CV": "Kabuverdianu (Cape Verde)",
"kek": "Kekchí",
"kfm": "Khunsari",
"kg": "Kongo",
"kg-AO": "Kongo (Angola)",
"kha": "Khasi",
@@ -661,6 +670,7 @@
"kok": "Konkani",
"kok-IN": "Konkani (India)",
"koo": "Konzo",
"kpv": "Komi-Zyrian",
"kqn": "Kaonde",
"kqs": "Northern Kissi",
"kr": "Kanuri",
@@ -801,6 +811,7 @@
"my": "Burmese",
"my-MM": "Burmese (Myanmar (Burma))",
"my-x-zawgyi": "Myanmar (Zawgyi)",
"myu": "Mundurukú",
"myv": "Erzya",
"mzi": "Ixcatlán Mazatec",
"mzn": "Mazanderani",
@@ -861,6 +872,7 @@
"nym": "Nyamwezi",
"nyn": "Nyankole",
"nyn-UG": "Nyankole (Uganda)",
"nyq": "Nayini",
"nzi": "Nzima",
"oaa": "Orok",
"oc": "Occitan (post 1500)",
@@ -874,10 +886,12 @@
"or": "Odia",
"or-IN": "Odia (India)",
"orh": "Oroqen",
"orv": "Old Russian",
"os": "Ossetic",
"os-GE": "Ossetic (Georgia)",
"os-RU": "Ossetic (Russia)",
"ote": "Mezquital Otomi",
"otk": "Old Turkish",
"oto": "Otomi",
"pa": "Punjabi",
"pa-Arab": "Punjabi (Arabic)",
@@ -930,7 +944,7 @@
"pt-MZ": "Portuguese (Mozambique)",
"pt-PT": "Portuguese (Portugal)",
"pt-ST": "Portuguese (São Tomé & Príncipe)",
"pt-TL": "Portuguese (Timor-Leste)",
"pt-TL": "Portuguese (Timor-Leste)",
"qu": "Quechua",
"qu-BO": "Quechua (Bolivia)",
"qu-EC": "Quechua (Ecuador)",
@@ -1033,9 +1047,11 @@
"sl-SI": "Slovenian (Slovenia)",
"slr": "Salar",
"sm": "San Marino",
"sme": "Northern Sami",
"sml": "Central Sama",
"smn": "Inari Sami",
"smn-FI": "Inari Sami (Finland)",
"sms": "Skolt Sami",
"sn": "Shona",
"sn-ZW": "Shona (Zimbabwe)",
"snk": "Soninke",
@@ -1045,6 +1061,7 @@
"so-ET": "Somali (Ethiopia)",
"so-KE": "Somali (Kenya)",
"so-SO": "Somali (Somalia)",
"soj": "Soi",
"son": "Songhai languages",
"sq": "Albanian",
"sq-AL": "Albanian (Albania)",
@@ -1084,6 +1101,7 @@
"sw-UG": "Swahili (Uganda)",
"swb": "Maore Comorian",
"swh": "Swahili (individual language); Kiswahili",
"swl": "Swedish Sign Language",
"syc": "Classical Syriac",
"syr": "Syriac",
"szl": "Silesian",
@@ -1131,6 +1149,7 @@
"toj": "Tojolabal",
"top": "Papantla Totonac",
"tpi": "Tok Pisin",
"tpn": "Tupinambá",
"tr": "Turkish",
"tr-CY": "Turkish (Cyprus)",
"tr-TR": "Turkish (Turkey)",
@@ -1197,6 +1216,7 @@
"wae": "Walser",
"wae-CH": "Walser (Switzerland)",
"war": "Waray (Philippines)",
"wbp": "Warlpiri",
"wo": "Wolof",
"wo-SN": "Wolof (Senegal)",
"wuu": "Wu Chinese",
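As I read the validation flow, this registry backs the `languages:` tags in dataset card headers; the new entries cover treebank languages added in Universal Dependencies 2.7. A sketch of looking a code up, assuming the repo layout shown above:

```python
import json

with open("src/datasets/utils/resources/languages.json", encoding="utf-8") as f:
    languages = json.load(f)

print(languages["akk"])  # "Akkadian" - one of the codes added here
```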
2 changes: 1 addition & 1 deletion src/datasets/utils/stratify.py
@@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng):
need_to_add -= add_now
if need_to_add == 0:
break
-return floored.astype(np.int)
+return floored.astype(np.int64)


def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10):
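`np.int` was likewise just the builtin `int`, so the old cast produced a platform-dependent width (32-bit on Windows); `np.int64` pins it explicitly. To illustrate with dummy counts:

```python
import numpy as np

floored = np.floor(np.array([2.7, 0.2, 4.9]))
counts = floored.astype(np.int64)  # explicit width; np.int is gone in NumPy >= 1.24
assert counts.tolist() == [2, 0, 4]
```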
8 changes: 4 additions & 4 deletions tests/commands/test_test.py
@@ -10,8 +10,8 @@


if config.PY_VERSION >= version.parse("3.7"):
-TestCommandArgs = namedtuple(
-"TestCommandArgs",
+_TestCommandArgs = namedtuple(
+"_TestCommandArgs",
[
"dataset",
"name",
@@ -28,7 +28,7 @@
else:

@dataclass
-class TestCommandArgs:
+class _TestCommandArgs:
dataset: str
name: str = None
cache_dir: str = None
@@ -44,7 +44,7 @@ def __iter__(self):


def test_test_command(dataset_loading_script_dir):
-args = TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True)
+args = _TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True)
test_command = TestCommand(*args)
test_command.run()
dataset_infos_path = os.path.join(dataset_loading_script_dir, config.DATASETDICT_INFOS_FILENAME)
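Same collection problem as in `commands/test.py`: pytest tries to collect any `Test*`-named class it imports in a test module (warning that it "cannot collect test class" when the class has a constructor, as the dataclass branch here does). The underscore keeps the helper container out of collection. Sketch:

```python
from collections import namedtuple

# A Test*-named helper in a test module gets picked up by pytest collection;
# the leading underscore avoids that entirely.
_TestCommandArgs = namedtuple("_TestCommandArgs", ["dataset", "all_configs", "save_infos"])

args = _TestCommandArgs(dataset="./datasets/squad", all_configs=True, save_infos=True)
```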
2 changes: 1 addition & 1 deletion tests/features/test_array_xd.py
@@ -335,7 +335,7 @@ def test_array_xd_with_none():
dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
arr = NumpyArrowExtractor().extract_column(dataset._data)
-assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,)
+assert isinstance(arr, np.ndarray) and arr.dtype == object and arr.shape == (3,)
np.testing.assert_equal(arr[0], dummy_array)
np.testing.assert_equal(arr[2], dummy_array)
assert np.isnan(arr[1]) # a single np.nan value - np.all not needed
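For reference, the behavior this assertion pins down: a column containing a `None` row comes back as an object array with a `nan` hole. A standalone sketch:

```python
import numpy as np

dummy = np.array([[1, 2], [3, 4]], dtype="int32")
col = np.array([dummy, np.nan, dummy], dtype=object)  # the None row surfaces as nan
assert col.dtype == object and col.shape == (3,)
assert np.isnan(col[1])
```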