Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datasets/banking77/banking77.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from datasets.tasks import TextClassification


_CITATION = """\
_CITATION = r"""\
@inproceedings{Casanueva2020,
author = {I{\~{n}}igo Casanueva and Tadas Temcinas and Daniela Gerz and Matthew Henderson and Ivan Vulic},
title = {Efficient Intent Detection with Dual Sentence Encoders},
Expand Down
9 changes: 4 additions & 5 deletions datasets/universal_dependencies/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@ languages:
- pcm
- pl
- pt
- qhe
- qtd
- ro
- ru
- sa
Expand Down Expand Up @@ -117,10 +115,11 @@ size_categories:
source_datasets:
- original
task_categories:
- other
- token-classification
task_ids:
- constituency-parsing
- dependency-parsing
- parsing
- token-classification-other-constituency-parsing
- token-classification-other-dependency-parsing
paperswithcode_id: universal-dependencies
pretty_name: Universal Dependencies Treebank
configs:
Expand Down
2 changes: 1 addition & 1 deletion datasets/universal_dependencies/universal_dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import datasets


_CITATION = """\
_CITATION = r"""\
@misc{11234/1-3424,
title = {Universal Dependencies 2.7},
author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\"e}mi and Aghaei, Hamid and Agi{\'c}, {\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\'o}ttir, {\t H}{\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Bjarnad{\'o}ttir, Krist{\'{\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\'a}rd and Fernanda, Mar{\'{\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdosov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\'{\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra,
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def _get_output_signature(
else:
np_arrays.append(np.array(array))

if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.bool:
if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.dtype("bool"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The warning message suggests to use bool instead.

Suggested change
if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == np.dtype("bool"):
if np.issubdtype(np_arrays[0].dtype, np.integer) or np_arrays[0].dtype == bool:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, and for object as well

tf_dtype = tf.int64
np_dtype = np.int64
elif np.issubdtype(np_arrays[0].dtype, np.number):
Expand Down Expand Up @@ -3804,7 +3804,7 @@ def _feature(values: Union[float, int, str, np.ndarray]) -> "tf.train.Feature":
return _float_feature([values.item()])
elif np.issubdtype(values.dtype, np.integer):
return _int64_feature([values.item()])
elif np.issubdtype(values.dtype, np.str):
elif np.issubdtype(values.dtype, str):
return _bytes_feature([values.item().encode()])
else:
raise ValueError(f"values={values} has dtype {values.dtype}, which cannot be serialized")
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/commands/datasets_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datasets.commands.dummy_data import DummyDataCommand
from datasets.commands.env import EnvironmentCommand
from datasets.commands.run_beam import RunBeamCommand
from datasets.commands.test import TestCommand
from datasets.commands.test import CLITestCommand
Copy link
Member

@albertvillanova albertvillanova Jun 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are preceding with CLI only the TestCommand; i.e. the rest of the command names are not preceded by CLI.

IMHO, either all or none should be preceded by CLI.

On the other hand, from your commit message, I guess you introduce this modification to prevent pytest default test discovery to identify this class as a test. If this is the reason, I would strongly suggest using testpaths instead: https://docs.pytest.org/en/latest/reference/reference.html#confval-testpaths

  • We tell pytest to search for tests only inside the tests directory
  • Moreover, tests discovery/collection will be faster

I just realized pytest is called with arg ./tests... I did not consider imported classes as possible test candidates, thus I thought pytest was called without arguments and it was searching inside src to find TestCommand. Sorry for the confusion.

Copy link
Member

@albertvillanova albertvillanova Jun 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pytest suggests adding the class attribute __test__ = False to TestCommand, but I am not sure if this is better... :/

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went for __test__ = False since it's simpler

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please note there are still some CLI... leftovers you should fix before merging.

from datasets.utils.logging import set_verbosity_info


Expand All @@ -23,7 +23,7 @@ def main():
# Register commands
ConvertCommand.register_subcommand(commands_parser)
EnvironmentCommand.register_subcommand(commands_parser)
TestCommand.register_subcommand(commands_parser)
CLITestCommand.register_subcommand(commands_parser)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above...

RunBeamCommand.register_subcommand(commands_parser)
DummyDataCommand.register_subcommand(commands_parser)

Expand Down
8 changes: 4 additions & 4 deletions src/datasets/commands/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
logger = get_logger(__name__)


def test_command_factory(args):
return TestCommand(
def cli_test_command_factory(args):
return CLITestCommand(
args.dataset,
args.name,
args.cache_dir,
Expand All @@ -30,7 +30,7 @@ def test_command_factory(args):
)


class TestCommand(BaseDatasetsCLICommand):
class CLITestCommand(BaseDatasetsCLICommand):
@staticmethod
def register_subcommand(parser: ArgumentParser):
test_parser = parser.add_parser("test", help="Test dataset implementation.")
Expand Down Expand Up @@ -59,7 +59,7 @@ def register_subcommand(parser: ArgumentParser):
help="Remove downloaded files and cached datasets after each config test",
)
test_parser.add_argument("dataset", type=str, help="Name of the dataset to download")
test_parser.set_defaults(func=test_command_factory)
test_parser.set_defaults(func=cli_test_command_factory)

def __init__(
self,
Expand Down
4 changes: 2 additions & 2 deletions src/datasets/formatting/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,11 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
if len(array) > 0:
if any(
(isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
(isinstance(x, np.ndarray) and (x.dtype == np.dtype("object") or x.shape != array[0].shape))
or (isinstance(x, float) and np.isnan(x))
for x in array
):
return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.dtype("object")})
return np.array(array, copy=False, **self.np_array_kwargs)


Expand Down
2 changes: 1 addition & 1 deletion src/datasets/formatting/jax_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
data_struct = np.array(data_struct, copy=False)
if data_struct.dtype == np.object: # jax arrays cannot be instantiated from an array of objects
if data_struct.dtype == np.dtype("object"): # jax arrays cannot be instantiated from an array of objects
return [self.recursive_tensorize(substruct) for substruct in data_struct]
return self._tensorize(data_struct)

Expand Down
4 changes: 2 additions & 2 deletions src/datasets/formatting/tf_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def _tensorize(self, value):
def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
if (
data_struct.dtype == np.object
if data_struct.dtype == np.dtype(
"object"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above.

Suggested change
if data_struct.dtype == np.dtype(
"object"
if (
data_struct.dtype == object

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not resolved yet.

): # tensorflow tensors can sometimes be instantiated from an array of objects
try:
return self._tensorize(data_struct)
Expand Down
4 changes: 3 additions & 1 deletion src/datasets/formatting/torch_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def _recursive_tensorize(self, data_struct: dict):
# support for nested types like struct of list of struct
if isinstance(data_struct, (list, np.ndarray)):
data_struct = np.array(data_struct, copy=False)
if data_struct.dtype == np.object: # pytorch tensors cannot be instantied from an array of objects
if data_struct.dtype == np.dtype(
"object"
): # pytorch tensors cannot be instantiated from an array of objects
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same.

Suggested change
if data_struct.dtype == np.dtype(
"object"
): # pytorch tensors cannot be instantiated from an array of objects
if data_struct.dtype == object: # pytorch tensors cannot be instantiated from an array of objects

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not resolved yet.

return [self.recursive_tensorize(substruct) for substruct in data_struct]
return self._tensorize(data_struct)

Expand Down
22 changes: 21 additions & 1 deletion src/datasets/utils/resources/languages.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
"agr": "Aguaruna",
"aii": "Assyrian Neo-Aramaic",
"ajg": "Aja (Benin)",
"ajp": "South Levantine Arabic",
"ak": "Akan",
"akk": "Akkadian",
"ak-GH": "Akan (Ghana)",
"als": "Tosk Albanian",
"alt": "Southern Altai",
Expand All @@ -25,6 +27,8 @@
"amr": "Amarakaeri",
"an": "Aragonese",
"ang": "English, Old (ca.450-1100)",
"apu": "Apurinã",
"aqz": "Akuntsu",
"ar": "Arabic",
"ar-001": "Arabic (World)",
"ar-AE": "Arabic (United Arab Emirates)",
Expand Down Expand Up @@ -175,13 +179,15 @@
"ckb-IQ": "Central Kurdish (Iraq)",
"ckb-IR": "Central Kurdish (Iran)",
"ckb-Latn": "Central Kurdish (Latin)",
"ckt": "Chukot",
"cmn": "Mandarin Chinese",
"cnh": "Hakha Chin",
"cni": "Ashaninka",
"cnr": "Montenegrin",
"co": "Corsican",
"code": "Programming language (C++, Java, Javascript, Python, etc.)",
"cof": "Colorado",
"cop": "Coptic",
"cot": "Caquinte",
"cpu": "Pichis Ashéninka",
"cr": "Cree",
Expand Down Expand Up @@ -487,6 +493,7 @@
"fr-WF": "French (Wallis & Futuna)",
"fr-YT": "French (Mayotte)",
"frm": "Middle French (ca. 1400-1600)",
"fro": "Old French (842-ca. 1400)",
"frp": "Franco-Provençal",
"frr": "Northern Frisian",
"fuf": "Pular",
Expand Down Expand Up @@ -526,6 +533,7 @@
"gu-IN": "Gujarati (India)",
"guc": "Wayuu",
"gug": "Paraguayan Guaraní",
"gun": "Mbyá Guaraní",
"guu": "Yanomamö",
"guz": "Gusii",
"guz-KE": "Gusii (Kenya)",
Expand Down Expand Up @@ -626,6 +634,7 @@
"kea": "Kabuverdianu",
"kea-CV": "Kabuverdianu (Cape Verde)",
"kek": "Kekchí",
"kfm": "Khunsari",
"kg": "Kongo",
"kg-AO": "Kongo (Angola)",
"kha": "Khasi",
Expand Down Expand Up @@ -661,6 +670,7 @@
"kok": "Konkani",
"kok-IN": "Konkani (India)",
"koo": "Konzo",
"kpv": "Komi-Zyrian",
"kqn": "Kaonde",
"kqs": "Northern Kissi",
"kr": "Kanuri",
Expand Down Expand Up @@ -801,6 +811,7 @@
"my": "Burmese",
"my-MM": "Burmese (Myanmar (Burma))",
"my-x-zawgyi": "Myanmar (Zawgyi)",
"myu": "Mundurukú",
"myv": "Erzya",
"mzi": "Ixcatlán Mazatec",
"mzn": "Mazanderani",
Expand Down Expand Up @@ -861,6 +872,7 @@
"nym": "Nyamwezi",
"nyn": "Nyankole",
"nyn-UG": "Nyankole (Uganda)",
"nyq": "Nayini",
"nzi": "Nzima",
"oaa": "Orok",
"oc": "Occitan (post 1500)",
Expand All @@ -874,10 +886,12 @@
"or": "Odia",
"or-IN": "Odia (India)",
"orh": "Oroqen",
"orv": "Old Russian",
"os": "Ossetic",
"os-GE": "Ossetic (Georgia)",
"os-RU": "Ossetic (Russia)",
"ote": "Mezquital Otomi",
"otk": "Old Turkish",
"oto": "Otomi",
"pa": "Punjabi",
"pa-Arab": "Punjabi (Arabic)",
Expand Down Expand Up @@ -930,7 +944,7 @@
"pt-MZ": "Portuguese (Mozambique)",
"pt-PT": "Portuguese (Portugal)",
"pt-ST": "Portuguese (São Tomé & Príncipe)",
"pt-TL": "Portuguese (Timor-Leste)",
"pt-TL": "Portuguese (Timor-Leste)",
"qu": "Quechua",
"qu-BO": "Quechua (Bolivia)",
"qu-EC": "Quechua (Ecuador)",
Expand Down Expand Up @@ -1033,9 +1047,11 @@
"sl-SI": "Slovenian (Slovenia)",
"slr": "Salar",
"sm": "San Marino",
"sme": "Northern Sami",
"sml": "Central Sama",
"smn": "Inari Sami",
"smn-FI": "Inari Sami (Finland)",
"sms": "Skolt Sami",
"sn": "Shona",
"sn-ZW": "Shona (Zimbabwe)",
"snk": "Soninke",
Expand All @@ -1045,6 +1061,7 @@
"so-ET": "Somali (Ethiopia)",
"so-KE": "Somali (Kenya)",
"so-SO": "Somali (Somalia)",
"soj": "Soi",
"son": "Songhai languages",
"sq": "Albanian",
"sq-AL": "Albanian (Albania)",
Expand Down Expand Up @@ -1084,6 +1101,7 @@
"sw-UG": "Swahili (Uganda)",
"swb": "Maore Comorian",
"swh": "Swahili (individual language); Kiswahili",
"swl": "Swedish Sign Language",
"syc": "Classical Syriac",
"syr": "Syriac",
"szl": "Silesian",
Expand Down Expand Up @@ -1131,6 +1149,7 @@
"toj": "Tojolabal",
"top": "Papantla Totonac",
"tpi": "Tok Pisin",
"tpn": "Tupinambá",
"tr": "Turkish",
"tr-CY": "Turkish (Cyprus)",
"tr-TR": "Turkish (Turkey)",
Expand Down Expand Up @@ -1197,6 +1216,7 @@
"wae": "Walser",
"wae-CH": "Walser (Switzerland)",
"war": "Waray (Philippines)",
"wbp": "Warlpiri",
"wo": "Wolof",
"wo-SN": "Wolof (Senegal)",
"wuu": "Wu Chinese",
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/utils/stratify.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def approximate_mode(class_counts, n_draws, rng):
need_to_add -= add_now
if need_to_add == 0:
break
return floored.astype(np.int)
return floored.astype(np.int64)


def stratified_shuffle_split_generate_indices(y, n_train, n_test, rng, n_splits=10):
Expand Down
12 changes: 6 additions & 6 deletions tests/commands/test_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@
from packaging import version

from datasets import config
from datasets.commands.test import TestCommand
from datasets.commands.test import CLITestCommand


if config.PY_VERSION >= version.parse("3.7"):
TestCommandArgs = namedtuple(
"TestCommandArgs",
_TestCommandArgs = namedtuple(
"_TestCommandArgs",
[
"dataset",
"name",
Expand All @@ -28,7 +28,7 @@
else:

@dataclass
class TestCommandArgs:
class _TestCommandArgs:
dataset: str
name: str = None
cache_dir: str = None
Expand All @@ -44,8 +44,8 @@ def __iter__(self):


def test_test_command(dataset_loading_script_dir):
args = TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True)
test_command = TestCommand(*args)
args = _TestCommandArgs(dataset=dataset_loading_script_dir, all_configs=True, save_infos=True)
test_command = CLITestCommand(*args)
test_command.run()
dataset_infos_path = os.path.join(dataset_loading_script_dir, config.DATASETDICT_INFOS_FILENAME)
assert os.path.exists(dataset_infos_path)
Expand Down
2 changes: 1 addition & 1 deletion tests/features/test_array_xd.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def test_array_xd_with_none():
dummy_array = np.array([[1, 2], [3, 4]], dtype="int32")
dataset = datasets.Dataset.from_dict({"foo": [dummy_array, None, dummy_array]}, features=features)
arr = NumpyArrowExtractor().extract_column(dataset._data)
assert isinstance(arr, np.ndarray) and arr.dtype == np.object and arr.shape == (3,)
assert isinstance(arr, np.ndarray) and arr.dtype == np.dtype("object") and arr.shape == (3,)
np.testing.assert_equal(arr[0], dummy_array)
np.testing.assert_equal(arr[2], dummy_array)
assert np.isnan(arr[1]) # a single np.nan value - np.all not needed
Expand Down
Loading