diff --git a/datasets/bn_hate_speech/bn_hate_speech.py b/datasets/bn_hate_speech/bn_hate_speech.py index 73260d86b08..451415eefb5 100644 --- a/datasets/bn_hate_speech/bn_hate_speech.py +++ b/datasets/bn_hate_speech/bn_hate_speech.py @@ -45,7 +45,7 @@ _LICENSE = "MIT License" -_URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/Bengali_%20Hate_Speech_Dataset_Subset.csv" +_URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv" # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case diff --git a/datasets/bn_hate_speech/dataset_infos.json b/datasets/bn_hate_speech/dataset_infos.json index 1ab68587217..c75219d049e 100644 --- a/datasets/bn_hate_speech/dataset_infos.json +++ b/datasets/bn_hate_speech/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and \nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset. \n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "citation": "@misc{karim2020classification,\n title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network}, \n author={Md. Rezaul Karim and Bharathi Raja Chakravarthi and John P. McCrae and Michael Cochez},\n year={2020},\n eprint={2004.07807},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/rezacsedu/Bengali-Hate-Speech-Dataset", "license": "MIT License", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 5, "names": ["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "bn_hate_speech", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 972635, "num_examples": 3418, "dataset_name": "bn_hate_speech"}}, "download_checksums": {"https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/Bengali_%20Hate_Speech_Dataset_Subset.csv": {"num_bytes": 974312, "checksum": "8ad2aab71c0a9ee61119767c3b628d18e9f2556d5cf52c6c80e677fb928c5420"}}, "download_size": 974312, "post_processing_size": null, "dataset_size": 972635, "size_in_bytes": 1946947}} \ No newline at end of file +{"default": {"description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "citation": "@misc{karim2020classification,\n title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network},\n author={Md. Rezaul Karim and Bharathi Raja Chakravarthi and John P. McCrae and Michael Cochez},\n year={2020},\n eprint={2004.07807},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/rezacsedu/Bengali-Hate-Speech-Dataset", "license": "MIT License", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 5, "names": ["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "bn_hate_speech", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 972635, "num_examples": 3418, "dataset_name": "bn_hate_speech"}}, "download_checksums": {"https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv": {"num_bytes": 974312, "checksum": "8ad2aab71c0a9ee61119767c3b628d18e9f2556d5cf52c6c80e677fb928c5420"}}, "download_size": 974312, "post_processing_size": null, "dataset_size": 972635, "size_in_bytes": 1946947}} \ No newline at end of file diff --git a/datasets/bn_hate_speech/dummy/0.0.0/dummy_data.zip b/datasets/bn_hate_speech/dummy/0.0.0/dummy_data.zip index d8c831b3851..8ec0adf331b 100644 Binary files a/datasets/bn_hate_speech/dummy/0.0.0/dummy_data.zip and b/datasets/bn_hate_speech/dummy/0.0.0/dummy_data.zip differ diff --git a/datasets/covid_tweets_japanese/covid_tweets_japanese.py b/datasets/covid_tweets_japanese/covid_tweets_japanese.py index 9d5e3981fa4..3155ca15263 100644 --- a/datasets/covid_tweets_japanese/covid_tweets_japanese.py +++ b/datasets/covid_tweets_japanese/covid_tweets_japanese.py @@ -23,7 +23,7 @@ _CITATION = """\ No paper about this dataset is published yet. \ -Please cite this dataset as "鈴木 優: COVID-19 日本語 Twitter データセット (http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1)" +Please cite this dataset as "鈴木 優: COVID-19 日本語 Twitter データセット (http://www.db.info.gifu-u.ac.jp/covid-19-twitter-dataset/)" """ _DESCRIPTION = """\ @@ -32,14 +32,14 @@ The original tweets are not contained. Please use Twitter API to get them, for example. """ -_HOMEPAGE = "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1" +_HOMEPAGE = "http://www.db.info.gifu-u.ac.jp/covid-19-twitter-dataset/" _LICENSE = "CC-BY-ND 4.0" # The HuggingFace dataset library don't host the datasets but only point to the original files # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) _URLs = { - "url": "http://www.db.info.gifu-u.ac.jp/data/data.csv.bz2", + "url": "http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2", } diff --git a/datasets/covid_tweets_japanese/dataset_infos.json b/datasets/covid_tweets_japanese/dataset_infos.json index 88c362c07bc..d6314ee3a96 100644 --- a/datasets/covid_tweets_japanese/dataset_infos.json +++ b/datasets/covid_tweets_japanese/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.\n", "citation": "No paper about this dataset is published yet. Please cite this dataset as \"\u9234\u6728 \u512a: COVID-19 \u65e5\u672c\u8a9e Twitter \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 \uff08http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1\uff09\"\n", "homepage": "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1", "license": "CC-BY-ND 4.0", "features": {"tweet_id": {"dtype": "string", "id": null, "_type": "Value"}, "assessment_option_id": {"num_classes": 6, "names": ["63", "64", "65", "66", "67", "68"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "covid_tweets_japanese", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1662833, "num_examples": 53639, "dataset_name": "covid_tweets_japanese"}}, "download_checksums": {"http://www.db.info.gifu-u.ac.jp/data/data.csv.bz2": {"num_bytes": 406005, "checksum": "b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6"}}, "download_size": 406005, "post_processing_size": null, "dataset_size": 1662833, "size_in_bytes": 2068838}} \ No newline at end of file +{"default": {"description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.\n", "citation": "No paper about this dataset is published yet. Please cite this dataset as \"\u9234\u6728 \u512a: COVID-19 \u65e5\u672c\u8a9e Twitter \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 \uff08http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1\uff09\"\n", "homepage": "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1", "license": "CC-BY-ND 4.0", "features": {"tweet_id": {"dtype": "string", "id": null, "_type": "Value"}, "assessment_option_id": {"num_classes": 6, "names": ["63", "64", "65", "66", "67", "68"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "covid_tweets_japanese", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1662833, "num_examples": 53639, "dataset_name": "covid_tweets_japanese"}}, "download_checksums": {"http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2": {"num_bytes": 406005, "checksum": "b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6"}}, "download_size": 406005, "post_processing_size": null, "dataset_size": 1662833, "size_in_bytes": 2068838}} \ No newline at end of file diff --git a/datasets/covid_tweets_japanese/dummy/1.1.0/dummy_data.zip b/datasets/covid_tweets_japanese/dummy/1.1.0/dummy_data.zip index 6bb59082d66..1be6007b84e 100644 Binary files a/datasets/covid_tweets_japanese/dummy/1.1.0/dummy_data.zip and b/datasets/covid_tweets_japanese/dummy/1.1.0/dummy_data.zip differ