2 changes: 1 addition & 1 deletion datasets/bn_hate_speech/bn_hate_speech.py
@@ -45,7 +45,7 @@

 _LICENSE = "MIT License"

-_URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/Bengali_%20Hate_Speech_Dataset_Subset.csv"
+_URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv"


 # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
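
For orientation: _URL above is the only remote file the bn_hate_speech loading script fetches, and this commit changes nothing but that URL. The rest of the script is not part of the diff, so the sketch below is only the generic GeneratorBasedBuilder pattern that consumes such a constant; the feature layout mirrors the dataset_infos.json entry further down, while the class name and the CSV parsing details are assumptions rather than the verbatim bn_hate_speech.py.

    # Hedged sketch of the loader pattern around _URL (not the actual script body).
    import csv

    import datasets

    _URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv"


    class BnHateSpeech(datasets.GeneratorBasedBuilder):
        def _info(self):
            return datasets.DatasetInfo(
                features=datasets.Features(
                    {
                        "text": datasets.Value("string"),
                        # Class names copied from dataset_infos.json below.
                        "label": datasets.ClassLabel(
                            names=["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"]
                        ),
                    }
                ),
            )

        def _split_generators(self, dl_manager):
            # Only the URL string changed in this PR; the cached file, its size,
            # and its checksum (see dataset_infos.json) are identical.
            csv_path = dl_manager.download_and_extract(_URL)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"filepath": csv_path},
                )
            ]

        def _generate_examples(self, filepath):
            # Assumed layout: one text column followed by one label column;
            # the real script may parse the CSV differently.
            with open(filepath, encoding="utf-8") as f:
                for idx, row in enumerate(csv.reader(f)):
                    yield idx, {"text": row[0], "label": row[1]}
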
2 changes: 1 addition & 1 deletion datasets/bn_hate_speech/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and \nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset. \n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "citation": "@misc{karim2020classification,\n title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network}, \n author={Md. Rezaul Karim and Bharathi Raja Chakravarthi and John P. McCrae and Michael Cochez},\n year={2020},\n eprint={2004.07807},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/rezacsedu/Bengali-Hate-Speech-Dataset", "license": "MIT License", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 5, "names": ["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "bn_hate_speech", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 972635, "num_examples": 3418, "dataset_name": "bn_hate_speech"}}, "download_checksums": {"https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/Bengali_%20Hate_Speech_Dataset_Subset.csv": {"num_bytes": 974312, "checksum": "8ad2aab71c0a9ee61119767c3b628d18e9f2556d5cf52c6c80e677fb928c5420"}}, "download_size": 974312, "post_processing_size": null, "dataset_size": 972635, "size_in_bytes": 1946947}}
+{"default": {"description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "citation": "@misc{karim2020classification,\n title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network},\n author={Md. Rezaul Karim and Bharathi Raja Chakravarthi and John P. McCrae and Michael Cochez},\n year={2020},\n eprint={2004.07807},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/rezacsedu/Bengali-Hate-Speech-Dataset", "license": "MIT License", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 5, "names": ["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "bn_hate_speech", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 972635, "num_examples": 3418, "dataset_name": "bn_hate_speech"}}, "download_checksums": {"https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv": {"num_bytes": 974312, "checksum": "8ad2aab71c0a9ee61119767c3b628d18e9f2556d5cf52c6c80e677fb928c5420"}}, "download_size": 974312, "post_processing_size": null, "dataset_size": 972635, "size_in_bytes": 1946947}}
Binary file modified datasets/bn_hate_speech/dummy/0.0.0/dummy_data.zip
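
Since only the download URL moved (the recorded size and checksum above are unchanged), the refreshed metadata can be cross-checked against what the loader actually yields. A minimal sketch, assuming the script resolves under the name bn_hate_speech:

    # Cross-check the loader output against the metadata above.
    from datasets import load_dataset

    ds = load_dataset("bn_hate_speech", split="train")

    print(len(ds))                     # 3418 examples per the "train" split entry
    print(ds.features["label"].names)  # ['Personal', 'Political', 'Religious',
                                       #  'Geopolitical', 'Gender abusive']
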
6 changes: 3 additions & 3 deletions datasets/covid_tweets_japanese/covid_tweets_japanese.py
@@ -23,7 +23,7 @@

 _CITATION = """\
 No paper about this dataset is published yet. \
-Please cite this dataset as "鈴木 優: COVID-19 日本語 Twitter データセット (http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1)"
+Please cite this dataset as "鈴木 優: COVID-19 日本語 Twitter データセット (http://www.db.info.gifu-u.ac.jp/covid-19-twitter-dataset/)"
 """

 _DESCRIPTION = """\
@@ -32,14 +32,14 @@
 The original tweets are not contained. Please use Twitter API to get them, for example.
 """

-_HOMEPAGE = "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1"
+_HOMEPAGE = "http://www.db.info.gifu-u.ac.jp/covid-19-twitter-dataset/"

 _LICENSE = "CC-BY-ND 4.0"

 # The HuggingFace dataset library don't host the datasets but only point to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
 _URLs = {
-    "url": "http://www.db.info.gifu-u.ac.jp/data/data.csv.bz2",
+    "url": "http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2",
 }
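
Here the homepage and citation links now point to the dataset's new landing page, and the data file moved as well; only locations changed, since the byte count and checksum recorded in dataset_infos.json below are untouched. A small stdlib sketch for verifying the relocated archive, assuming the 64-hex-character checksum in that file is a SHA-256 digest of the raw download:

    # Verify the relocated covid19.csv.bz2 against the recorded metadata.
    import bz2
    import hashlib
    import urllib.request

    URL = "http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2"

    with urllib.request.urlopen(URL) as resp:
        raw = resp.read()

    print(len(raw))                         # expected 406005 bytes
    print(hashlib.sha256(raw).hexdigest())  # expected b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6
    # Raw line count of the decompressed CSV (any header line is not subtracted);
    # compare with num_examples = 53639 in the split metadata.
    print(len(bz2.decompress(raw).splitlines()))
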
2 changes: 1 addition & 1 deletion datasets/covid_tweets_japanese/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.\n", "citation": "No paper about this dataset is published yet. Please cite this dataset as \"\u9234\u6728 \u512a: COVID-19 \u65e5\u672c\u8a9e Twitter \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 \uff08http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1\uff09\"\n", "homepage": "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1", "license": "CC-BY-ND 4.0", "features": {"tweet_id": {"dtype": "string", "id": null, "_type": "Value"}, "assessment_option_id": {"num_classes": 6, "names": ["63", "64", "65", "66", "67", "68"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "covid_tweets_japanese", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1662833, "num_examples": 53639, "dataset_name": "covid_tweets_japanese"}}, "download_checksums": {"http://www.db.info.gifu-u.ac.jp/data/data.csv.bz2": {"num_bytes": 406005, "checksum": "b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6"}}, "download_size": 406005, "post_processing_size": null, "dataset_size": 1662833, "size_in_bytes": 2068838}}
+{"default": {"description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.\n", "citation": "No paper about this dataset is published yet. Please cite this dataset as \"\u9234\u6728 \u512a: COVID-19 \u65e5\u672c\u8a9e Twitter \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 \uff08http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1\uff09\"\n", "homepage": "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1", "license": "CC-BY-ND 4.0", "features": {"tweet_id": {"dtype": "string", "id": null, "_type": "Value"}, "assessment_option_id": {"num_classes": 6, "names": ["63", "64", "65", "66", "67", "68"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "covid_tweets_japanese", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1662833, "num_examples": 53639, "dataset_name": "covid_tweets_japanese"}}, "download_checksums": {"http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2": {"num_bytes": 406005, "checksum": "b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6"}}, "download_size": 406005, "post_processing_size": null, "dataset_size": 1662833, "size_in_bytes": 2068838}}
Binary file modified datasets/covid_tweets_japanese/dummy/1.1.0/dummy_data.zip
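
As with the Bengali dataset, the refreshed dataset_infos.json above can be cross-checked against the loaded dataset; a minimal sketch, assuming the source file is reachable and the script resolves as covid_tweets_japanese:

    # Inspect the ClassLabel mapping recorded for assessment_option_id.
    from datasets import load_dataset

    ds = load_dataset("covid_tweets_japanese", split="train")

    print(len(ds))  # 53639 examples per the "train" split entry
    label_feature = ds.features["assessment_option_id"]
    print(label_feature.names)  # ['63', '64', '65', '66', '67', '68']
    # Map the first example's encoded class id back to its original option id.
    print(label_feature.int2str(ds[0]["assessment_option_id"]))
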