2 changes: 1 addition & 1 deletion datasets/bn_hate_speech/bn_hate_speech.py
@@ -45,7 +45,7 @@

 _LICENSE = "MIT License"

-_URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/Bengali_%20Hate_Speech_Dataset_Subset.csv"
+_URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv"


 # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
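
For orientation: _URL above is the only remote file the bn_hate_speech loading script fetches, and this commit changes nothing but that URL. The rest of the script is not part of the diff, so the sketch below is only the generic GeneratorBasedBuilder pattern that consumes such a constant; the feature layout mirrors the dataset_infos.json entry further down, while the class name and the CSV parsing details are assumptions rather than the verbatim bn_hate_speech.py.

    # Hedged sketch of the loader pattern around _URL (not the actual script body).
    import csv

    import datasets

    _URL = "https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv"


    class BnHateSpeech(datasets.GeneratorBasedBuilder):
        def _info(self):
            return datasets.DatasetInfo(
                features=datasets.Features(
                    {
                        "text": datasets.Value("string"),
                        # Class names copied from dataset_infos.json below.
                        "label": datasets.ClassLabel(
                            names=["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"]
                        ),
                    }
                ),
            )

        def _split_generators(self, dl_manager):
            # Only the URL string changed in this PR; the cached file, its size,
            # and its checksum (see dataset_infos.json) are identical.
            csv_path = dl_manager.download_and_extract(_URL)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"filepath": csv_path},
                )
            ]

        def _generate_examples(self, filepath):
            # Assumed layout: one text column followed by one label column;
            # the real script may parse the CSV differently.
            with open(filepath, encoding="utf-8") as f:
                for idx, row in enumerate(csv.reader(f)):
                    yield idx, {"text": row[0], "label": row[1]}
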
2 changes: 1 addition & 1 deletion datasets/bn_hate_speech/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and \nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset. \n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "citation": "@misc{karim2020classification,\n title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network}, \n author={Md. Rezaul Karim and Bharathi Raja Chakravarthi and John P. McCrae and Michael Cochez},\n year={2020},\n eprint={2004.07807},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/rezacsedu/Bengali-Hate-Speech-Dataset", "license": "MIT License", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 5, "names": ["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "bn_hate_speech", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 972635, "num_examples": 3418, "dataset_name": "bn_hate_speech"}}, "download_checksums": {"https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/Bengali_%20Hate_Speech_Dataset_Subset.csv": {"num_bytes": 974312, "checksum": "8ad2aab71c0a9ee61119767c3b628d18e9f2556d5cf52c6c80e677fb928c5420"}}, "download_size": 974312, "post_processing_size": null, "dataset_size": 972635, "size_in_bytes": 1946947}}
+{"default": {"description": "The Bengali Hate Speech Dataset is a collection of Bengali articles collected from Bengali news articles,\nnews dump of Bengali TV channels, books, blogs, and social media. Emphasis was placed on Facebook pages and\nnewspaper sources because they attract close to 50 million followers and is a common source of opinions\nand hate speech. The raw text corpus contains 250 million articles and the full dataset is being prepared\nfor release. This is a subset of the full dataset.\n\nThis dataset was prepared for hate-speech text classification benchmark on Bengali, an under-resourced language.\n", "citation": "@misc{karim2020classification,\n title={Classification Benchmarks for Under-resourced Bengali Language based on Multichannel Convolutional-LSTM Network},\n author={Md. Rezaul Karim and Bharathi Raja Chakravarthi and John P. McCrae and Michael Cochez},\n year={2020},\n eprint={2004.07807},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/rezacsedu/Bengali-Hate-Speech-Dataset", "license": "MIT License", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 5, "names": ["Personal", "Political", "Religious", "Geopolitical", "Gender abusive"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "bn_hate_speech", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 972635, "num_examples": 3418, "dataset_name": "bn_hate_speech"}}, "download_checksums": {"https://raw.githubusercontent.com/rezacsedu/Bengali-Hate-Speech-Dataset/main/bengali_%20hate_v1.0.csv": {"num_bytes": 974312, "checksum": "8ad2aab71c0a9ee61119767c3b628d18e9f2556d5cf52c6c80e677fb928c5420"}}, "download_size": 974312, "post_processing_size": null, "dataset_size": 972635, "size_in_bytes": 1946947}}
Binary file modified datasets/bn_hate_speech/dummy/0.0.0/dummy_data.zip
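
Since only the download URL moved (the recorded size and checksum above are unchanged), the refreshed metadata can be cross-checked against what the loader actually yields. A minimal sketch, assuming the script resolves under the name bn_hate_speech:

    # Cross-check the loader output against the metadata above.
    from datasets import load_dataset

    ds = load_dataset("bn_hate_speech", split="train")

    print(len(ds))                     # 3418 examples per the "train" split entry
    print(ds.features["label"].names)  # ['Personal', 'Political', 'Religious',
                                       #  'Geopolitical', 'Gender abusive']
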
6 changes: 3 additions & 3 deletions datasets/covid_tweets_japanese/covid_tweets_japanese.py
@@ -23,7 +23,7 @@

 _CITATION = """\
 No paper about this dataset is published yet. \
-Please cite this dataset as "鈴木 優: COVID-19 日本語 Twitter データセット (http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1)"
+Please cite this dataset as "鈴木 優: COVID-19 日本語 Twitter データセット (http://www.db.info.gifu-u.ac.jp/covid-19-twitter-dataset/)"
 """

 _DESCRIPTION = """\
@@ -32,14 +32,14 @@
 The original tweets are not contained. Please use Twitter API to get them, for example.
 """

-_HOMEPAGE = "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1"
+_HOMEPAGE = "http://www.db.info.gifu-u.ac.jp/covid-19-twitter-dataset/"

 _LICENSE = "CC-BY-ND 4.0"

 # The HuggingFace dataset library don't host the datasets but only point to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
 _URLs = {
-    "url": "http://www.db.info.gifu-u.ac.jp/data/data.csv.bz2",
+    "url": "http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2",
 }
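
Here the homepage and citation links now point to the dataset's new landing page, and the data file moved as well; only locations changed, since the byte count and checksum recorded in dataset_infos.json below are untouched. A small stdlib sketch for verifying the relocated archive, assuming the 64-hex-character checksum in that file is a SHA-256 digest of the raw download:

    # Verify the relocated covid19.csv.bz2 against the recorded metadata.
    import bz2
    import hashlib
    import urllib.request

    URL = "http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2"

    with urllib.request.urlopen(URL) as resp:
        raw = resp.read()

    print(len(raw))                         # expected 406005 bytes
    print(hashlib.sha256(raw).hexdigest())  # expected b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6
    # Raw line count of the decompressed CSV (any header line is not subtracted);
    # compare with num_examples = 53639 in the split metadata.
    print(len(bz2.decompress(raw).splitlines()))
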
2 changes: 1 addition & 1 deletion datasets/covid_tweets_japanese/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.\n", "citation": "No paper about this dataset is published yet. Please cite this dataset as \"\u9234\u6728 \u512a: COVID-19 \u65e5\u672c\u8a9e Twitter \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 \uff08http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1\uff09\"\n", "homepage": "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1", "license": "CC-BY-ND 4.0", "features": {"tweet_id": {"dtype": "string", "id": null, "_type": "Value"}, "assessment_option_id": {"num_classes": 6, "names": ["63", "64", "65", "66", "67", "68"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "covid_tweets_japanese", "config_name": "default", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1662833, "num_examples": 53639, "dataset_name": "covid_tweets_japanese"}}, "download_checksums": {"http://www.db.info.gifu-u.ac.jp/data/data.csv.bz2": {"num_bytes": 406005, "checksum": "b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6"}}, "download_size": 406005, "post_processing_size": null, "dataset_size": 1662833, "size_in_bytes": 2068838}}
+{"default": {"description": "53,640 Japanese tweets with annotation if a tweet is related to COVID-19 or not. The annotation is by majority decision by 5 - 10 crowd workers. Target tweets include \"COVID\" or \"\u30b3\u30ed\u30ca\". The period of the tweets is from around January 2020 to around June 2020. The original tweets are not contained. Please use Twitter API to get them, for example.\n", "citation": "No paper about this dataset is published yet. Please cite this dataset as \"\u9234\u6728 \u512a: COVID-19 \u65e5\u672c\u8a9e Twitter \u30c7\u30fc\u30bf\u30bb\u30c3\u30c8 \uff08http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1\uff09\"\n", "homepage": "http://www.db.info.gifu-u.ac.jp/data/Data_5f02db873363f976fce930d1", "license": "CC-BY-ND 4.0", "features": {"tweet_id": {"dtype": "string", "id": null, "_type": "Value"}, "assessment_option_id": {"num_classes": 6, "names": ["63", "64", "65", "66", "67", "68"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "covid_tweets_japanese", "config_name": "default", "version": {"version_str": "1.1.1", "description": null, "major": 1, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1662833, "num_examples": 53639, "dataset_name": "covid_tweets_japanese"}}, "download_checksums": {"http://www.db.info.gifu-u.ac.jp/data/covid19.csv.bz2": {"num_bytes": 406005, "checksum": "b1023e49df7717db7eedf3b318511b6163ec2651cbf78a8d72f7e1e0bc3fd4c6"}}, "download_size": 406005, "post_processing_size": null, "dataset_size": 1662833, "size_in_bytes": 2068838}}
Binary file modified datasets/covid_tweets_japanese/dummy/1.1.0/dummy_data.zip
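
As with the Bengali dataset, the refreshed dataset_infos.json above can be cross-checked against the loaded dataset; a minimal sketch, assuming the source file is reachable and the script resolves as covid_tweets_japanese:

    # Inspect the ClassLabel mapping recorded for assessment_option_id.
    from datasets import load_dataset

    ds = load_dataset("covid_tweets_japanese", split="train")

    print(len(ds))  # 53639 examples per the "train" split entry
    label_feature = ds.features["assessment_option_id"]
    print(label_feature.names)  # ['63', '64', '65', '66', '67', '68']
    # Map the first example's encoded class id back to its original option id.
    print(label_feature.int2str(ds[0]["assessment_option_id"]))
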