diff --git a/datasets/ttc4900/README.md b/datasets/ttc4900/README.md index 023e4aa211a..fdf1cc52cb3 100644 --- a/datasets/ttc4900/README.md +++ b/datasets/ttc4900/README.md @@ -18,11 +18,13 @@ task_categories: task_ids: - text-classification-other-news-category-classification paperswithcode_id: null +pretty_name: TTC4900 - A Benchmark Data for Turkish Text Categorization --- # Dataset Card for TTC4900: A Benchmark Data for Turkish Text Categorization ## Table of Contents +- [Table of Contents](#table-of-contents) - [Dataset Description](#dataset-description) - [Dataset Summary](#dataset-summary) - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards) @@ -48,15 +50,26 @@ paperswithcode_id: null ## Dataset Description -- **Homepage:** [https://www.kaggle.com/savasy/ttc4900](https://www.kaggle.com/savasy/ttc4900) -- **Point of Contact:** [ Avatar -Savaş Yıldırım](mailto:savasy@gmail.com) +- **Homepage:** [TTC4900 Homepage](https://www.kaggle.com/savasy/ttc4900) +- **Repository:** [TTC4900 Repository](https://github.com/savasy/TurkishTextClassification) +- **Paper:** [A Comparison of Different Approaches to Document Representation in Turkish Language](https://dergipark.org.tr/en/pub/sdufenbed/issue/38975/456349) +- **Point of Contact:** [Savaş Yıldırım](mailto:savasy@gmail.com) ### Dataset Summary The data set is taken from [kemik group](http://www.kemik.yildiz.edu.tr/) +The data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth. +We named TTC4900 by mimicking the name convention of TTC 3600 dataset shared by the study ["A Knowledge-poor Approach to Turkish Text Categorization with a Comparative Analysis, Proceedings of CICLING 2014, Springer LNCS, Nepal, 2014"](https://link.springer.com/chapter/10.1007/978-3-642-54903-8_36) -The data are pre-processed (noun phrase chunking etc.) 
for the text categorization problem by the study ["A Knowledge-poor Approach to Turkish Text Categorization with a Comparative Analysis, Proceedings of CICLING 2014, Springer LNCS, Nepal, 2014"](https://link.springer.com/chapter/10.1007/978-3-642-54903-8_36) +If you use the dataset in a paper, please refer to https://www.kaggle.com/savasy/ttc4900 as a footnote and cite one of the papers as follows: + +- A Comparison of Different Approaches to Document Representation in Turkish Language, SDU Journal of Natural and Applied Science, Vol 22, Issue 2, 2018 +- A comparative analysis of text classification for Turkish language, Pamukkale University Journal of Engineering Science Volume 24, Issue 5, 2018 +- A Knowledge-poor Approach to Turkish Text Categorization with a Comparative Analysis, Proceedings of CICLING 2014, Springer LNCS, Nepal, 2014. + +### Supported Tasks and Leaderboards + +[More Information Needed] ### Languages @@ -77,7 +90,6 @@ Here is an example from the dataset: } ``` - ### Data Fields - **category** : Indicates to which category the news text belongs. @@ -96,21 +108,16 @@ It is not divided into Train set and Test set. ### Source Data -[More Information Needed] - #### Initial Data Collection and Normalization The data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth. - #### Who are the source language producers? Turkish online news sites. ### Annotations -The dataset does not contain any additional annotations. - #### Annotation process [More Information Needed] @@ -125,7 +132,11 @@ The dataset does not contain any additional annotations. ## Considerations for Using the Data -### Discussion of Social Impact and Biases +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases [More Information Needed] @@ -137,7 +148,7 @@ The dataset does not contain any additional annotations. 
### Dataset Curators -[More Information Needed] +The dataset was created by [Savaş Yıldırım](https://github.com/savasy) ### Licensing Information @@ -145,8 +156,23 @@ The dataset does not contain any additional annotations. ### Citation Information -[More Information Needed] +``` +@article{doi:10.5505/pajes.2018.15931, + author = {Yıldırım, Savaş and Yıldız, Tuğba}, + title = {A comparative analysis of text classification for Turkish language}, + journal = {Pamukkale Univ Muh Bilim Derg}, + volume = {24}, + number = {5}, + pages = {879-886}, + year = {2018}, + doi = {10.5505/pajes.2018.15931}, + note ={doi: 10.5505/pajes.2018.15931}, + + URL = {https://dx.doi.org/10.5505/pajes.2018.15931}, + eprint = {https://dx.doi.org/10.5505/pajes.2018.15931} +} +``` ### Contributions -Thanks to [@yavuzKomecoglu](https://github.com/yavuzKomecoglu) for adding this dataset. \ No newline at end of file +Thanks to [@yavuzKomecoglu](https://github.com/yavuzKomecoglu) for adding this dataset. diff --git a/datasets/ttc4900/dataset_infos.json b/datasets/ttc4900/dataset_infos.json index 258bbc93f70..263f6f370e5 100644 --- a/datasets/ttc4900/dataset_infos.json +++ b/datasets/ttc4900/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The data set is taken from kemik group\nhttp://www.kemik.yildiz.edu.tr/\nThe data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth.\nWe named TTC4900 by mimicking the name convention of TTC 3600 dataset shared by the study http://journals.sagepub.com/doi/abs/10.1177/0165551515620551\n", "citation": "", "homepage": "https://www.kaggle.com/savasy/ttc4900", "license": "CC0: Public Domain", "features": {"category": {"num_classes": 7, "names": ["siyaset", "dunya", "ekonomi", "kultur", "saglik", "spor", "teknoloji"], "names_file": null, "id": null, "_type": "ClassLabel"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": 
"tt_c4900", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10640831, "num_examples": 4900, "dataset_name": "tt_c4900"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 10640831, "size_in_bytes": 10640831}, "ttc4900": {"description": "The data set is taken from kemik group\nhttp://www.kemik.yildiz.edu.tr/\nThe data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth.\nWe named TTC4900 by mimicking the name convention of TTC 3600 dataset shared by the study http://journals.sagepub.com/doi/abs/10.1177/0165551515620551\n", "citation": "", "homepage": "https://www.kaggle.com/savasy/ttc4900", "license": "CC0: Public Domain", "features": {"category": {"num_classes": 7, "names": ["siyaset", "dunya", "ekonomi", "kultur", "saglik", "spor", "teknoloji"], "names_file": null, "id": null, "_type": "ClassLabel"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "tt_c4900", "config_name": "ttc4900", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10640831, "num_examples": 4900, "dataset_name": "tt_c4900"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 10640831, "size_in_bytes": 10640831}} \ No newline at end of file +{"ttc4900": {"description": "The data set is taken from kemik group\nhttp://www.kemik.yildiz.edu.tr/\nThe data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth.\nWe named TTC4900 by mimicking the name convention of TTC 3600 dataset shared by the study http://journals.sagepub.com/doi/abs/10.1177/0165551515620551\n\nIf you use the dataset in a paper, please refer 
https://www.kaggle.com/savasy/ttc4900 as footnote and cite one of the papers as follows:\n\n- A Comparison of Different Approaches to Document Representation in Turkish Language, SDU Journal of Natural and Applied Science, Vol 22, Issue 2, 2018\n- A comparative analysis of text classification for Turkish language, Pamukkale University Journal of Engineering Science Volume 25 Issue 5, 2018\n- A Knowledge-poor Approach to Turkish Text Categorization with a Comparative Analysis, Proceedings of CICLING 2014, Springer LNCS, Nepal, 2014.\n", "citation": "@article{doi:10.5505/pajes.2018.15931,\nauthor = {Y\u0131ld\u0131r\u0131m, Sava\u015f and Y\u0131ld\u0131z, Tu\u011fba},\ntitle = {A comparative analysis of text classification for Turkish language},\njournal = {Pamukkale Univ Muh Bilim Derg},\nvolume = {24},\nnumber = {5},\npages = {879-886},\nyear = {2018},\ndoi = {10.5505/pajes.2018.15931},\nnote ={doi: 10.5505/pajes.2018.15931},\n\nURL = {https://dx.doi.org/10.5505/pajes.2018.15931},\neprint = {https://dx.doi.org/10.5505/pajes.2018.15931}\n}\n", "homepage": "https://www.kaggle.com/savasy/ttc4900", "license": "CC0: Public Domain", "features": {"category": {"num_classes": 7, "names": ["siyaset", "dunya", "ekonomi", "kultur", "saglik", "spor", "teknoloji"], "names_file": null, "id": null, "_type": "ClassLabel"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "text", "label_column": "category", "labels": ["dunya", "ekonomi", "kultur", "saglik", "siyaset", "spor", "teknoloji"]}], "builder_name": "ttc4900", "config_name": "ttc4900", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 10640831, "num_examples": 4900, "dataset_name": "ttc4900"}}, "download_checksums": {"https://raw.githubusercontent.com/savasy/TurkishTextClassification/master/7allV03.csv": 
{"num_bytes": 10627541, "checksum": "e17b79e89a3679ed77b3d5fd6d855fca43e9986a714cd4927c646c2be692c23e"}}, "download_size": 10627541, "post_processing_size": null, "dataset_size": 10640831, "size_in_bytes": 21268372}} \ No newline at end of file diff --git a/datasets/ttc4900/ttc4900.py b/datasets/ttc4900/ttc4900.py index 4f56b991882..d5a5762c199 100644 --- a/datasets/ttc4900/ttc4900.py +++ b/datasets/ttc4900/ttc4900.py @@ -17,9 +17,9 @@ import csv -import os import datasets +from datasets.tasks import TextClassification logger = datasets.logging.get_logger(__name__) @@ -30,11 +30,34 @@ http://www.kemik.yildiz.edu.tr/ The data are pre-processed for the text categorization, collocations are found, character set is corrected, and so forth. We named TTC4900 by mimicking the name convention of TTC 3600 dataset shared by the study http://journals.sagepub.com/doi/abs/10.1177/0165551515620551 + +If you use the dataset in a paper, please refer https://www.kaggle.com/savasy/ttc4900 as footnote and cite one of the papers as follows: + +- A Comparison of Different Approaches to Document Representation in Turkish Language, SDU Journal of Natural and Applied Science, Vol 22, Issue 2, 2018 +- A comparative analysis of text classification for Turkish language, Pamukkale University Journal of Engineering Science Volume 25 Issue 5, 2018 +- A Knowledge-poor Approach to Turkish Text Categorization with a Comparative Analysis, Proceedings of CICLING 2014, Springer LNCS, Nepal, 2014. 
+""" + +_CITATION = """\ +@article{doi:10.5505/pajes.2018.15931, +author = {Yıldırım, Savaş and Yıldız, Tuğba}, +title = {A comparative analysis of text classification for Turkish language}, +journal = {Pamukkale Univ Muh Bilim Derg}, +volume = {24}, +number = {5}, +pages = {879-886}, +year = {2018}, +doi = {10.5505/pajes.2018.15931}, +note ={doi: 10.5505/pajes.2018.15931}, + +URL = {https://dx.doi.org/10.5505/pajes.2018.15931}, +eprint = {https://dx.doi.org/10.5505/pajes.2018.15931} +} """ -_CITATION = "" _LICENSE = "CC0: Public Domain" _HOMEPAGE = "https://www.kaggle.com/savasy/ttc4900" +_DOWNLOAD_URL = "https://raw.githubusercontent.com/savasy/TurkishTextClassification/master" _FILENAME = "7allV03.csv" @@ -60,18 +83,6 @@ class TTC4900(datasets.GeneratorBasedBuilder): ), ] - @property - def manual_download_instructions(self): - return """\ - You need to go to https://www.kaggle.com/savasy/ttc4900, - and manually download the ttc4900. Once it is completed, - a file named archive.zip will be appeared in your Downloads folder - or whichever folder your browser chooses to save files to. You then have - to unzip the file and move 7allV03.csv under . - The can e.g. be "~/manual_data". - ttc4900 can then be loaded using the following command `datasets.load_dataset("ttc4900", data_dir="")`. - """ - def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, @@ -90,21 +101,18 @@ def _info(self): license=_LICENSE, # Citation for the dataset citation=_CITATION, + task_templates=[TextClassification(text_column="text", label_column="category")], ) def _split_generators(self, dl_manager): """Returns SplitGenerators.""" - path_to_manual_file = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) - if not os.path.exists(path_to_manual_file): - raise FileNotFoundError( - "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('ttc4900', data_dir=...)` that includes a file name {}. 
Manual download instructions: {})".format( - path_to_manual_file, _FILENAME, self.manual_download_instructions - ) - ) + + urls_to_download = { + "train": _DOWNLOAD_URL + "/" + _FILENAME, + } + downloaded_files = dl_manager.download(urls_to_download) return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(path_to_manual_file, _FILENAME)} - ) + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}), ] def _generate_examples(self, filepath):