diff --git a/datasets/code_x_glue_cc_clone_detection_big_clone_bench/README.md b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/README.md new file mode 100644 index 00000000000..a096b8a38f4 --- /dev/null +++ b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/README.md @@ -0,0 +1,185 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- monolingual +size_categories: +- 1M List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_cc_clone_detection_big_clone_bench/common.py b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. 
+ return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_clone_detection_big_clone_bench/dataset_infos.json b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/dataset_infos.json new file mode 100644 index 00000000000..38c7fa87b2f --- /dev/null +++ b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "CodeXGLUE Clone-detection-BigCloneBench dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-BigCloneBench\n\nGiven two codes as the input, the task is to do binary 
classification (0/1), where 1 stands for semantic equivalence and 0 for others. Models are evaluated by F1 score.\nThe dataset we use is BigCloneBench and filtered following the paper Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree.", "citation": "@inproceedings{svajlenko2014towards,\ntitle={Towards a big data curated benchmark of inter-project code clones},\nauthor={Svajlenko, Jeffrey and Islam, Judith F and Keivanloo, Iman and Roy, Chanchal K and Mia, Mohammad Mamun},\nbooktitle={2014 IEEE International Conference on Software Maintenance and Evolution},\npages={476--480},\nyear={2014},\norganization={IEEE}\n}\n\n@inproceedings{wang2020detecting,\ntitle={Detecting Code Clones with Graph Neural Network and Flow-Augmented Abstract Syntax Tree},\nauthor={Wang, Wenhan and Li, Ge and Ma, Bo and Xia, Xin and Jin, Zhi},\nbooktitle={2020 IEEE 27th International Conference on Software Analysis, Evolution and Reengineering (SANER)},\npages={261--271},\nyear={2020},\norganization={IEEE}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/Clone-detection-BigCloneBench", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "id1": {"dtype": "int32", "id": null, "_type": "Value"}, "id2": {"dtype": "int32", "id": null, "_type": "Value"}, "func1": {"dtype": "string", "id": null, "_type": "Value"}, "func2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "label", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_clone_detection_big_clone_bench", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2888035757, "num_examples": 901028, "dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench"}, "validation": {"name": 
"validation", "num_bytes": 1371399694, "num_examples": 415416, "dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench"}, "test": {"name": "test", "num_bytes": 1220662901, "num_examples": 415416, "dataset_name": "code_x_glue_cc_clone_detection_big_clone_bench"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-BigCloneBench/dataset/train.txt": {"num_bytes": 17043552, "checksum": "29119bfa94673374249c3424809fbe6baaa1f0e87a13e3c727bbd6cdf1224b77"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-BigCloneBench/dataset/data.jsonl": {"num_bytes": 15174797, "checksum": "d8bc51e62deddcc45bd26c5b57f5add2a2cf377f13b9f6c2fb656fbc8fca4dd2"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-BigCloneBench/dataset/valid.txt": {"num_bytes": 7861019, "checksum": "e59e8c1321df59b6ab0143165cb603030c55800c00e2d782e06810517b8de1e4"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-BigCloneBench/dataset/test.txt": {"num_bytes": 7876506, "checksum": "a6c0cf79be34e582fdc64007aa894ed094e4f9ff2e5395a8d2b5c39eeef2737a"}}, "download_size": 47955874, "post_processing_size": null, "dataset_size": 5480098352, "size_in_bytes": 5528054226}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_clone_detection_big_clone_bench/dummy/default/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/dummy/default/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..c7d09c0a394 Binary files /dev/null and b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/dummy/default/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_clone_detection_big_clone_bench/generated_definitions.py b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/generated_definitions.py new file mode 100644 index 00000000000..47146310cc9 --- /dev/null +++ 
b/datasets/code_x_glue_cc_clone_detection_big_clone_bench/generated_definitions.py @@ -0,0 +1,12 @@ +DEFINITIONS = { + "default": { + "class_name": "CodeXGlueCcCloneDetectionBigCloneBench", + "dataset_type": "Code-Code", + "description": "CodeXGLUE Clone-detection-BigCloneBench dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-BigCloneBench", + "dir_name": "Clone-detection-BigCloneBench", + "name": "default", + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/Clone-detection-BigCloneBench", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-BigCloneBench/dataset", + "sizes": {"test": 415416, "train": 901028, "validation": 415416}, + } +} diff --git a/datasets/code_x_glue_cc_clone_detection_poj104/README.md b/datasets/code_x_glue_cc_clone_detection_poj104/README.md new file mode 100644 index 00000000000..05ea632ef48 --- /dev/null +++ b/datasets/code_x_glue_cc_clone_detection_poj104/README.md @@ -0,0 +1,165 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- monolingual +size_categories: +- 10K List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_cc_clone_detection_poj104/common.py b/datasets/code_x_glue_cc_clone_detection_poj104/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_cc_clone_detection_poj104/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + 
year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. + return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_clone_detection_poj104/dataset_infos.json b/datasets/code_x_glue_cc_clone_detection_poj104/dataset_infos.json new file mode 100644 index 00000000000..b0251200faf --- /dev/null +++ 
b/datasets/code_x_glue_cc_clone_detection_poj104/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "CodeXGLUE Clone-detection-POJ-104 dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-POJ-104\n\nGiven a code and a collection of candidates as the input, the task is to return Top K codes with the same semantic. Models are evaluated by MAP score.\nWe use POJ-104 dataset on this task.", "citation": "@inproceedings{mou2016convolutional,\ntitle={Convolutional neural networks over tree structures for programming language processing},\nauthor={Mou, Lili and Li, Ge and Zhang, Lu and Wang, Tao and Jin, Zhi},\nbooktitle={Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence},\npages={1287--1293},\nyear={2016}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/Clone-detection-POJ-104", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "label", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_clone_detection_poj104", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 18878686, "num_examples": 32000, "dataset_name": "code_x_glue_cc_clone_detection_poj104"}, "validation": {"name": "validation", "num_bytes": 5765303, "num_examples": 8000, "dataset_name": "code_x_glue_cc_clone_detection_poj104"}, "test": {"name": "test", "num_bytes": 6852864, "num_examples": 12000, "dataset_name": "code_x_glue_cc_clone_detection_poj104"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-POJ-104/dataset/programs.tar.gz": {"num_bytes": 8658581, "checksum": 
"c0b8ef3ee9c9159c882dc9337cb46da0e612a28e24852a83f8a1cd68c838f390"}}, "download_size": 8658581, "post_processing_size": null, "dataset_size": 31496853, "size_in_bytes": 40155434}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_clone_detection_poj104/dummy/default/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_clone_detection_poj104/dummy/default/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..4eb8b20e120 Binary files /dev/null and b/datasets/code_x_glue_cc_clone_detection_poj104/dummy/default/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_clone_detection_poj104/generated_definitions.py b/datasets/code_x_glue_cc_clone_detection_poj104/generated_definitions.py new file mode 100644 index 00000000000..e57bd0f430c --- /dev/null +++ b/datasets/code_x_glue_cc_clone_detection_poj104/generated_definitions.py @@ -0,0 +1,12 @@ +DEFINITIONS = { + "default": { + "class_name": "CodeXGlueCcCloneDetectionPoj104", + "dataset_type": "Code-Code", + "description": "CodeXGLUE Clone-detection-POJ-104 dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Clone-detection-POJ-104", + "dir_name": "Clone-detection-POJ-104", + "name": "default", + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/Clone-detection-POJ-104", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Clone-detection-POJ-104/dataset", + "sizes": {"test": 12000, "train": 32000, "validation": 8000}, + } +} diff --git a/datasets/code_x_glue_cc_cloze_testing_all/README.md b/datasets/code_x_glue_cc_cloze_testing_all/README.md new file mode 100644 index 00000000000..6a802bb6b81 --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_all/README.md @@ -0,0 +1,262 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- monolingual +size_categories: + go: + - 10K", "{", "}", "\n", 
"easyjsonC5a4559bEncodeGithubComChromedpCdprotoWebaudio7", "(", "&", "w", ",", "v", ")", "\n", "return", "w", ".", "Buffer", ".", "BuildBytes", "(", ")", ",", "w", ".", "Error", "\n", "}"] +} +``` + +#### java + +An example of 'train' looks as follows. +``` +{ + "id": 0, + "idx": "all-1", + "nl_tokens": ["/", "*", "(", "non", "-", "Javadoc", ")"], + "pl_tokens": ["@", "Override", "public", "int", "peekBit", "(", ")", "throws", "AACException", "{", "int", "ret", ";", "if", "(", "bitsCached", ">", "0", ")", "{", "ret", "=", "(", "cache", ">>", "(", "bitsCached", "-", "1", ")", ")", "&", "1", ";", "}", "else", "{", "final", "int", "word", "=", "readCache", "(", "true", ")", ";", "ret", "=", "(", "", ">>", "WORD_BITS", "-", "1", ")", "&", "1", ";", "}", "return", "ret", ";", "}"] +} +``` + +#### javascript + +An example of 'train' looks as follows. +``` +{ + "id": 0, + "idx": "all-1", + "nl_tokens": ["Cast", "query", "params", "according", "to", "type"], + "pl_tokens": ["function", "castQueryParams", "(", "relId", ",", "data", ",", "{", "relationships", "}", ")", "{", "const", "relationship", "=", "relationships", "[", "relId", "]", "if", "(", "!", "relationship", ".", "query", ")", "{", "return", "{", "}", "}", "return", "Object", ".", "keys", "(", "relationship", ".", "query", ")", ".", "reduce", "(", "(", "params", ",", "", ")", "=>", "{", "const", "value", "=", "getField", "(", "data", ",", "relationship", ".", "query", "[", "key", "]", ")", "if", "(", "value", "===", "undefined", ")", "{", "throw", "new", "TypeError", "(", "'Missing value for query param'", ")", "}", "return", "{", "...", "params", ",", "[", "key", "]", ":", "value", "}", "}", ",", "{", "}", ")", "}"] +} +``` + +#### php + +An example of 'train' looks as follows. 
+``` +{ + "id": 0, + "idx": "all-1", + "nl_tokens": ["Get", "choices", "."], + "pl_tokens": ["protected", "", "getChoices", "(", "FormFieldTranslation", "$", "translation", ")", "{", "$", "choices", "=", "preg_split", "(", "'/\\r\\n|\\r|\\n/'", ",", "$", "translation", "->", "getOption", "(", "'choices'", ")", ",", "-", "1", ",", "PREG_SPLIT_NO_EMPTY", ")", ";", "return", "array_combine", "(", "$", "choices", ",", "$", "choices", ")", ";", "}"] +} +``` + +#### python + +An example of 'train' looks as follows. +``` +{ + "id": 0, + "idx": "all-1", + "nl_tokens": ["Post", "a", "review"], + "pl_tokens": ["def", "post_review", "(", "session", ",", "review", ")", ":", "# POST /api/projects/0.1/reviews/", "", "=", "make_post_request", "(", "session", ",", "'reviews'", ",", "json_data", "=", "review", ")", "json_data", "=", "response", ".", "json", "(", ")", "if", "response", ".", "status_code", "==", "200", ":", "return", "json_data", "[", "'status'", "]", "else", ":", "raise", "ReviewNotPostedException", "(", "message", "=", "json_data", "[", "'message'", "]", ",", "error_code", "=", "json_data", "[", "'error_code'", "]", ",", "request_id", "=", "json_data", "[", "'request_id'", "]", ")"] +} +``` + +#### ruby + +An example of 'train' looks as follows. 
+``` +{ + "id": 0, + "idx": "all-1", + "nl_tokens": ["By", "default", "taskers", "don", "t", "see", "the", "flor", "variables", "in", "the", "execution", ".", "If", "include_vars", "or", "exclude_vars", "is", "present", "in", "the", "configuration", "of", "the", "tasker", "some", "or", "all", "of", "the", "variables", "are", "passed", "."], + "pl_tokens": ["def", "gather_vars", "(", "executor", ",", "tconf", ",", "message", ")", "# try to return before a potentially costly call to executor.vars(nid)", "return", "nil", "if", "(", "tconf", ".", "keys", "&", "%w[", "include_vars", "exclude_vars", "]", ")", ".", "empty?", "# default behaviour, don't pass variables to taskers", "iv", "=", "expand_filter", "(", "tconf", "[", "'include_vars'", "]", ")", "return", "nil", "if", "iv", "==", "false", "ev", "=", "expand_filter", "(", "tconf", "[", "'exclude_vars'", "]", ")", "return", "{", "}", "if", "ev", "==", "true", "vars", "=", "executor", ".", "vars", "(", "message", "[", "'nid'", "]", ")", "return", "vars", "if", "iv", "==", "true", "vars", "=", "vars", ".", "select", "{", "|", "k", ",", "v", "|", "var_match", "(", "k", ",", "iv", ")", "}", "if", "", "vars", "=", "vars", ".", "reject", "{", "|", "k", ",", "v", "|", "var_match", "(", "k", ",", "ev", ")", "}", "if", "ev", "vars", "end"] +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. 
+ +#### go, java, javascript, php, python, ruby + +|field name| type | description | +|----------|----------------|------------------------------| +|id |int32 | Index of the sample | +|idx |string | Original index in the dataset| +|nl_tokens |Sequence[string]| Natural language tokens | +|pl_tokens |Sequence[string]| Programming language tokens | + +### Data Splits + +| name |train| +|----------|----:| +|go |25282| +|java |40492| +|javascript|13837| +|php |51930| +|python |40137| +|ruby | 4437| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +Data from CodeSearchNet Challenge dataset. +[More Information Needed] + +#### Who are the source language producers? + +Software Engineering developers. + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? + +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. 
+ +### Citation Information + +``` +@article{CodeXGLUE, + title={CodeXGLUE: An Open Challenge for Code Intelligence}, + journal={arXiv}, + year={2020}, +} +@article{feng2020codebert, + title={CodeBERT: A Pre-Trained Model for Programming and Natural Languages}, + author={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others}, + journal={arXiv preprint arXiv:2002.08155}, + year={2020} +} +@article{husain2019codesearchnet, + title={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search}, + author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal={arXiv preprint arXiv:1909.09436}, + year={2019} +} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. diff --git a/datasets/code_x_glue_cc_cloze_testing_all/code_x_glue_cc_cloze_testing_all.py b/datasets/code_x_glue_cc_cloze_testing_all/code_x_glue_cc_cloze_testing_all.py new file mode 100644 index 00000000000..98beacef59c --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_all/code_x_glue_cc_cloze_testing_all.py @@ -0,0 +1,83 @@ +import json +from typing import List + +import datasets + +from .common import Child +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """Cloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem. +Here we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word. 
+The only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.""" + +_CITATION = """@article{CodeXGLUE, +title={CodeXGLUE: An Open Challenge for Code Intelligence}, +journal={arXiv}, +year={2020}, +} +@article{feng2020codebert, +title={CodeBERT: A Pre-Trained Model for Programming and Natural Languages}, +author={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others}, +journal={arXiv preprint arXiv:2002.08155}, +year={2020} +} +@article{husain2019codesearchnet, +title={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search}, +author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, +journal={arXiv preprint arXiv:1909.09436}, +year={2019} +}""" + + +class CodeXGlueCcClozeTestingImpl(Child): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "idx": datasets.Value("string"), # Original index in the dataset + "nl_tokens": datasets.features.Sequence(datasets.Value("string")), # Natural language tokens + "pl_tokens": datasets.features.Sequence(datasets.Value("string")), # Programming language tokens + } + + def generate_urls(self, split_name): + yield "data", "clozeTest.json" + + def _generate_examples(self, split_name, file_paths): + with open(file_paths["data"], encoding="utf-8") as f: + j = json.load(f) + index = 0 + for entry in j: + yield index, dict( + id=index, idx=entry["idx"], nl_tokens=entry["nl_tokens"], pl_tokens=entry["pl_tokens"] + ) + index += 1 + + +CLASS_MAPPING = { + "CodeXGlueCcClozeTestingAll": CodeXGlueCcClozeTestingImpl, +} + + +class CodeXGlueCcClozeTestingAll(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = datasets.BuilderConfig + BUILDER_CONFIGS = [ + 
datasets.BuilderConfig(name=name, description=info["description"]) for name, info in DEFINITIONS.items() + ] + + def _info(self): + name = self.config.name + info = DEFINITIONS[name] + if info["class_name"] in CLASS_MAPPING: + self.child = CLASS_MAPPING[info["class_name"]](info) + else: + raise RuntimeError(f"Unknown python class for dataset configuration {name}") + ret = self.child._info() + return ret + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_cc_cloze_testing_all/common.py b/datasets/code_x_glue_cc_cloze_testing_all/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_all/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. 
+ return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dataset_infos.json b/datasets/code_x_glue_cc_cloze_testing_all/dataset_infos.json new file mode 100644 index 00000000000..be49a5ddf24 --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_all/dataset_infos.json @@ -0,0 +1 @@ +{"go": {"description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. 
The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_all", "config_name": "go", "version": 
{"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 22409765, "num_examples": 25282, "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/go/clozeTest.json": {"num_bytes": 32578836, "checksum": "4a2d2adf8866f89792fed4faae5d6cdee6ccf03e354d42ab9d2f970d7a3f1436"}}, "download_size": 32578836, "post_processing_size": null, "dataset_size": 22409765, "size_in_bytes": 54988601}, "java": {"description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_all", "config_name": "java", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 40392965, "num_examples": 40492, "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "download_checksums": 
{"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/java/clozeTest.json": {"num_bytes": 56468936, "checksum": "c31af7ef2b40f601cabe0ec418c6316cd5ecba7871d1fbbd151e95f736edd26e"}}, "download_size": 56468936, "post_processing_size": null, "dataset_size": 40392965, "size_in_bytes": 96861901}, "javascript": {"description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": 
"https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_all", "config_name": "javascript", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 16090182, "num_examples": 13837, "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/javascript/clozeTest.json": {"num_bytes": 22665666, "checksum": "a4601da27ffceeb5a82961e06c2caaa70441351fed63dda5731343a0d7a50eab"}}, "download_size": 22665666, "post_processing_size": null, "dataset_size": 16090182, "size_in_bytes": 38755848}, "php": {"description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_all", "config_name": "php", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 51328988, "num_examples": 51930, "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "download_checksums": 
{"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/php/clozeTest.json": {"num_bytes": 73115225, "checksum": "62c0461ca13ac3c2cc2fcb734691007524aef2afd54293ab28548c2acef5e6b7"}}, "download_size": 73115225, "post_processing_size": null, "dataset_size": 51328988, "size_in_bytes": 124444213}, "python": {"description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": 
"https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_all", "config_name": "python", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 40631213, "num_examples": 40137, "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/python/clozeTest.json": {"num_bytes": 56766288, "checksum": "5fb71df234ddeaafba7f865fcf9152e9e72c5f4301528c3f3603396c6a6cf4db"}}, "download_size": 56766288, "post_processing_size": null, "dataset_size": 40631213, "size_in_bytes": 97397501}, "ruby": {"description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_all", "config_name": "ruby", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3454904, "num_examples": 4437, "dataset_name": "code_x_glue_cc_cloze_testing_all"}}, "download_checksums": 
{"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/ruby/clozeTest.json": {"num_bytes": 4825752, "checksum": "0fd1469d649abc251865710cd01008c199f521d6c836142463e2c10e64d486a3"}}, "download_size": 4825752, "post_processing_size": null, "dataset_size": 3454904, "size_in_bytes": 8280656}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dummy/go/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_all/dummy/go/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..d45d26fb6bc Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_all/dummy/go/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dummy/java/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_all/dummy/java/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..17436f25625 Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_all/dummy/java/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dummy/javascript/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_all/dummy/javascript/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..343b693a696 Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_all/dummy/javascript/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dummy/php/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_all/dummy/php/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..87e8f9c7f43 Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_all/dummy/php/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dummy/python/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_all/dummy/python/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..ebc8ec01faa Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_all/dummy/python/0.0.0/dummy_data.zip 
differ diff --git a/datasets/code_x_glue_cc_cloze_testing_all/dummy/ruby/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_all/dummy/ruby/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..3bb0131ed30 Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_all/dummy/ruby/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_all/generated_definitions.py b/datasets/code_x_glue_cc_cloze_testing_all/generated_definitions.py new file mode 100644 index 00000000000..595b67c4d7d --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_all/generated_definitions.py @@ -0,0 +1,68 @@ +DEFINITIONS = { + "go": { + "class_name": "CodeXGlueCcClozeTestingAll", + "dataset_type": "Code-Code", + "description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "dir_name": "ClozeTesting-all", + "name": "go", + "parameters": {"language": "go"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/go", + "sizes": {"train": 25282}, + }, + "java": { + "class_name": "CodeXGlueCcClozeTestingAll", + "dataset_type": "Code-Code", + "description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "dir_name": "ClozeTesting-all", + "name": "java", + "parameters": {"language": "java"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/java", + "sizes": {"train": 40492}, + }, + "javascript": { + "class_name": "CodeXGlueCcClozeTestingAll", + "dataset_type": "Code-Code", + "description": "CodeXGLUE ClozeTesting-all 
dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "dir_name": "ClozeTesting-all", + "name": "javascript", + "parameters": {"language": "javascript"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/javascript", + "sizes": {"train": 13837}, + }, + "php": { + "class_name": "CodeXGlueCcClozeTestingAll", + "dataset_type": "Code-Code", + "description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "dir_name": "ClozeTesting-all", + "name": "php", + "parameters": {"language": "php"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/php", + "sizes": {"train": 51930}, + }, + "python": { + "class_name": "CodeXGlueCcClozeTestingAll", + "dataset_type": "Code-Code", + "description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "dir_name": "ClozeTesting-all", + "name": "python", + "parameters": {"language": "python"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/python", + "sizes": {"train": 40137}, + }, + "ruby": { + "class_name": "CodeXGlueCcClozeTestingAll", + "dataset_type": "Code-Code", + "description": "CodeXGLUE ClozeTesting-all dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "dir_name": "ClozeTesting-all", + "name": 
"ruby", + "parameters": {"language": "ruby"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-all", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-all/data/cloze-all/ruby", + "sizes": {"train": 4437}, + }, +} diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md b/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md new file mode 100644 index 00000000000..28d38a57663 --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_maxmin/README.md @@ -0,0 +1,262 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- monolingual +size_categories: + go: + - 10K", "int", ")", "{", "structPool", "=", "&", "sync", ".", "Pool", "{", "New", ":", "newStructErrors", "}", "\n", "}"] +} +``` + +#### java + +An example of 'train' looks as follows. +``` +{ + "id": 0, + "idx": "maxmin-1", + "nl_tokens": ["Test", "whether", "find", "can", "be", "found", "at", "position", "startPos", "in", "the", "string", "src", "."], + "pl_tokens": ["public", "static", "boolean", "startsWith", "(", "char", "[", "]", "src", ",", "char", "[", "]", "find", ",", "int", "startAt", ")", "{", "int", "startPos", "=", "startAt", ";", "boolean", "result", "=", "true", ";", "// Check ranges", "if", "(", "src", ".", "length", "<", "startPos", "+", "find", ".", "length", ")", "{", "result", "=", "false", ";", "}", "else", "{", "final", "int", "", "=", "find", ".", "length", ";", "for", "(", "int", "a", "=", "0", ";", "a", "<", "max", "&&", "result", ";", "a", "++", ")", "{", "if", "(", "src", "[", "startPos", "]", "!=", "find", "[", "a", "]", ")", "{", "result", "=", "false", ";", "}", "startPos", "++", ";", "}", "}", "return", "result", ";", "}"] +} +``` + +#### javascript + +An example of 'train' looks as follows. 
+``` +{ + "id": 0, + "idx": "maxmin-1", + "nl_tokens": ["string", ".", "max", "Maximum", "length", "of", "the", "string"], + "pl_tokens": ["function", "(", "string", ")", "{", "// string.check check sting type and size", "return", "(", "(", "typeof", "string", "===", "'string'", "||", "string", "instanceof", "String", ")", "&&", "string", ".", "length", ">=", "this", ".", "", "&&", "string", ".", "length", "<=", "this", ".", "max", "&&", "(", "!", "this", ".", "match", "||", "string", ".", "match", "(", "this", ".", "match", ")", ")", ")", ";", "}"] +} +``` + +#### php + +An example of 'train' looks as follows. +``` +{ + "id": 0, + "idx": "maxmin-1", + "nl_tokens": ["Read", "the", "next", "character", "from", "the", "supplied", "string", ".", "Return", "null", "when", "we", "have", "run", "out", "of", "characters", "."], + "pl_tokens": ["public", "function", "readOne", "(", ")", "{", "if", "(", "$", "this", "->", "pos", "<=", "$", "this", "->", "", ")", "{", "$", "value", "=", "$", "this", "->", "string", "[", "$", "this", "->", "pos", "]", ";", "$", "this", "->", "pos", "+=", "1", ";", "}", "else", "{", "$", "value", "=", "null", ";", "}", "return", "$", "value", ";", "}"] +} +``` + +#### python + +An example of 'train' looks as follows. 
+``` +{ + "id": 0, + "idx": "maxmin-1", + "nl_tokens": ["Returns", "intermediary", "colors", "for", "given", "list", "of", "colors", "."], + "pl_tokens": ["def", "_interpolate", "(", "self", ",", "colors", ",", "n", "=", "100", ")", ":", "gradient", "=", "[", "]", "for", "i", "in", "_range", "(", "n", ")", ":", "l", "=", "len", "(", "colors", ")", "-", "1", "x", "=", "int", "(", "1.0", "*", "i", "/", "n", "*", "l", ")", "x", "=", "", "(", "x", "+", "0", ",", "l", ")", "y", "=", "min", "(", "x", "+", "1", ",", "l", ")", "base", "=", "1.0", "*", "n", "/", "l", "*", "x", "d", "=", "(", "i", "-", "base", ")", "/", "(", "1.0", "*", "n", "/", "l", ")", "r", "=", "colors", "[", "x", "]", ".", "r", "*", "(", "1", "-", "d", ")", "+", "colors", "[", "y", "]", ".", "r", "*", "d", "g", "=", "colors", "[", "x", "]", ".", "g", "*", "(", "1", "-", "d", ")", "+", "colors", "[", "y", "]", ".", "g", "*", "d", "b", "=", "colors", "[", "x", "]", ".", "b", "*", "(", "1", "-", "d", ")", "+", "colors", "[", "y", "]", ".", "b", "*", "d", "a", "=", "colors", "[", "x", "]", ".", "a", "*", "(", "1", "-", "d", ")", "+", "colors", "[", "y", "]", ".", "a", "*", "d", "gradient", ".", "append", "(", "color", "(", "r", ",", "g", ",", "b", ",", "a", ",", "mode", "=", "\"rgb\"", ")", ")", "gradient", ".", "append", "(", "colors", "[", "-", "1", "]", ")", "return", "gradient"] +} +``` + +#### ruby + +An example of 'train' looks as follows. 
+``` +{ + "id": 0, + "idx": "maxmin-1", + "nl_tokens": ["Delete", "all", "copies", "that", "are", "older", "than", "the", "max", "age", "provided", "in", "seconds", "."], + "pl_tokens": ["def", "clean", "(", "", ":", "24", "*", "60", "*", "60", ")", "Futex", ".", "new", "(", "file", ",", "log", ":", "@log", ")", ".", "open", "do", "list", "=", "load", "list", ".", "reject!", "do", "|", "s", "|", "if", "s", "[", ":time", "]", ">=", "Time", ".", "now", "-", "max", "false", "else", "@log", ".", "debug", "(", "\"Copy ##{s[:name]}/#{s[:host]}:#{s[:port]} is too old, over #{Age.new(s[:time])}\"", ")", "true", "end", "end", "save", "(", "list", ")", "deleted", "=", "0", "files", ".", "each", "do", "|", "f", "|", "next", "unless", "list", ".", "find", "{", "|", "s", "|", "s", "[", ":name", "]", "==", "File", ".", "basename", "(", "f", ",", "Copies", "::", "EXT", ")", "}", ".", "nil?", "file", "=", "File", ".", "join", "(", "@dir", ",", "f", ")", "size", "=", "File", ".", "size", "(", "file", ")", "File", ".", "delete", "(", "file", ")", "@log", ".", "debug", "(", "\"Copy at #{f} deleted: #{Size.new(size)}\"", ")", "deleted", "+=", "1", "end", "list", ".", "select!", "do", "|", "s", "|", "cp", "=", "File", ".", "join", "(", "@dir", ",", "\"#{s[:name]}#{Copies::EXT}\"", ")", "wallet", "=", "Wallet", ".", "new", "(", "cp", ")", "begin", "wallet", ".", "refurbish", "raise", "\"Invalid protocol #{wallet.protocol} in #{cp}\"", "unless", "wallet", ".", "protocol", "==", "Zold", "::", "PROTOCOL", "true", "rescue", "StandardError", "=>", "e", "FileUtils", ".", "rm_rf", "(", "cp", ")", "@log", ".", "debug", "(", "\"Copy at #{cp} deleted: #{Backtrace.new(e)}\"", ")", "deleted", "+=", "1", "false", "end", "end", "save", "(", "list", ")", "deleted", "end", "end"] +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. 
+ +#### go, java, javascript, php, python, ruby + +|field name| type | description | +|----------|----------------|------------------------------| +|id |int32 | Index of the sample | +|idx |string | Original index in the dataset| +|nl_tokens |Sequence[string]| Natural language tokens | +|pl_tokens |Sequence[string]| Programming language tokens | + +### Data Splits + +| name |train| +|----------|----:| +|go | 152| +|java | 482| +|javascript| 272| +|php | 407| +|python | 1264| +|ruby | 38| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +Data from CodeSearchNet Challenge dataset. +[More Information Needed] + +#### Who are the source language producers? + +Software Engineering developers. + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? + +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. 
+ +### Citation Information + +``` +@article{CodeXGLUE, + title={CodeXGLUE: An Open Challenge for Code Intelligence}, + journal={arXiv}, + year={2020}, +} +@article{feng2020codebert, + title={CodeBERT: A Pre-Trained Model for Programming and Natural Languages}, + author={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others}, + journal={arXiv preprint arXiv:2002.08155}, + year={2020} +} +@article{husain2019codesearchnet, + title={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search}, + author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal={arXiv preprint arXiv:1909.09436}, + year={2019} +} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/code_x_glue_cc_cloze_testing_maxmin.py b/datasets/code_x_glue_cc_cloze_testing_maxmin/code_x_glue_cc_cloze_testing_maxmin.py new file mode 100644 index 00000000000..90acecfac47 --- /dev/null +++ b/datasets/code_x_glue_cc_cloze_testing_maxmin/code_x_glue_cc_cloze_testing_maxmin.py @@ -0,0 +1,83 @@ +import json +from typing import List + +import datasets + +from .common import Child +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """Cloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem. +Here we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word. 
+The only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.""" + +_CITATION = """@article{CodeXGLUE, +title={CodeXGLUE: An Open Challenge for Code Intelligence}, +journal={arXiv}, +year={2020}, +} +@article{feng2020codebert, +title={CodeBERT: A Pre-Trained Model for Programming and Natural Languages}, +author={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others}, +journal={arXiv preprint arXiv:2002.08155}, +year={2020} +} +@article{husain2019codesearchnet, +title={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search}, +author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, +journal={arXiv preprint arXiv:1909.09436}, +year={2019} +}""" + + +class CodeXGlueCcClozeTestingImpl(Child): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "idx": datasets.Value("string"), # Original index in the dataset + "nl_tokens": datasets.features.Sequence(datasets.Value("string")), # Natural language tokens + "pl_tokens": datasets.features.Sequence(datasets.Value("string")), # Programming language tokens + } + + def generate_urls(self, split_name): + yield "data", "clozeTest.json" + + def _generate_examples(self, split_name, file_paths): + with open(file_paths["data"], encoding="utf-8") as f: + j = json.load(f) + index = 0 + for entry in j: + yield index, dict( + id=index, idx=entry["idx"], nl_tokens=entry["nl_tokens"], pl_tokens=entry["pl_tokens"] + ) + index += 1 + + +CLASS_MAPPING = { + "CodeXGlueCcClozeTestingMaxmin": CodeXGlueCcClozeTestingImpl, +} + + +class CodeXGlueCcClozeTestingMaxmin(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = datasets.BuilderConfig + BUILDER_CONFIGS = [ + 
class Child:
    """Shared base for CodeXGLUE dataset configuration implementations.

    Subclasses set the class attributes below and implement
    ``generate_urls`` / ``_generate_examples``; this base turns them into
    the ``DatasetInfo`` and split generators the builder needs.
    """

    _DESCRIPTION = None
    _FEATURES = None
    _CITATION = None
    SPLITS = {"train": datasets.Split.TRAIN}
    _SUPERVISED_KEYS = None

    def __init__(self, info):
        self.info = info

    def homepage(self):
        return self.info["project_url"]

    def _info(self):
        # This is the description that will appear on the datasets page:
        # the generic dataset blurb followed by the task-specific details.
        full_description = self.info["description"] + "\n\n" + self._DESCRIPTION
        return datasets.DatasetInfo(
            description=full_description,
            features=datasets.Features(self._FEATURES),
            homepage=self.homepage(),
            citation=self._CITATION or _DEFAULT_CITATION,
            supervised_keys=self._SUPERVISED_KEYS,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        base_url = self.info["raw_url"]

        # Collect the download URLs per split; relative URLs are resolved
        # against the configuration's raw_url.
        urls_to_download = {}
        for split in self.SPLITS:
            split_urls = urls_to_download.setdefault(split, {})
            for key, url in self.generate_urls(split):
                if not url.startswith("http"):
                    url = base_url + "/" + url
                split_urls[key] = url

        downloaded_files = {
            split: dl_manager.download_and_extract(split_urls)
            for split, split_urls in urls_to_download.items()
        }

        return [
            datasets.SplitGenerator(
                name=self.SPLITS[split],
                gen_kwargs={"split_name": split, "file_paths": downloaded_files[split]},
            )
            for split in self.SPLITS
        ]

    def check_empty(self, entries):
        # Parallel files must agree: every field empty, or every field set.
        all_empty = all(v == "" for v in entries.values())
        all_non_empty = all(v != "" for v in entries.values())

        if not (all_empty or all_non_empty):
            raise RuntimeError("Parallel data files should have the same number of lines.")

        return all_empty


class TrainValidTestChild(Child):
    """Child variant for datasets that ship train/valid/test splits."""

    SPLITS = {
        "train": datasets.Split.TRAIN,
        "valid": datasets.Split.VALIDATION,
        "test": datasets.Split.TEST,
    }
models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_maxmin", "config_name": "go", "version": 
{"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 204997, "num_examples": 152, "dataset_name": "code_x_glue_cc_cloze_testing_maxmin"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin/go/clozeTest.json": {"num_bytes": 298893, "checksum": "9b0ba35b614b7b93c537ed52c88d89ca02e186022086c9bb41323eb4342eecf9"}}, "download_size": 298893, "post_processing_size": null, "dataset_size": 204997, "size_in_bytes": 503890}, "java": {"description": "CodeXGLUE ClozeTesting-maxmin dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_maxmin", "config_name": "java", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 785754, "num_examples": 482, "dataset_name": "code_x_glue_cc_cloze_testing_maxmin"}}, "download_checksums": 
{"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin/java/clozeTest.json": {"num_bytes": 1097733, "checksum": "ee2e7b0ad7d75ecb5e53b668f04fb39bdcebbacda820220b9bbb093136dd4082"}}, "download_size": 1097733, "post_processing_size": null, "dataset_size": 785754, "size_in_bytes": 1883487}, "javascript": {"description": "CodeXGLUE ClozeTesting-maxmin dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": 
"https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_maxmin", "config_name": "javascript", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 594347, "num_examples": 272, "dataset_name": "code_x_glue_cc_cloze_testing_maxmin"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin/javascript/clozeTest.json": {"num_bytes": 836112, "checksum": "8517b7b0ecfcc59ccb19c3253a907e6b4f65549e4b68a8ac249807e50a001204"}}, "download_size": 836112, "post_processing_size": null, "dataset_size": 594347, "size_in_bytes": 1430459}, "php": {"description": "CodeXGLUE ClozeTesting-maxmin dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_maxmin", "config_name": "php", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 705477, "num_examples": 407, "dataset_name": "code_x_glue_cc_cloze_testing_maxmin"}}, "download_checksums": 
{"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin/php/clozeTest.json": {"num_bytes": 1010305, "checksum": "37a7dd759eca7fa8076d2b9a2ab3991774728555dd7568a3e54b0a152d2c10b8"}}, "download_size": 1010305, "post_processing_size": null, "dataset_size": 705477, "size_in_bytes": 1715782}, "python": {"description": "CodeXGLUE ClozeTesting-maxmin dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": 
"https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_maxmin", "config_name": "python", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2566353, "num_examples": 1264, "dataset_name": "code_x_glue_cc_cloze_testing_maxmin"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin/python/clozeTest.json": {"num_bytes": 3577929, "checksum": "98cbdaf5900a343d2017aa1b7144a8ee75eac3d0c3ad40812abd7b72fd892e72"}}, "download_size": 3577929, "post_processing_size": null, "dataset_size": 2566353, "size_in_bytes": 6144282}, "ruby": {"description": "CodeXGLUE ClozeTesting-maxmin dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin\n\nCloze tests are widely adopted in Natural Languages Processing to evaluate the performance of the trained language models. The task is aimed to predict the answers for the blank with the context of the blank, which can be formulated as a multi-choice classification problem.\nHere we present the two cloze testing datasets in code domain with six different programming languages: ClozeTest-maxmin and ClozeTest-all. 
Each instance in the dataset contains a masked code function, its docstring and the target word.\nThe only difference between ClozeTest-maxmin and ClozeTest-all is their selected words sets, where ClozeTest-maxmin only contains two words while ClozeTest-all contains 930 words.", "citation": "@article{CodeXGLUE,\ntitle={CodeXGLUE: An Open Challenge for Code Intelligence},\njournal={arXiv},\nyear={2020},\n}\n@article{feng2020codebert,\ntitle={CodeBERT: A Pre-Trained Model for Programming and Natural Languages},\nauthor={Feng, Zhangyin and Guo, Daya and Tang, Duyu and Duan, Nan and Feng, Xiaocheng and Gong, Ming and Shou, Linjun and Qin, Bing and Liu, Ting and Jiang, Daxin and others},\njournal={arXiv preprint arXiv:2002.08155},\nyear={2020}\n}\n@article{husain2019codesearchnet,\ntitle={CodeSearchNet Challenge: Evaluating the State of Semantic Code Search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "idx": {"dtype": "string", "id": null, "_type": "Value"}, "nl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "pl_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_cloze_testing_maxmin", "config_name": "ruby", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 48946, "num_examples": 38, "dataset_name": "code_x_glue_cc_cloze_testing_maxmin"}}, "download_checksums": 
{"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin/ruby/clozeTest.json": {"num_bytes": 67675, "checksum": "d3c71ae0127b653ee551da3e6a5eea17434c38cc4b2d6349d529e8e37ae4f9d2"}}, "download_size": 67675, "post_processing_size": null, "dataset_size": 48946, "size_in_bytes": 116621}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/go/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/go/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..24b0ead6065 Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/go/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/java/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/java/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..c5b478f18bb Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/java/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/javascript/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/javascript/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..b48b2ee88b8 Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/javascript/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/php/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/php/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..ca97de2fa8b Binary files /dev/null and b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/php/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/python/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_cloze_testing_maxmin/dummy/python/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..94ff71fda35 Binary files /dev/null and 
# Definitions for the ClozeTesting-maxmin configurations, one per programming
# language. Every entry shares the same layout and differs only in the
# language name and the number of training samples, so the dict is built from
# a single template instead of six hand-written copies.

_PROJECT_URL = "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin"
_RAW_BASE_URL = "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/ClozeTesting-maxmin/data/cloze-maxmin"

# Train-split sizes per language configuration.
_TRAIN_SIZES = {
    "go": 152,
    "java": 482,
    "javascript": 272,
    "php": 407,
    "python": 1264,
    "ruby": 38,
}


def _make_definition(language, train_size):
    """Build the definition dict for one language configuration."""
    return {
        "class_name": "CodeXGlueCcClozeTestingMaxmin",
        "dataset_type": "Code-Code",
        "description": "CodeXGLUE ClozeTesting-maxmin dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/ClozeTesting-maxmin",
        "dir_name": "ClozeTesting-maxmin",
        "name": language,
        "parameters": {"language": language},
        "project_url": _PROJECT_URL,
        "raw_url": _RAW_BASE_URL + "/" + language,
        "sizes": {"train": train_size},
    }


DEFINITIONS = {language: _make_definition(language, size) for language, size in _TRAIN_SIZES.items()}
RubyCore ; public class RubyMarkerAnnotation extends MarkerAnnotation implements IRubyAnnotation { public static final String RUBY_MARKER_TYPE_PREFIX = \"\" ; public static final String ERROR_ANNOTATION_TYPE = \"\" ; public static final String WARNING_ANNOTATION_TYPE = \"\" ; public static final String INFO_ANNOTATION_TYPE = \"\" ; public static final String TASK_ANNOTATION_TYPE = \"\" ; private IRubyAnnotation fOverlay ; public RubyMarkerAnnotation ( IMarker marker ) { super ( marker ) ; } public String [ ] getArguments ( ) { return null ; } public int getId ( ) { IMarker marker = getMarker ( ) ; if ( marker == null || ! marker . exists ( ) ) return - 1 ; if ( isProblem ( ) ) return marker . getAttribute ( IRubyModelMarker . ID , - 1 ) ; return - 1 ; } public boolean isProblem ( ) { String type = getType ( ) ; return WARNING_ANNOTATION_TYPE . equals ( type ) || ERROR_ANNOTATION_TYPE . equals" +} +``` + +#### python + +An example of 'train' looks as follows. +``` +{ + "gt": "", + "id": 0, + "input": " from __future__ import absolute_import import weakref import operator from . compat import threading , itertools_filterfalse from . import py2k import types EMPTY_SET = frozenset ( ) class KeyedTuple ( tuple ) : def __new__ ( cls , vals , labels = None ) : t = tuple . __new__ ( cls , vals ) t . _labels = [ ] if labels : t . __dict__ . update ( zip ( labels , vals ) ) t . _labels = labels return t def keys ( self ) : return [ l for l in self . _labels if l is not None ] @ property def _fields ( self ) : return tuple ( self . keys ( ) ) def _asdict ( self ) : return dict ( ( key , self . __dict__ [ key ] ) for key in self . keys ( ) ) class ImmutableContainer ( object ) : def _immutable ( self , * arg , ** kw ) : raise TypeError ( \"\" % self . __class__ . __name__ ) __delitem__ = __setitem__ = __setattr__ = _immutable class immutabledict ( ImmutableContainer , dict ) : clear = pop = popitem = setdefault = update = ImmutableContainer . 
_immutable def __new__ ( cls , * args ) : new = dict . __new__ ( cls ) dict . __init__ ( new , * args ) return new def __init__ ( self , * args ) : pass def __reduce__ ( self ) : return immutabledict , ( dict ( self ) , ) def union ( self , d ) : if not self : return immutabledict ( d ) else : d2 = immutabledict ( self ) dict . update ( d2 , d ) return d2 def __repr__ ( self ) : return \"\" % dict . __repr__ ( self ) class Properties ( object ) : def __init__ ( self , data ) : self . __dict__ [ '_data' ] = data def __len__ ( self ) : return len ( self . _data ) def __iter__ ( self ) : return iter ( list ( self . _data . values ( ) ) ) def __add__ ( self , other ) : return list ( self ) + list ( other ) def __setitem__ ( self , key , object ) : self . _data [ key ] = object def __getitem__ ( self , key ) : return self . _data [ key ] def __delitem__ ( self , key ) : del self . _data [ key ] def __setattr__ ( self , key , object ) : self . _data [ key ] = object def __getstate__ ( self ) : return { '_data' : self . __dict__ [ '_data' ] } def __setstate__ ( self , state ) : self . __dict__ [ '_data' ] = state [ '_data' ] def __getattr__ ( self , key ) : try : return self . _data [ key ] except KeyError : raise AttributeError ( key ) def __contains__ ( self , key ) : return key in self . _data def as_immutable ( self ) : return ImmutableProperties ( self . _data ) def update ( self , value ) : self . _data . update ( value ) def get ( self , key , default = None ) : if key in self : return self [ key ] else : return default def keys ( self ) : return list ( self . _data ) def values ( self ) : return list ( self . _data . values ( ) ) def items ( self ) : return list ( self . _data . items ( ) ) def has_key ( self , key ) : return key in self . _data def clear ( self ) : self . _data . clear ( ) class OrderedProperties ( Properties ) : def __init__ ( self ) : Properties . 
__init__ ( self , OrderedDict ( ) ) class ImmutableProperties ( ImmutableContainer , Properties ) : class OrderedDict ( dict ) : def __init__ ( self , ____sequence = None , ** kwargs ) : self . _list = [ ] if ____sequence is None : if kwargs : self . update ( ** kwargs ) else : self . update ( ____sequence , ** kwargs ) def clear ( self ) : self . _list = [ ] dict . clear ( self ) def copy ( self ) : return self . __copy__ ( ) def __copy__ ( self ) : return OrderedDict ( self ) def sort ( self , * arg , ** kw ) : self . _list . sort ( * arg , ** kw ) def update ( self , ____sequence = None , ** kwargs ) : if ____sequence is not None : if hasattr ( ____sequence , 'keys' ) : for key in ____sequence . keys ( ) : self . __setitem__ ( key , ____sequence [ key ] ) else : for key , value in ____sequence : self [ key ] = value if kwargs : self . update ( kwargs ) def setdefault ( self , key , value ) : if key not in self : self . __setitem__ ( key , value ) return value else : return self . __getitem__ ( key ) def __iter__ ( self ) : return iter ( self . _list ) def keys ( self ) : return list ( self ) def values ( self ) : return [ self [ key ] for key in self . _list ] def items ( self ) : return [ ( key , self [ key ] ) for key in self . _list ] if py2k : def itervalues ( self ) : return iter ( self . values ( ) ) def iterkeys ( self ) : return iter ( self ) def iteritems ( self ) : return iter ( self . items ( ) ) def __setitem__ ( self , key , object ) : if key not in self : try : self . _list . append ( key ) except AttributeError : self . _list = [ key ] dict . __setitem__ ( self , key , object ) def __delitem__ ( self , key ) : dict . __delitem__ ( self , key ) self . _list . remove ( key ) def pop ( self , key , * default ) : present = key in self value = dict . pop ( self , key , * default ) if present : self . _list . remove ( key ) return value def popitem ( self ) : item = dict . popitem ( self ) self . _list . 
remove ( item [ 0 ] ) return item class OrderedSet ( set ) : def __init__ ( self , d = None ) : set . __init__ ( self ) self . _list = [ ] if d is not None : " +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. + +#### java, python + +|field name| type | description | +|----------|------|----------------------------| +|id |int32 | Index of the sample | +|input |string| Input code string | +|gt |string| Code string to be predicted| + +### Data Splits + +| name |train| +|------|----:| +|java | 3000| +|python|10000| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +[More Information Needed] + +#### Who are the source language producers? + +[More Information Needed] + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? + +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. 
+ +### Citation Information + +``` +@article{raychev2016probabilistic, + title={Probabilistic Model for Code with Decision Trees}, + author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin}, + journal={ACM SIGPLAN Notices}, + pages={731--747}, + year={2016}, + publisher={ACM New York, NY, USA} +} +@inproceedings{allamanis2013mining, + title={Mining Source Code Repositories at Massive Scale using Language Modeling}, + author={Allamanis, Miltiadis and Sutton, Charles}, + booktitle={2013 10th Working Conference on Mining Software Repositories (MSR)}, + pages={207--216}, + year={2013}, + organization={IEEE} +} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. \ No newline at end of file diff --git a/datasets/code_x_glue_cc_code_completion_line/code_x_glue_cc_code_completion_line.py b/datasets/code_x_glue_cc_code_completion_line/code_x_glue_cc_code_completion_line.py new file mode 100644 index 00000000000..58a3ce72e10 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_line/code_x_glue_cc_code_completion_line.py @@ -0,0 +1,80 @@ +import json +from typing import List + +import datasets + +from .common import Child +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """Complete the unfinished line given previous context. Models are evaluated by exact match and edit similarity. +We propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code. +Line level code completion task shares the train/dev dataset with token level completion. 
After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.""" + +_CITATION = """@article{raychev2016probabilistic, +title={Probabilistic Model for Code with Decision Trees}, +author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin}, +journal={ACM SIGPLAN Notices}, +pages={731--747}, +year={2016}, +publisher={ACM New York, NY, USA} +} +@inproceedings{allamanis2013mining, +title={Mining Source Code Repositories at Massive Scale using Language Modeling}, +author={Allamanis, Miltiadis and Sutton, Charles}, +booktitle={2013 10th Working Conference on Mining Software Repositories (MSR)}, +pages={207--216}, +year={2013}, +organization={IEEE} +}""" + + +class CodeXGlueCcCodeCompletionLineImpl(Child): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "input": datasets.Value("string"), # Input code string + "gt": datasets.Value("string"), # Code string to be predicted + } + + _SUPERVISED_KEYS = ["gt"] + + def generate_urls(self, split_name): + yield "data", "test.json" + + def _generate_examples(self, split_name, file_paths): + with open(file_paths["data"], encoding="utf-8") as f: + for idx, line in enumerate(f): + entry = json.loads(line) + entry["id"] = idx + yield idx, entry + + +CLASS_MAPPING = { + "CodeXGlueCcCodeCompletionLine": CodeXGlueCcCodeCompletionLineImpl, +} + + +class CodeXGlueCcCodeCompletionLine(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = datasets.BuilderConfig + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=name, description=info["description"]) for name, info in DEFINITIONS.items() + ] + + def _info(self): + name = self.config.name + info = DEFINITIONS[name] + if info["class_name"] in CLASS_MAPPING: + self.child = CLASS_MAPPING[info["class_name"]](info) + else: + raise RuntimeError(f"Unknown python class for dataset configuration {name}") + ret = self.child._info() + return ret + + def 
_split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_cc_code_completion_line/common.py b/datasets/code_x_glue_cc_code_completion_line/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_line/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. 
+ return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_code_completion_line/dataset_infos.json b/datasets/code_x_glue_cc_code_completion_line/dataset_infos.json new file mode 100644 index 00000000000..1c305d535b1 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_line/dataset_infos.json @@ -0,0 +1 @@ +{"java": {"description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line\n\nComplete the unfinished line given previous context. 
Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "citation": "@article{raychev2016probabilistic,\ntitle={Probabilistic Model for Code with Decision Trees},\nauthor={Raychev, Veselin and Bielik, Pavol and Vechev, Martin},\njournal={ACM SIGPLAN Notices},\npages={731--747},\nyear={2016},\npublisher={ACM New York, NY, USA}\n}\n@inproceedings{allamanis2013mining,\ntitle={Mining Source Code Repositories at Massive Scale using Language Modeling},\nauthor={Allamanis, Miltiadis and Sutton, Charles},\nbooktitle={2013 10th Working Conference on Mining Software Repositories (MSR)},\npages={207--216},\nyear={2013},\norganization={IEEE}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "gt": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "gt", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_code_completion_line", "config_name": "java", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 5454783, "num_examples": 3000, 
"dataset_name": "code_x_glue_cc_code_completion_line"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/CodeCompletion-line/dataset/javaCorpus/line_completion/test.json": {"num_bytes": 5523586, "checksum": "188e4ae5a8751871adb50fe48e8f1d50c6e2dca778fe53ff03c13b5a63f132af"}}, "download_size": 5523586, "post_processing_size": null, "dataset_size": 5454783, "size_in_bytes": 10978369}, "python": {"description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line\n\nComplete the unfinished line given previous context. Models are evaluated by exact match and edit similarity.\nWe propose line completion task to test model's ability to autocomplete a line. Majority code completion systems behave well in token level completion, but fail in completing an unfinished line like a method call with specific parameters, a function signature, a loop condition, a variable definition and so on. When a software develop finish one or more tokens of the current line, the line level completion model is expected to generate the entire line of syntactically correct code.\nLine level code completion task shares the train/dev dataset with token level completion. 
After training a model on CodeCompletion-token, you could directly use it to test on line-level completion.", "citation": "@article{raychev2016probabilistic,\ntitle={Probabilistic Model for Code with Decision Trees},\nauthor={Raychev, Veselin and Bielik, Pavol and Vechev, Martin},\njournal={ACM SIGPLAN Notices},\npages={731--747},\nyear={2016},\npublisher={ACM New York, NY, USA}\n}\n@inproceedings{allamanis2013mining,\ntitle={Mining Source Code Repositories at Massive Scale using Language Modeling},\nauthor={Allamanis, Miltiadis and Sutton, Charles},\nbooktitle={2013 10th Working Conference on Mining Software Repositories (MSR)},\npages={207--216},\nyear={2013},\norganization={IEEE}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "gt": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "gt", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_code_completion_line", "config_name": "python", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 24021562, "num_examples": 10000, "dataset_name": "code_x_glue_cc_code_completion_line"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/CodeCompletion-line/dataset/py150/line_completion/test.json": {"num_bytes": 24266715, "checksum": "39cb31c2263b25506d94384e9ace954cf3ec8d1fd7a4b7f62beb0c3846e5555c"}}, "download_size": 24266715, "post_processing_size": null, "dataset_size": 24021562, "size_in_bytes": 48288277}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_code_completion_line/dummy/java/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_code_completion_line/dummy/java/0.0.0/dummy_data.zip new file mode 
100644 index 00000000000..31f8052ce4f Binary files /dev/null and b/datasets/code_x_glue_cc_code_completion_line/dummy/java/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_completion_line/dummy/python/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_code_completion_line/dummy/python/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..5b6c01f9b0f Binary files /dev/null and b/datasets/code_x_glue_cc_code_completion_line/dummy/python/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_completion_line/generated_definitions.py b/datasets/code_x_glue_cc_code_completion_line/generated_definitions.py new file mode 100644 index 00000000000..a520c56ae68 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_line/generated_definitions.py @@ -0,0 +1,24 @@ +DEFINITIONS = { + "java": { + "class_name": "CodeXGlueCcCodeCompletionLine", + "dataset_type": "Code-Code", + "description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line", + "dir_name": "CodeCompletion-line", + "name": "java", + "parameters": {"language": "java", "original_language_name": "javaCorpus"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/CodeCompletion-line/dataset/javaCorpus/line_completion", + "sizes": {"train": 3000}, + }, + "python": { + "class_name": "CodeXGlueCcCodeCompletionLine", + "dataset_type": "Code-Code", + "description": "CodeXGLUE CodeCompletion-line dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line", + "dir_name": "CodeCompletion-line", + "name": "python", + "parameters": {"language": "python", "original_language_name": "py150"}, + "project_url": 
"https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/CodeCompletion-line/dataset/py150/line_completion", + "sizes": {"train": 10000}, + }, +} diff --git a/datasets/code_x_glue_cc_code_completion_token/README.md b/datasets/code_x_glue_cc_code_completion_token/README.md new file mode 100644 index 00000000000..a335550756a --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_token/README.md @@ -0,0 +1,202 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- monolingual +size_categories: +- 10K", "package", "org", ".", "vaadin", ".", "teemu", ".", "clara", ".", "demo", ";", "import", "java", ".", "io", ".", "BufferedReader", ";", "import", "java", ".", "io", ".", "ByteArrayInputStream", ";", "import", "java", ".", "io", ".", "IOException", ";", "import", "java", ".", "io", ".", "InputStreamReader", ";", "import", "org", ".", "vaadin", ".", "teemu", ".", "clara", ".", "Clara", ";", "import", "org", ".", "vaadin", ".", "teemu", ".", "clara", ".", "inflater", ".", "LayoutInflaterException", ";", "import", "com", ".", "vaadin", ".", "Application", ";", "import", "com", ".", "vaadin", ".", "terminal", ".", "ThemeResource", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "Button", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "Button", ".", "ClickEvent", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "Component", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "Embedded", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "HorizontalLayout", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "HorizontalSplitPanel", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "TextArea", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "VerticalLayout", ";", "import", "com", ".", "vaadin", ".", "ui", ".", "Window", ";", "import", "com", ".", 
"vaadin", ".", "ui", ".", "Window", ".", "Notification", ";", "@", "SuppressWarnings", "(", "\"serial\"", ")", "public", "class", "DemoApplication", "extends", "Application", "{", "private", "DemoController", "controller", ";", "private", "TextArea", "xmlArea", ";", "private", "HorizontalSplitPanel", "split", "=", "new", "HorizontalSplitPanel", "(", ")", ";", "private", "Window", "mainWindow", ";", "@", "Override", "public", "void", "init", "(", ")", "{", "setTheme", "(", "\"clara\"", ")", ";", "setMainWindow", "(", "mainWindow", "=", "new", "Window", "(", ")", ")", ";", "controller", "=", "new", "DemoController", "(", "mainWindow", ")", ";", "mainWindow", ".", "setContent", "(", "split", ")", ";", "VerticalLayout", "editor", "=", "new", "VerticalLayout", "(", ")", ";", "editor", ".", "setSpacing", "(", "true", ")", ";", "editor", ".", "setMargin", "(", "false", ",", "false", ",", "false", ",", "true", ")", ";", "editor", ".", "setHeight", "(", "\"100%\"", ")", ";", "editor", ".", "addComponent", "(", "xmlArea", "=", "createXmlArea", "(", ")", ")", ";", "editor", ".", "setExpandRatio", "(", "xmlArea", ",", "1.0f", ")", ";", "editor", ".", "addComponent", "(", "createUpdateButton", "(", ")", ")", ";", "HorizontalLayout", "wrapper", "=", "new", "HorizontalLayout", "(", ")", ";", "wrapper", ".", "setMargin", "(", "true", ")", ";", "wrapper", ".", "setSizeFull", "(", ")", ";", "wrapper", ".", "addComponent", "(", "createLogo", "(", ")", ")", ";", "wrapper", ".", "addComponent", "(", "editor", ")", ";", "wrapper", ".", "setExpandRatio", "(", "editor", ",", "1.0f", ")", ";", "split", ".", "setFirstComponent", "(", "wrapper", ")", ";", "updateLayout", "(", ")", ";", "}", "private", "Component", "createLogo", "(", ")", "{", "Embedded", "logo", "=", "new", "Embedded", "(", "null", ",", "new", "ThemeResource", "(", "\"\"", ")", ")", ";", "logo", ".", "setHeight", "(", "\"90px\"", ")", ";", "logo", ".", "setWidth", "(", "\"90px\"", ")", ";", "return", "logo", ";", "}", 
"private", "TextArea", "createXmlArea", "(", ")", "{", "TextArea", "area", "=", "new", "TextArea", "(", ")", ";", "area", ".", "setStyleName", "(", "\"xml-area\"", ")", ";", "area", ".", "setSizeFull", "(", ")", ";", "area", ".", "setValue", "(", "readStartingPoint", "(", ")", ")", ";", "return", "area", ";", "}", "private", "Button", "createUpdateButton", "(", ")", "{", "return", "new", "Button", "(", "\"Update\"", ",", "new", "Button", ".", "ClickListener", "(", ")", "{", "public", "void", "buttonClick", "(", "ClickEvent", "event", ")", "{", "updateLayout", "(", ")", ";", "}", "}", ")", ";", "}", "private", "String", "readStartingPoint", "(", ")", "{", "BufferedReader", "reader", "=", "null", ";", "try", "{", "reader", "=", "new", "BufferedReader", "(", "new", "InputStreamReader", "(", "getClass", "(", ")", ".", "getClassLoader", "(", ")", ".", "getResourceAsStream", "(", "\"\"", ")", ")", ")", ";", "StringBuilder", "xml", "=", "new", "StringBuilder", "(", ")", ";", "String", "line", ";", "while", "(", "(", "line", "=", "reader", ".", "readLine", "(", ")", ")", "!=", "null", ")", "{", "xml", ".", "append", "(", "line", ")", ";", "xml", ".", "append", "(", "\"n\"", ")", ";", "}", "return", "xml", ".", "toString", "(", ")", ";", "}", "catch", "(", "IOException", "e", ")", "{", "e", ".", "printStackTrace", "(", ")", ";", "}", "finally", "{", "if", "(", "reader", "!=", "null", ")", "{", "try", "{", "reader", ".", "close", "(", ")", ";", "}", "catch", "(", "IOException", "e", ")", "{", "e", ".", "printStackTrace", "(", ")", ";", "}", "}", "}", "return", "null", ";", "}", "private", "void", "updateLayout", "(", ")", "{", "try", "{", "Component", "c", "=", "Clara", ".", "create", "(", "new", "ByteArrayInputStream", "(", "xmlArea", ".", "getValue", "(", ")", ".", "toString", "(", ")", ".", "getBytes", "(", ")", ")", ",", "controller", ")", ";", "split", ".", "replaceComponent", "(", "split", ".", "getSecondComponent", "(", ")", ",", "c", ")", ";", "}", "catch", "(", 
"LayoutInflaterException", "e", ")", "{", "mainWindow", ".", "showNotification", "(", "e", ".", "getMessage", "(", ")", ",", "Notification", ".", "TYPE_ERROR_MESSAGE", ")", ";", "}", "}", "}", ""], + "id": 0 +} +``` + +#### python + +An example of 'train' looks as follows. +``` +{ + "code": ["", "from", "bootstrap", "import", "Bootstrap", "", "from", "fund", "import", "InstantPaymentNotificationHandler", "", "from", "fund", "import", "ThankYouHandler", "", "from", "view", "import", "*", "", "mapping", "=", "[", "(", "", "r'/'", ",", "", "Index", "", ")", ",", "(", "", "r'/ipn'", ",", "", "InstantPaymentNotificationHandler", "", ")", ",", "(", "", "r'/thank-you'", ",", "", "ThankYouHandler", "", ")", ",", "(", "", "r'/about\\/?'", ",", "", "About", "", ")", ",", "(", "", "r'/guide\\/?'", ",", "", "Guide", "", ")", ",", "(", "", "r''", ",", "", "Download", "", ")", ",", "(", "", "r''", ",", "", "Standards", "", ")", ",", "(", "", "r'/community\\/?'", ",", "", "Community", "", ")", ",", "(", "", "r'/news\\/?'", ",", "", "News", "", ")", ",", "(", "", "r'/support\\/?'", ",", "", "Support", "", ")", ",", "(", "", "r'/contact\\/?'", ",", "", "Contact", "", ")", ",", "(", "", "r'/press\\/?'", ",", "", "Press", "", ")", ",", "(", "", "r'/legal/terms'", ",", "", "Terms", "", ")", ",", "(", "", "r'/library\\/?'", ",", "", "Library", "", ")", ",", "(", "", "r''", ",", "", "Library", "", ")", ",", "(", "", "r''", ",", "", "Library", "", ")", ",", "(", "", "r''", ",", "", "Users", "", ")", ",", "(", "", "r''", ",", "", "User", "", ")", ",", "(", "", "r''", ",", "", "Design", "", ")", ",", "(", "", "r''", ",", "", "Design", "", ")", ",", "(", "", "r''", ",", "", "Design", "", ")", ",", "(", "", "r''", ",", "", "Design", "", ")", ",", "(", "", "r''", ",", "", "Design", "", ")", ",", "(", "", "r''", ",", "", "RedirectSuccess", "", ")", ",", "(", "", "r''", ",", "", "RedirectError", "", ")", ",", "(", "", "r''", ",", "", "RedirectAfterDelete", "", ")", ",", "(", "", "r''", ",", "", 
"Moderate", "", ")", ",", "(", "", "r''", ",", "", "Bootstrap", "", ")", ",", "(", "", "r'/activity'", ",", "", "ActivityScreen", "", ")", ",", "(", "", "r'/txns'", ",", "", "TxnList", "", ")", ",", "(", "", "r''", ",", "", "Base64Blob", "", ")", ",", "(", "", "r''", ",", "", "Base64Blob", "", ")", ",", "(", "", "r''", ",", "", "MessageStrings", "", ")", ",", "(", "", "r'/.*'", ",", "", "NotFound", "", ")", "", "]", ""], + "id": 0, + "path": "00/wikihouse/urls.py\n" +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. + +#### java + +|field name| type | description | +|----------|----------------|--------------------| +|id |int32 | Index of the sample| +|code |Sequence[string]| Code Tokens | + +#### python + +|field name| type | description | +|----------|----------------|-----------------------------| +|id |int32 | Index of the sample | +|path |string | Original path in the dataset| +|code |Sequence[string]| Code Tokens | + +### Data Splits + +#### java + +| |train|validation|test| +|----|----:|---------:|---:| +|java|12934| 7189|8268| + +#### python + +| |train |test | +|------|-----:|----:| +|python|100000|50000| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +[More Information Needed] + +#### Who are the source language producers? + +[More Information Needed] + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? 
+ +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. + +### Citation Information + +``` +@article{raychev2016probabilistic, + title={Probabilistic Model for Code with Decision Trees}, + author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin}, + journal={ACM SIGPLAN Notices}, + pages={731--747}, + year={2016}, + publisher={ACM New York, NY, USA} + } + @inproceedings{allamanis2013mining, + title={Mining Source Code Repositories at Massive Scale using Language Modeling}, + author={Allamanis, Miltiadis and Sutton, Charles}, + booktitle={2013 10th Working Conference on Mining Software Repositories (MSR)}, + pages={207--216}, + year={2013}, + organization={IEEE} + } +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. \ No newline at end of file diff --git a/datasets/code_x_glue_cc_code_completion_token/code_x_glue_cc_code_completion_token.py b/datasets/code_x_glue_cc_code_completion_token/code_x_glue_cc_code_completion_token.py new file mode 100644 index 00000000000..8945a8b1db6 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_token/code_x_glue_cc_code_completion_token.py @@ -0,0 +1,216 @@ +import os +import os.path +from typing import List + +import datasets + +from .common import Child +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """Predict next code token given context of previous tokens. Models are evaluated by token level accuracy. 
+Code completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types. +""" + +_CITATION = """@article{raychev2016probabilistic, + title={Probabilistic Model for Code with Decision Trees}, + author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin}, + journal={ACM SIGPLAN Notices}, + pages={731--747}, + year={2016}, + publisher={ACM New York, NY, USA} +} +@inproceedings{allamanis2013mining, + title={Mining Source Code Repositories at Massive Scale using Language Modeling}, + author={Allamanis, Miltiadis and Sutton, Charles}, + booktitle={2013 10th Working Conference on Mining Software Repositories (MSR)}, + pages={207--216}, + year={2013}, + organization={IEEE} +}""" + + +class CodeXGlueCcCodeCompletionTokenImpl(Child): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + +class CodeXGlueCcCodeCompletionTokenJavaImpl(CodeXGlueCcCodeCompletionTokenImpl): + SPLITS = { + "training": datasets.Split.TRAIN, + "validation": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "code": datasets.features.Sequence(datasets.Value("string")), # Code Tokens + } + + def generate_urls(self, split_name): + language = self.info["parameters"]["language"] + if language != "java": + raise RuntimeError(f"Unknown language {language}: should be java.") + + yield "data", f"https://zenodo.org/record/3628665/files/java_{split_name}_pre" + + def _generate_examples(self, split_name, file_paths): + with open(file_paths["data"], encoding="utf-8") as f: + for idx, line in enumerate(f): + new_data = [] + for token in 
line.strip().split(): + if len(token) > 100: + continue + new_data.append(token) + entry = dict(id=idx, code=new_data) + yield idx, entry + + +class CodeXGlueCcCodeCompletionTokenPythonImpl(CodeXGlueCcCodeCompletionTokenImpl): + SPLITS = {"train": datasets.Split.TRAIN, "test": datasets.Split.TEST} + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "path": datasets.Value("string"), # Original path in the dataset + "code": datasets.features.Sequence(datasets.Value("string")), # Code Tokens + } + + PYTHON_FILE_MAPPING = dict(train="python100k_train.txt", test="python50k_eval.txt") + + def generate_urls(self, split_name): + language = self.info["parameters"]["language"] + if language != "python": + raise RuntimeError(f"Unknown language {language}") + + yield "data", "http://files.srl.inf.ethz.ch/data/py150_files.tar.gz" + + def process_string(self, token): + # Copyright (c) Microsoft Corporation. + # Licensed under the MIT License. + import re + + str_quote_options = ["'''", '"""', "'", '"'] + start_quote = "" + end_quote = "" + qualifier_regex = r"^[a-z]+" + qualifier_match = re.search(qualifier_regex, token) + # string qualifiers like 'r' for regex, 'f' for formatted string, 'b' for bytes, 'u' for unicode, etc (or combination of them) + qualifier = "" if not qualifier_match else qualifier_match[0] + # token string without qualifiers + token_string = re.sub(qualifier_regex, "", token) + # string literal without quotes + str_lit = token_string + for q in str_quote_options: + if token_string.startswith(q): + start_quote = q + str_lit = str_lit[len(q) :] + if token_string.endswith(q): + end_quote = q + str_lit = str_lit[: -len(q)] + break + if start_quote in str_quote_options[:2]: + return "" + return ( + f"{qualifier}{start_quote}{str_lit}{end_quote}" + if len(str_lit) < 15 + and "\n" not in str_lit + and "" not in str_lit + and "" not in str_lit + and "" not in str_lit + and "" not in str_lit + else f"{qualifier}{start_quote}{end_quote}" + ) + + 
def py_tokenize(self, base_dir, file_name): + # Copyright (c) Microsoft Corporation. + # Licensed under the MIT License. + from io import BytesIO + from tokenize import COMMENT, ENCODING, ENDMARKER, INDENT, NEWLINE, NL, NUMBER, STRING, tokenize + + file_paths = open(os.path.join(base_dir, file_name), encoding="utf-8").readlines() + for ct, path in enumerate(file_paths): + try: + code = open(os.path.join(base_dir, path.strip()), encoding="utf-8").read() + token_gen = tokenize(BytesIO(bytes(code, "utf8")).readline) + out_tokens = [] + prev_eol = False + for toknum, tokval, _, _, _ in token_gen: + tokval = " ".join(tokval.split()) + if len(tokval) > 100: + continue + if toknum == STRING: + add_token = self.process_string(tokval) + if len(add_token) > 0: + out_tokens.append(add_token) + prev_eol = False + elif toknum == NUMBER: + if len(tokval) < 50: + out_tokens.append(tokval) + prev_eol = False + elif toknum in [NEWLINE, NL]: + if not prev_eol: + out_tokens.append("") + prev_eol = True + elif toknum in [COMMENT, INDENT, ENCODING, ENDMARKER] or len(tokval) == 0: + continue + else: + out_tokens.append(tokval) + prev_eol = False + if out_tokens[0] == "": + out_tokens = out_tokens[1:] + if out_tokens[-1] == "": + out_tokens = out_tokens[:-1] + except Exception: + out_tokens = [] + out_tokens = [""] + out_tokens + [""] + yield path, out_tokens + + def _generate_examples(self, split_name, file_paths): + base_dir = file_paths["data"] + filename = self.PYTHON_FILE_MAPPING[split_name] + + data_dir = os.path.join(base_dir, "data") + if not os.path.exists(data_dir): + import gzip + import tarfile + + gzip_filename = os.path.join(base_dir, "data.tar.gz") + with gzip.open(gzip_filename, "rb") as gzip_file: + t = tarfile.TarFile(fileobj=gzip_file) + t.extractall(path=base_dir) + + idx = 0 + for entry in self.py_tokenize(base_dir=base_dir, file_name=filename): + path, out_tokens = entry + path = path[len("data/") :] + yield idx, dict(id=idx, path=path, code=out_tokens) + idx += 1 + 
# Maps the class name recorded in DEFINITIONS to its implementation class.
CLASS_MAPPING = {
    "CodeXGlueCcCodeCompletionTokenJava": CodeXGlueCcCodeCompletionTokenJavaImpl,
    "CodeXGlueCcCodeCompletionTokenPython": CodeXGlueCcCodeCompletionTokenPythonImpl,
}


class CodeXGlueCcCodeCompletionToken(datasets.GeneratorBasedBuilder):
    """Thin builder that delegates all work to a per-configuration child."""

    BUILDER_CONFIG_CLASS = datasets.BuilderConfig
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=name, description=info["description"]) for name, info in DEFINITIONS.items()
    ]

    def _info(self):
        # Instantiate the child lazily, the first time info is requested.
        config_name = self.config.name
        definition = DEFINITIONS[config_name]
        class_name = definition["class_name"]
        if class_name not in CLASS_MAPPING:
            raise RuntimeError(f"Unknown python class for dataset configuration {config_name}")
        self.child = CLASS_MAPPING[class_name](definition)
        return self.child._info()

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        # Delegate split generation to the child created in _info().
        return self.child._split_generators(dl_manager=dl_manager)

    def _generate_examples(self, split_name, file_paths):
        # Delegate example generation to the child created in _info().
        return self.child._generate_examples(split_name, file_paths)
+ return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_code_completion_token/dataset_infos.json b/datasets/code_x_glue_cc_code_completion_token/dataset_infos.json new file mode 100644 index 00000000000..45351731352 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_token/dataset_infos.json @@ -0,0 +1 @@ +{"java": {"description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token\n\nPredict next code token given context of previous tokens. 
Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. Models should have be able to predict the next token in arbitary types.\n", "citation": "@article{raychev2016probabilistic,\n title={Probabilistic Model for Code with Decision Trees},\n author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin},\n journal={ACM SIGPLAN Notices},\n pages={731--747},\n year={2016},\n publisher={ACM New York, NY, USA}\n}\n@inproceedings{allamanis2013mining,\n title={Mining Source Code Repositories at Massive Scale using Language Modeling},\n author={Allamanis, Miltiadis and Sutton, Charles},\n booktitle={2013 10th Working Conference on Mining Software Repositories (MSR)},\n pages={207--216},\n year={2013},\n organization={IEEE}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "code": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_code_completion_token", "config_name": "java", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 128312061, "num_examples": 12934, "dataset_name": "code_x_glue_cc_code_completion_token"}, "validation": {"name": "validation", "num_bytes": 30259174, "num_examples": 7189, "dataset_name": "code_x_glue_cc_code_completion_token"}, "test": {"name": "test", "num_bytes": 43027956, "num_examples": 
8268, "dataset_name": "code_x_glue_cc_code_completion_token"}}, "download_checksums": {"https://zenodo.org/record/3628665/files/java_training_pre": {"num_bytes": 81051708, "checksum": "676295d2756adcac22e213fbc3ea0f50669a0d152e9497e23a2929e2e2124905"}, "https://zenodo.org/record/3628665/files/java_validation_pre": {"num_bytes": 18835141, "checksum": "0c58a97d96aa7435396581ee5efbb93c0e74a545ba5f795f878098a6e59ab8b3"}, "https://zenodo.org/record/3628665/files/java_test_pre": {"num_bytes": 26969670, "checksum": "a88cd5c91c2ed23a928528bef3535f4fc8db1359975447211f2b13926cc38d9d"}}, "download_size": 126856519, "post_processing_size": null, "dataset_size": 201599191, "size_in_bytes": 328455710}, "python": {"description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token\n\nPredict next code token given context of previous tokens. Models are evaluated by token level accuracy.\nCode completion is a one of the most widely used features in software development through IDEs. An effective code completion tool could improve software developers' productivity. We provide code completion evaluation tasks in two granularities -- token level and line level. Here we introduce token level code completion. Token level task is analogous to language modeling. 
Models should have be able to predict the next token in arbitary types.\n", "citation": "@article{raychev2016probabilistic,\n title={Probabilistic Model for Code with Decision Trees},\n author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin},\n journal={ACM SIGPLAN Notices},\n pages={731--747},\n year={2016},\n publisher={ACM New York, NY, USA}\n}\n@inproceedings{allamanis2013mining,\n title={Mining Source Code Repositories at Massive Scale using Language Modeling},\n author={Allamanis, Miltiadis and Sutton, Charles},\n booktitle={2013 10th Working Conference on Mining Software Repositories (MSR)},\n pages={207--216},\n year={2013},\n organization={IEEE}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_code_completion_token", "config_name": "python", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 684319575, "num_examples": 100000, "dataset_name": "code_x_glue_cc_code_completion_token"}, "test": {"name": "test", "num_bytes": 333978088, "num_examples": 50000, "dataset_name": "code_x_glue_cc_code_completion_token"}}, "download_checksums": {"http://files.srl.inf.ethz.ch/data/py150_files.tar.gz": {"num_bytes": 199067128, "checksum": "73be7f7a78e549845cf80cf779a3bcc3a9cf351ff7017e07b89e8d1c82b8d389"}}, "download_size": 199067128, "post_processing_size": null, "dataset_size": 1018297663, "size_in_bytes": 1217364791}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_code_completion_token/dummy/java/0.0.0/dummy_data.zip 
b/datasets/code_x_glue_cc_code_completion_token/dummy/java/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..80c1f8aa34b Binary files /dev/null and b/datasets/code_x_glue_cc_code_completion_token/dummy/java/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_completion_token/dummy/python/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_code_completion_token/dummy/python/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..f4e0eccc57b Binary files /dev/null and b/datasets/code_x_glue_cc_code_completion_token/dummy/python/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_completion_token/generated_definitions.py b/datasets/code_x_glue_cc_code_completion_token/generated_definitions.py new file mode 100644 index 00000000000..dcd7989b851 --- /dev/null +++ b/datasets/code_x_glue_cc_code_completion_token/generated_definitions.py @@ -0,0 +1,24 @@ +DEFINITIONS = { + "java": { + "class_name": "CodeXGlueCcCodeCompletionTokenJava", + "dataset_type": "Code-Code", + "description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token", + "dir_name": "CodeCompletion-token", + "name": "java", + "parameters": {"language": "java", "original_language_name": "javaCorpus"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/CodeCompletion-token/dataset/javaCorpus", + "sizes": {"test": 8268, "train": 12934, "validation": 7189}, + }, + "python": { + "class_name": "CodeXGlueCcCodeCompletionTokenPython", + "dataset_type": "Code-Code", + "description": "CodeXGLUE CodeCompletion-token dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token", + "dir_name": "CodeCompletion-token", + "name": "python", + "parameters": {"language": "python", 
"original_language_name": "py150"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/CodeCompletion-token", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/CodeCompletion-token/dataset/py150", + "sizes": {"test": 50000, "train": 100000}, + }, +} diff --git a/datasets/code_x_glue_cc_code_refinement/README.md b/datasets/code_x_glue_cc_code_refinement/README.md new file mode 100644 index 00000000000..5d3e1378415 --- /dev/null +++ b/datasets/code_x_glue_cc_code_refinement/README.md @@ -0,0 +1,176 @@ +--- +annotations_creators: +- expert-generated +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- other-programming-languages +size_categories: +- 10K METHOD_1 ( ) { java.util.ArrayList < TYPE_1 > VAR_1 = new java.util.ArrayList < TYPE_1 > ( ) ; for ( TYPE_2 VAR_2 : VAR_3 ) { VAR_1 . METHOD_2 ( VAR_2 . METHOD_1 ( ) ) ; } return VAR_1 ; } \n", + "fixed": "public java.util.List < TYPE_1 > METHOD_1 ( ) { return VAR_1 ; } \n", + "id": 0 +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. + +#### medium, small + +|field name| type | description | +|----------|------|--------------------------------| +|id |int32 | Index of the sample | +|buggy |string| The buggy version of the code | +|fixed |string| The correct version of the code| + +### Data Splits + +| name |train|validation|test| +|------|----:|---------:|---:| +|medium|52364| 6546|6545| +|small |46680| 5835|5835| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +Downloaded from GitHub Archive every public GitHub event between March 2011 and October 2017 and used the Google BigQuery APIs. +[More Information Needed] + +#### Who are the source language producers? + +Software Engineering developers. 
+ +### Annotations + +#### Annotation process + +Automatically annotated by filtering commit messages containing the pattern: ("fix" or "solve") and ("bug" or "issue" or "problem" or "error"). A statistically significant amount of samples (95% confidence level with 5% confidence interval) were manually evaluated by two authors to check if the filtered bug/fix pairs were correct. After all disagreements were settled, authors conclude that 97.6% were true positives. + +#### Who are the annotators? + +Heuristics and the authors of the paper. + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. + +### Citation Information + +``` +@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. diff --git a/datasets/code_x_glue_cc_code_refinement/code_x_glue_cc_code_refinement.py b/datasets/code_x_glue_cc_code_refinement/code_x_glue_cc_code_refinement.py new file mode 100644 index 00000000000..5aaf27e633a --- /dev/null +++ b/datasets/code_x_glue_cc_code_refinement/code_x_glue_cc_code_refinement.py @@ -0,0 +1,93 @@ +from typing import List + +import datasets + +from .common import TrainValidTestChild +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """We use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. 
All the function and variable names are normalized. Their dataset contains two subsets ( i.e.small and medium) based on the function length.""" +_CITATION = """@article{10.1145/3340544, +author = {Tufano, Michele and Watson, Cody and Bavota, Gabriele and Penta, Massimiliano Di and White, Martin and Poshyvanyk, Denys}, +title = {An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation}, +year = {2019}, +issue_date = {October 2019}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +volume = {28}, +number = {4}, +issn = {1049-331X}, +url = {https://doi-org.proxy.wm.edu/10.1145/3340544}, +doi = {10.1145/3340544}, +abstract = {Millions of open source projects with numerous bug fixes are available in code repositories. This proliferation of software development histories can be leveraged to learn how to fix common programming bugs. To explore such a potential, we perform an empirical study to assess the feasibility of using Neural Machine Translation techniques for learning bug-fixing patches for real defects. First, we mine millions of bug-fixes from the change histories of projects hosted on GitHub in order to extract meaningful examples of such bug-fixes. Next, we abstract the buggy and corresponding fixed code, and use them to train an Encoder-Decoder model able to translate buggy code into its fixed version. In our empirical investigation, we found that such a model is able to fix thousands of unique buggy methods in the wild. Overall, this model is capable of predicting fixed patches generated by developers in 9--50% of the cases, depending on the number of candidate patches we allow it to generate. Also, the model is able to emulate a variety of different Abstract Syntax Tree operations and generate candidate patches in a split second.}, +journal = {ACM Trans. Softw. Eng. 
class CodeXGlueCcCodeRefinementImpl(TrainValidTestChild):
    """Implementation for the code-refinement (bug-fixing) configurations.

    Each split consists of two parallel text files: the buggy function and
    its fixed counterpart, one sample per line.
    """

    _DESCRIPTION = _DESCRIPTION
    _CITATION = _CITATION

    _FEATURES = {
        "id": datasets.Value("int32"),  # Index of the sample
        "buggy": datasets.Value("string"),  # The buggy version of the code
        "fixed": datasets.Value("string"),  # The correct version of the code
    }

    _SUPERVISED_KEYS = ["fixed"]

    def generate_urls(self, split_name):
        """Yield (key, relative-url) pairs for the two sides of the corpus."""
        size = self.info["parameters"]["size"]
        for key in "buggy", "fixed":
            yield key, f"{size}/{split_name}.buggy-fixed.{key}"

    def _generate_examples(self, split_name, file_paths):
        """This function returns the examples in the raw (text) form."""
        # Open one handle per side of the parallel corpus (buggy / fixed).
        files = {k: open(file_paths[k], encoding="utf-8") for k in file_paths}
        try:
            id_ = 0
            while True:
                # Read a single line from each file
                entries = {k: files[k].readline() for k in file_paths}

                if self.check_empty(entries):
                    # We are done: end of files
                    return

                entries["id"] = id_
                yield id_, entries
                id_ += 1
        finally:
            # BUGFIX: the original never closed these handles, leaking one
            # file descriptor per side of the corpus for each split.
            for handle in files.values():
                handle.close()
# Citation, taken from https://github.com/microsoft/CodeXGLUE
_DEFAULT_CITATION = """@article{CodeXGLUE,
  title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence},
  year={2020},}"""


class Child:
    """Shared scaffolding for one CodeXGLUE dataset configuration.

    Subclasses set the class attributes below and implement
    ``generate_urls``/``_generate_examples``.
    """

    _DESCRIPTION = None
    _FEATURES = None
    _CITATION = None
    SPLITS = {"train": datasets.Split.TRAIN}
    _SUPERVISED_KEYS = None

    def __init__(self, info):
        self.info = info

    def homepage(self):
        return self.info["project_url"]

    def _info(self):
        # This is the description that will appear on the datasets page.
        return datasets.DatasetInfo(
            description=self.info["description"] + "\n\n" + self._DESCRIPTION,
            features=datasets.Features(self._FEATURES),
            homepage=self.homepage(),
            citation=self._CITATION or _DEFAULT_CITATION,
            supervised_keys=self._SUPERVISED_KEYS,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        base_url = self.info["raw_url"]
        urls_to_download = {}
        for split in self.SPLITS:
            split_urls = urls_to_download.setdefault(split, {})
            for key, url in self.generate_urls(split):
                # Relative URLs are resolved against the configured raw_url.
                if not url.startswith("http"):
                    url = base_url + "/" + url
                split_urls[key] = url

        downloaded_files = {
            split: dl_manager.download_and_extract(urls) for split, urls in urls_to_download.items()
        }

        return [
            datasets.SplitGenerator(
                name=self.SPLITS[split],
                gen_kwargs={"split_name": split, "file_paths": downloaded_files[split]},
            )
            for split in self.SPLITS
        ]

    def check_empty(self, entries):
        """Return True when every parallel line is empty (clean EOF).

        Raises when some files hit EOF before others, i.e. the parallel
        corpus sides have different lengths.
        """
        values = list(entries.values())
        all_empty = all(v == "" for v in values)
        all_non_empty = all(v != "" for v in values)

        if not all_non_empty and not all_empty:
            raise RuntimeError("Parallel data files should have the same number of lines.")

        return all_empty


class TrainValidTestChild(Child):
    # Configurations with the standard three-way split.
    SPLITS = {
        "train": datasets.Split.TRAIN,
        "valid": datasets.Split.VALIDATION,
        "test": datasets.Split.TEST,
    }
To explore such a potential, we perform an empirical study to assess the feasibility of using Neural Machine Translation techniques for learning bug-fixing patches for real defects. First, we mine millions of bug-fixes from the change histories of projects hosted on GitHub in order to extract meaningful examples of such bug-fixes. Next, we abstract the buggy and corresponding fixed code, and use them to train an Encoder-Decoder model able to translate buggy code into its fixed version. In our empirical investigation, we found that such a model is able to fix thousands of unique buggy methods in the wild. Overall, this model is capable of predicting fixed patches generated by developers in 9--50% of the cases, depending on the number of candidate patches we allow it to generate. Also, the model is able to emulate a variety of different Abstract Syntax Tree operations and generate candidate patches in a split second.},\njournal = {ACM Trans. Softw. Eng. Methodol.},\nmonth = sep,\narticleno = {19},\nnumpages = {29},\nkeywords = {bug-fixes, Neural machine translation}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/code-refinement", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "buggy": {"dtype": "string", "id": null, "_type": "Value"}, "fixed": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "fixed", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_code_refinement", "config_name": "medium", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32614834, "num_examples": 52364, "dataset_name": "code_x_glue_cc_code_refinement"}, "validation": {"name": "validation", "num_bytes": 4086741, "num_examples": 6546, "dataset_name": "code_x_glue_cc_code_refinement"}, "test": {"name": "test", "num_bytes": 4063673, "num_examples": 
6545, "dataset_name": "code_x_glue_cc_code_refinement"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/medium/train.buggy-fixed.buggy": {"num_bytes": 16188348, "checksum": "4570731680fa183650864e8729a7354d235c9a3ef42f0085ace3441418074085"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/medium/train.buggy-fixed.fixed": {"num_bytes": 15798070, "checksum": "009c121662602642bc55f6882f220aea6a738e6a11f2c4df86e7fe3cd30c175c"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/medium/valid.buggy-fixed.buggy": {"num_bytes": 2028309, "checksum": "8ad01f88be2009599007f40427458d6d2601fe93f2f1d65b0f46b7d414a3add2"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/medium/valid.buggy-fixed.fixed": {"num_bytes": 1979872, "checksum": "7ef5e4b2e95914e0eceb4f2cf6dfad0641625145319b9836db70d3f8745ad2d6"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/medium/test.buggy-fixed.buggy": {"num_bytes": 2014594, "checksum": "21107528c3b25bfdec24d0c4c18a953de31c26f3795a7d7c9e108a60396bcd38"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/medium/test.buggy-fixed.fixed": {"num_bytes": 1970531, "checksum": "4b13298647e9a782bf908d4a26710e97a1846f5513a9bf1aa46ac8223fb84b3d"}}, "download_size": 39979724, "post_processing_size": null, "dataset_size": 40765248, "size_in_bytes": 80744972}, "small": {"description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement\n\nWe use the dataset released by this paper(https://arxiv.org/pdf/1812.08693.pdf). The source side is a Java function with bugs and the target side is the refined one. All the function and variable names are normalized. 
Their dataset contains two subsets ( i.e.small and medium) based on the function length.", "citation": "@article{10.1145/3340544,\nauthor = {Tufano, Michele and Watson, Cody and Bavota, Gabriele and Penta, Massimiliano Di and White, Martin and Poshyvanyk, Denys},\ntitle = {An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation},\nyear = {2019},\nissue_date = {October 2019},\npublisher = {Association for Computing Machinery},\naddress = {New York, NY, USA},\nvolume = {28},\nnumber = {4},\nissn = {1049-331X},\nurl = {https://doi-org.proxy.wm.edu/10.1145/3340544},\ndoi = {10.1145/3340544},\nabstract = {Millions of open source projects with numerous bug fixes are available in code repositories. This proliferation of software development histories can be leveraged to learn how to fix common programming bugs. To explore such a potential, we perform an empirical study to assess the feasibility of using Neural Machine Translation techniques for learning bug-fixing patches for real defects. First, we mine millions of bug-fixes from the change histories of projects hosted on GitHub in order to extract meaningful examples of such bug-fixes. Next, we abstract the buggy and corresponding fixed code, and use them to train an Encoder-Decoder model able to translate buggy code into its fixed version. In our empirical investigation, we found that such a model is able to fix thousands of unique buggy methods in the wild. Overall, this model is capable of predicting fixed patches generated by developers in 9--50% of the cases, depending on the number of candidate patches we allow it to generate. Also, the model is able to emulate a variety of different Abstract Syntax Tree operations and generate candidate patches in a split second.},\njournal = {ACM Trans. Softw. Eng. 
Methodol.},\nmonth = sep,\narticleno = {19},\nnumpages = {29},\nkeywords = {bug-fixes, Neural machine translation}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/code-refinement", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "buggy": {"dtype": "string", "id": null, "_type": "Value"}, "fixed": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "fixed", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_code_refinement", "config_name": "small", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 13006719, "num_examples": 46680, "dataset_name": "code_x_glue_cc_code_refinement"}, "validation": {"name": "validation", "num_bytes": 1629250, "num_examples": 5835, "dataset_name": "code_x_glue_cc_code_refinement"}, "test": {"name": "test", "num_bytes": 1619708, "num_examples": 5835, "dataset_name": "code_x_glue_cc_code_refinement"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/small/train.buggy-fixed.buggy": {"num_bytes": 6509949, "checksum": "dfb4366dedb73dd40f78c3af870ccb0a1aeff2d9ceb45585df26c99897740748"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/small/train.buggy-fixed.fixed": {"num_bytes": 5936570, "checksum": "c98b1139265d33e787a9dd742a464e7eb5bd137ebb3fcb54f0416ee7672739f3"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/small/valid.buggy-fixed.buggy": {"num_bytes": 815315, "checksum": "e83a0c524cdce5a4492dfe0c9bc7d642aa5ed267ddbae5ffd1e563fe54cae6b8"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/small/valid.buggy-fixed.fixed": {"num_bytes": 743907, "checksum": 
"dc9c23594350988dcdaf456a1e8eea1dc86c81d1c0ff9985ff6d63d15196ffd2"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/small/test.buggy-fixed.buggy": {"num_bytes": 809941, "checksum": "d2e675094f471b3bfbb9419eb7cf14dca11f8d41185319d3310cbc3f517df323"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data/small/test.buggy-fixed.fixed": {"num_bytes": 739739, "checksum": "e7ec462d00d253ddec3d1cef7a06cfc99db1cfe94039a6607ed14afeef6be04a"}}, "download_size": 15555421, "post_processing_size": null, "dataset_size": 16255677, "size_in_bytes": 31811098}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_code_refinement/dummy/medium/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_code_refinement/dummy/medium/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..ddb8a5e2273 Binary files /dev/null and b/datasets/code_x_glue_cc_code_refinement/dummy/medium/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_refinement/dummy/small/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_code_refinement/dummy/small/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..941396551d6 Binary files /dev/null and b/datasets/code_x_glue_cc_code_refinement/dummy/small/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_refinement/generated_definitions.py b/datasets/code_x_glue_cc_code_refinement/generated_definitions.py new file mode 100644 index 00000000000..088c8aa3560 --- /dev/null +++ b/datasets/code_x_glue_cc_code_refinement/generated_definitions.py @@ -0,0 +1,24 @@ +DEFINITIONS = { + "medium": { + "class_name": "CodeXGlueCcCodeRefinement", + "dataset_type": "Code-Code", + "description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement", + "dir_name": "code-refinement", + "name": "medium", + "parameters": {"size": "medium"}, + "project_url": 
"https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/code-refinement", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data", + "sizes": {"test": 6545, "train": 52364, "validation": 6546}, + }, + "small": { + "class_name": "CodeXGlueCcCodeRefinement", + "dataset_type": "Code-Code", + "description": "CodeXGLUE code-refinement dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-refinement", + "dir_name": "code-refinement", + "name": "small", + "parameters": {"size": "small"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/code-refinement", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-refinement/data", + "sizes": {"test": 5835, "train": 46680, "validation": 5835}, + }, +} diff --git a/datasets/code_x_glue_cc_code_to_code_trans/README.md b/datasets/code_x_glue_cc_code_to_code_trans/README.md new file mode 100644 index 00000000000..0cba606fce9 --- /dev/null +++ b/datasets/code_x_glue_cc_code_to_code_trans/README.md @@ -0,0 +1,162 @@ +--- +annotations_creators: +- expert-generated +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- other-programming-languages +size_categories: +- 10K List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_cc_code_to_code_trans/common.py b/datasets/code_x_glue_cc_code_to_code_trans/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_cc_code_to_code_trans/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + 
title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. + return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_code_to_code_trans/dataset_infos.json b/datasets/code_x_glue_cc_code_to_code_trans/dataset_infos.json new 
file mode 100644 index 00000000000..869f4ae9fb3 --- /dev/null +++ b/datasets/code_x_glue_cc_code_to_code_trans/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "CodeXGLUE code-to-code-trans dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans\n\nThe dataset is collected from several public repos, including Lucene(http://lucene.apache.org/), POI(http://poi.apache.org/), JGit(https://github.com/eclipse/jgit/) and Antlr(https://github.com/antlr/).\n We collect both the Java and C# versions of the codes and find the parallel functions. After removing duplicates and functions with the empty body, we split the whole dataset into training, validation and test sets.", "citation": "@article{DBLP:journals/corr/abs-2102-04664,\n author = {Shuai Lu and\n Daya Guo and\n Shuo Ren and\n Junjie Huang and\n Alexey Svyatkovskiy and\n Ambrosio Blanco and\n Colin B. Clement and\n Dawn Drain and\n Daxin Jiang and\n Duyu Tang and\n Ge Li and\n Lidong Zhou and\n Linjun Shou and\n Long Zhou and\n Michele Tufano and\n Ming Gong and\n Ming Zhou and\n Nan Duan and\n Neel Sundaresan and\n Shao Kun Deng and\n Shengyu Fu and\n Shujie Liu},\n title = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding\n and Generation},\n journal = {CoRR},\n volume = {abs/2102.04664},\n year = {2021}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/code-to-code-trans", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "java": {"dtype": "string", "id": null, "_type": "Value"}, "cs": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "code_x_glue_cc_code_to_code_trans", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 4372657, "num_examples": 10300, "dataset_name": "code_x_glue_cc_code_to_code_trans"}, "validation": {"name": "validation", "num_bytes": 226415, "num_examples": 500, "dataset_name": "code_x_glue_cc_code_to_code_trans"}, "test": {"name": "test", "num_bytes": 418595, "num_examples": 1000, "dataset_name": "code_x_glue_cc_code_to_code_trans"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data/train.java-cs.txt.cs": {"num_bytes": 2387613, "checksum": "8f9e154e38b17cf19840a44c50a00b6fa16397336c302e3cf514b29ddfafa0e9"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data/train.java-cs.txt.java": {"num_bytes": 1861428, "checksum": "3d2ba1a8f5de30688663ce76bf9b061574d330fc54eb08c4b7eccda74f42be67"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data/valid.java-cs.txt.cs": {"num_bytes": 124022, "checksum": "687c61db799e9e3369a0822184ba67bb5b007c48025f25d44084cc6f525ce4ea"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data/valid.java-cs.txt.java": {"num_bytes": 96385, "checksum": "aed88f2a31af5b6367100bfbca6d9c4888fa63685502b21db817d8b0f0ad5272"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data/test.java-cs.txt.cs": {"num_bytes": 229147, "checksum": "4137527f96c898372e368c75deb3ec8c17c1187ac5a1ae641da1df65e143cd2d"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data/test.java-cs.txt.java": {"num_bytes": 177440, "checksum": "cad0fb08ae59443baeeb1f58de3af83786358dac8ce3a81fd026708ca1b9b2ee"}}, "download_size": 4876035, "post_processing_size": null, "dataset_size": 5017667, "size_in_bytes": 9893702}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_code_to_code_trans/dummy/default/0.0.0/dummy_data.zip 
b/datasets/code_x_glue_cc_code_to_code_trans/dummy/default/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..3af26db61d0 Binary files /dev/null and b/datasets/code_x_glue_cc_code_to_code_trans/dummy/default/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_code_to_code_trans/generated_definitions.py b/datasets/code_x_glue_cc_code_to_code_trans/generated_definitions.py new file mode 100644 index 00000000000..f5f0758f6df --- /dev/null +++ b/datasets/code_x_glue_cc_code_to_code_trans/generated_definitions.py @@ -0,0 +1,12 @@ +DEFINITIONS = { + "default": { + "class_name": "CodeXGlueCcCodeToCodeTrans", + "dataset_type": "Code-Code", + "description": "CodeXGLUE code-to-code-trans dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/code-to-code-trans", + "dir_name": "code-to-code-trans", + "name": "default", + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/code-to-code-trans", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/code-to-code-trans/data", + "sizes": {"test": 1000, "train": 10300, "validation": 500}, + } +} diff --git a/datasets/code_x_glue_cc_defect_detection/README.md b/datasets/code_x_glue_cc_defect_detection/README.md new file mode 100644 index 00000000000..495739cd3fb --- /dev/null +++ b/datasets/code_x_glue_cc_defect_detection/README.md @@ -0,0 +1,167 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +licenses: +- other-C-UDA +multilinguality: +- other-programming-languages +size_categories: +- 10Koutdev);\n if (chr == NULL) {\n error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,\n \"Device '%s' not found\", s->outdev);\n qemu_chr_fe_init(&s->chr_out, chr, errp);", + "id": 8, + "project": "qemu", + "target": true +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. 
+ +#### default + +|field name| type | description | +|----------|------|------------------------------------------| +|id |int32 | Index of the sample | +|func |string| The source code | +|target |bool | 0 or 1 (vulnerability or not) | +|project |string| Original project that contains this code | +|commit_id |string| Commit identifier in the original project| + +### Data Splits + +| name |train|validation|test| +|-------|----:|---------:|---:| +|default|21854| 2732|2732| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +[More Information Needed] + +#### Who are the source language producers? + +[More Information Needed] + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? + +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. + +### Citation Information + +``` +@inproceedings{zhou2019devign, +title={Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks}, +author={Zhou, Yaqin and Liu, Shangqing and Siow, Jingkai and Du, Xiaoning and Liu, Yang}, +booktitle={Advances in Neural Information Processing Systems}, +pages={10197--10207}, year={2019} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. 
\ No newline at end of file diff --git a/datasets/code_x_glue_cc_defect_detection/code_x_glue_cc_defect_detection.py b/datasets/code_x_glue_cc_defect_detection/code_x_glue_cc_defect_detection.py new file mode 100644 index 00000000000..b65b0693f99 --- /dev/null +++ b/datasets/code_x_glue_cc_defect_detection/code_x_glue_cc_defect_detection.py @@ -0,0 +1,78 @@ +from typing import List + +import datasets + +from .common import TrainValidTestChild +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """Given a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code. +The dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.""" +_CITATION = """@inproceedings{zhou2019devign, +title={Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks}, +author={Zhou, Yaqin and Liu, Shangqing and Siow, Jingkai and Du, Xiaoning and Liu, Yang}, +booktitle={Advances in Neural Information Processing Systems}, +pages={10197--10207}, year={2019}""" + + +class CodeXGlueCcDefectDetectionImpl(TrainValidTestChild): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "func": datasets.Value("string"), # The source code + "target": datasets.Value("bool"), # 0 or 1 (vulnerability or not) + "project": datasets.Value("string"), # Original project that contains this code + "commit_id": datasets.Value("string"), # Commit identifier in the original project + } + _SUPERVISED_KEYS = ["target"] + + def generate_urls(self, split_name): + yield "index", f"{split_name}.txt" + 
yield "data", "function.json" + + def _generate_examples(self, split_name, file_paths): + import json + + js_all = json.load(open(file_paths["data"], encoding="utf-8")) + + index = set() + with open(file_paths["index"], encoding="utf-8") as f: + for line in f: + line = line.strip() + index.add(int(line)) + + for idx, js in enumerate(js_all): + if idx in index: + js["id"] = idx + js["target"] = int(js["target"]) == 1 + yield idx, js + + +CLASS_MAPPING = { + "CodeXGlueCcDefectDetection": CodeXGlueCcDefectDetectionImpl, +} + + +class CodeXGlueCcDefectDetection(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = datasets.BuilderConfig + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=name, description=info["description"]) for name, info in DEFINITIONS.items() + ] + + def _info(self): + name = self.config.name + info = DEFINITIONS[name] + if info["class_name"] in CLASS_MAPPING: + self.child = CLASS_MAPPING[info["class_name"]](info) + else: + raise RuntimeError(f"Unknown python class for dataset configuration {name}") + ret = self.child._info() + return ret + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_cc_defect_detection/common.py b/datasets/code_x_glue_cc_defect_detection/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_cc_defect_detection/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": 
datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. + return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_cc_defect_detection/dataset_infos.json b/datasets/code_x_glue_cc_defect_detection/dataset_infos.json new file mode 100644 index 00000000000..1aa69c01d6f --- /dev/null +++ b/datasets/code_x_glue_cc_defect_detection/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "CodeXGLUE Defect-detection 
dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection\n\nGiven a source code, the task is to identify whether it is an insecure code that may attack software systems, such as resource leaks, use-after-free vulnerabilities and DoS attack. We treat the task as binary classification (0/1), where 1 stands for insecure code and 0 for secure code.\nThe dataset we use comes from the paper Devign: Effective Vulnerability Identification by Learning Comprehensive Program Semantics via Graph Neural Networks. We combine all projects and split 80%/10%/10% for training/dev/test.", "citation": "@inproceedings{zhou2019devign,\ntitle={Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks},\nauthor={Zhou, Yaqin and Liu, Shangqing and Siow, Jingkai and Du, Xiaoning and Liu, Yang},\nbooktitle={Advances in Neural Information Processing Systems},\npages={10197--10207}, year={2019}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/Defect-detection", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "func": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"dtype": "bool", "id": null, "_type": "Value"}, "project": {"dtype": "string", "id": null, "_type": "Value"}, "commit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "target", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_cc_defect_detection", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 45723487, "num_examples": 21854, "dataset_name": "code_x_glue_cc_defect_detection"}, "validation": {"name": "validation", "num_bytes": 5582545, "num_examples": 2732, "dataset_name": "code_x_glue_cc_defect_detection"}, "test": {"name": "test", 
"num_bytes": 5646752, "num_examples": 2732, "dataset_name": "code_x_glue_cc_defect_detection"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Defect-detection/dataset/train.txt": {"num_bytes": 122185, "checksum": "f0a25410594302a9f0e542a393ad82ad479308a8aa471f4d6cf61b91d6d572bf"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Defect-detection/dataset/function.json": {"num_bytes": 61532917, "checksum": "0a3b2d561dc6280e53795886ede727d0045c016d083905ba3e9ce384a7eab246"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Defect-detection/dataset/valid.txt": {"num_bytes": 15295, "checksum": "9f2fa1e108955f197d4a7fa2aa2c7f5e542457b51e0eb1f6e890172d6f700a6e"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Defect-detection/dataset/test.txt": {"num_bytes": 15318, "checksum": "b5336b337170ea1edf0570b69edb5a90e3c99bf41cd92909795f5fe32d376d52"}}, "download_size": 61685715, "post_processing_size": null, "dataset_size": 56952784, "size_in_bytes": 118638499}} \ No newline at end of file diff --git a/datasets/code_x_glue_cc_defect_detection/dummy/default/0.0.0/dummy_data.zip b/datasets/code_x_glue_cc_defect_detection/dummy/default/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..52825fe4214 Binary files /dev/null and b/datasets/code_x_glue_cc_defect_detection/dummy/default/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_cc_defect_detection/generated_definitions.py b/datasets/code_x_glue_cc_defect_detection/generated_definitions.py new file mode 100644 index 00000000000..cf6213c19d0 --- /dev/null +++ b/datasets/code_x_glue_cc_defect_detection/generated_definitions.py @@ -0,0 +1,12 @@ +DEFINITIONS = { + "default": { + "class_name": "CodeXGlueCcDefectDetection", + "dataset_type": "Code-Code", + "description": "CodeXGLUE Defect-detection dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection", + 
"dir_name": "Defect-detection", + "name": "default", + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Code/Defect-detection", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Code/Defect-detection/dataset", + "sizes": {"test": 2732, "train": 21854, "validation": 2732}, + } +} diff --git a/datasets/code_x_glue_ct_code_to_text/README.md b/datasets/code_x_glue_ct_code_to_text/README.md new file mode 100644 index 00000000000..10d2a8d8d61 --- /dev/null +++ b/datasets/code_x_glue_ct_code_to_text/README.md @@ -0,0 +1,311 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +- en +licenses: +- other-C-UDA +multilinguality: +- other-programming-languages +size_categories: + go: + - 100K256 +- Remove examples that documents contain special tokens (e.g. or https:...) +- Remove examples that documents are not English. + +### Supported Tasks and Leaderboards + +- `machine-translation`: The dataset can be used to train a model for automatically generating **English** docstrings for code. + +### Languages + +- Go **programming** language +- Java **programming** language +- Javascript **programming** language +- PHP **programming** language +- Python **programming** language +- Ruby **programming** language +- English **natural** language + +## Dataset Structure + +### Data Instances + +#### go + +An example of 'test' looks as follows. 
+``` +{ + "code": "func NewSTM(c *v3.Client, apply func(STM) error, so ...stmOption) (*v3.TxnResponse, error) {\n\topts := &stmOptions{ctx: c.Ctx()}\n\tfor _, f := range so {\n\t\tf(opts)\n\t}\n\tif len(opts.prefetch) != 0 {\n\t\tf := apply\n\t\tapply = func(s STM) error {\n\t\t\ts.Get(opts.prefetch...)\n\t\t\treturn f(s)\n\t\t}\n\t}\n\treturn runSTM(mkSTM(c, opts), apply)\n}", + "code_tokens": ["func", "NewSTM", "(", "c", "*", "v3", ".", "Client", ",", "apply", "func", "(", "STM", ")", "error", ",", "so", "...", "stmOption", ")", "(", "*", "v3", ".", "TxnResponse", ",", "error", ")", "{", "opts", ":=", "&", "stmOptions", "{", "ctx", ":", "c", ".", "Ctx", "(", ")", "}", "\n", "for", "_", ",", "f", ":=", "range", "so", "{", "f", "(", "opts", ")", "\n", "}", "\n", "if", "len", "(", "opts", ".", "prefetch", ")", "!=", "0", "{", "f", ":=", "apply", "\n", "apply", "=", "func", "(", "s", "STM", ")", "error", "{", "s", ".", "Get", "(", "opts", ".", "prefetch", "...", ")", "\n", "return", "f", "(", "s", ")", "\n", "}", "\n", "}", "\n", "return", "runSTM", "(", "mkSTM", "(", "c", ",", "opts", ")", ",", "apply", ")", "\n", "}"], + "docstring": "// NewSTM initiates a new STM instance, using serializable snapshot isolation by default.", + "docstring_tokens": ["NewSTM", "initiates", "a", "new", "STM", "instance", "using", "serializable", "snapshot", "isolation", "by", "default", "."], + "func_name": "NewSTM", + "id": 0, + "language": "go", + "original_string": "func NewSTM(c *v3.Client, apply func(STM) error, so ...stmOption) (*v3.TxnResponse, error) {\n\topts := &stmOptions{ctx: c.Ctx()}\n\tfor _, f := range so {\n\t\tf(opts)\n\t}\n\tif len(opts.prefetch) != 0 {\n\t\tf := apply\n\t\tapply = func(s STM) error {\n\t\t\ts.Get(opts.prefetch...)\n\t\t\treturn f(s)\n\t\t}\n\t}\n\treturn runSTM(mkSTM(c, opts), apply)\n}", + "path": "clientv3/concurrency/stm.go", + "repo": "etcd-io/etcd", + "sha": "616592d9ba993e3fe9798eef581316016df98906", + "url": 
"https://github.com/etcd-io/etcd/blob/616592d9ba993e3fe9798eef581316016df98906/clientv3/concurrency/stm.go#L89-L102" +} +``` + +#### java + +An example of 'test' looks as follows. +``` +{ + "code": "protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\n final Observer observer = downstream;\n final SimplePlainQueue q = queue;\n\n if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n if (q.isEmpty()) {\n accept(observer, value);\n if (leave(-1) == 0) {\n return;\n }\n } else {\n q.offer(value);\n }\n } else {\n q.offer(value);\n if (!enter()) {\n return;\n }\n }\n QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\n }", + "code_tokens": ["protected", "final", "void", "fastPathOrderedEmit", "(", "U", "value", ",", "boolean", "delayError", ",", "Disposable", "disposable", ")", "{", "final", "Observer", "<", "?", "super", "V", ">", "observer", "=", "downstream", ";", "final", "SimplePlainQueue", "<", "U", ">", "q", "=", "queue", ";", "if", "(", "wip", ".", "get", "(", ")", "==", "0", "&&", "wip", ".", "compareAndSet", "(", "0", ",", "1", ")", ")", "{", "if", "(", "q", ".", "isEmpty", "(", ")", ")", "{", "accept", "(", "observer", ",", "value", ")", ";", "if", "(", "leave", "(", "-", "1", ")", "==", "0", ")", "{", "return", ";", "}", "}", "else", "{", "q", ".", "offer", "(", "value", ")", ";", "}", "}", "else", "{", "q", ".", "offer", "(", "value", ")", ";", "if", "(", "!", "enter", "(", ")", ")", "{", "return", ";", "}", "}", "QueueDrainHelper", ".", "drainLoop", "(", "q", ",", "observer", ",", "delayError", ",", "disposable", ",", "this", ")", ";", "}"], + "docstring": "Makes sure the fast-path emits in order.\n@param value the value to emit or queue up\n@param delayError if true, errors are delayed until the source has terminated\n@param disposable the resource to dispose if the drain terminates", + "docstring_tokens": ["Makes", "sure", "the", "fast", "-", "path", "emits", "in", "order", 
"."], + "func_name": "QueueDrainObserver.fastPathOrderedEmit", + "id": 0, + "language": "java", + "original_string": "protected final void fastPathOrderedEmit(U value, boolean delayError, Disposable disposable) {\n final Observer observer = downstream;\n final SimplePlainQueue q = queue;\n\n if (wip.get() == 0 && wip.compareAndSet(0, 1)) {\n if (q.isEmpty()) {\n accept(observer, value);\n if (leave(-1) == 0) {\n return;\n }\n } else {\n q.offer(value);\n }\n } else {\n q.offer(value);\n if (!enter()) {\n return;\n }\n }\n QueueDrainHelper.drainLoop(q, observer, delayError, disposable, this);\n }", + "path": "src/main/java/io/reactivex/internal/observers/QueueDrainObserver.java", + "repo": "ReactiveX/RxJava", + "sha": "ac84182aa2bd866b53e01c8e3fe99683b882c60e", + "url": "https://github.com/ReactiveX/RxJava/blob/ac84182aa2bd866b53e01c8e3fe99683b882c60e/src/main/java/io/reactivex/internal/observers/QueueDrainObserver.java#L88-L108" +} +``` + +#### javascript + +An example of 'test' looks as follows. 
+``` +{ + "code": "function createInstance(defaultConfig) {\n var context = new Axios(defaultConfig);\n var instance = bind(Axios.prototype.request, context);\n\n // Copy axios.prototype to instance\n utils.extend(instance, Axios.prototype, context);\n\n // Copy context to instance\n utils.extend(instance, context);\n\n return instance;\n}", + "code_tokens": ["function", "createInstance", "(", "defaultConfig", ")", "{", "var", "context", "=", "new", "Axios", "(", "defaultConfig", ")", ";", "var", "instance", "=", "bind", "(", "Axios", ".", "prototype", ".", "request", ",", "context", ")", ";", "// Copy axios.prototype to instance", "utils", ".", "extend", "(", "instance", ",", "Axios", ".", "prototype", ",", "context", ")", ";", "// Copy context to instance", "utils", ".", "extend", "(", "instance", ",", "context", ")", ";", "return", "instance", ";", "}"], + "docstring": "Create an instance of Axios\n\n@param {Object} defaultConfig The default config for the instance\n@return {Axios} A new instance of Axios", + "docstring_tokens": ["Create", "an", "instance", "of", "Axios"], + "func_name": "createInstance", + "id": 0, + "language": "javascript", + "original_string": "function createInstance(defaultConfig) {\n var context = new Axios(defaultConfig);\n var instance = bind(Axios.prototype.request, context);\n\n // Copy axios.prototype to instance\n utils.extend(instance, Axios.prototype, context);\n\n // Copy context to instance\n utils.extend(instance, context);\n\n return instance;\n}", + "path": "lib/axios.js", + "repo": "axios/axios", + "sha": "92d231387fe2092f8736bc1746d4caa766b675f5", + "url": "https://github.com/axios/axios/blob/92d231387fe2092f8736bc1746d4caa766b675f5/lib/axios.js#L15-L26" +} +``` + +#### php + +An example of 'train' looks as follows. 
+``` +{ + "code": "public static function build($serviceAddress, $restConfigPath, array $config = [])\n {\n $config += [\n 'httpHandler' => null,\n ];\n list($baseUri, $port) = self::normalizeServiceAddress($serviceAddress);\n $requestBuilder = new RequestBuilder(\"$baseUri:$port\", $restConfigPath);\n $httpHandler = $config['httpHandler'] ?: self::buildHttpHandlerAsync();\n return new RestTransport($requestBuilder, $httpHandler);\n }", + "code_tokens": ["public", "static", "function", "build", "(", "$", "serviceAddress", ",", "$", "restConfigPath", ",", "array", "$", "config", "=", "[", "]", ")", "{", "$", "config", "+=", "[", "'httpHandler'", "=>", "null", ",", "]", ";", "list", "(", "$", "baseUri", ",", "$", "port", ")", "=", "self", "::", "normalizeServiceAddress", "(", "$", "serviceAddress", ")", ";", "$", "requestBuilder", "=", "new", "RequestBuilder", "(", "\"$baseUri:$port\"", ",", "$", "restConfigPath", ")", ";", "$", "httpHandler", "=", "$", "config", "[", "'httpHandler'", "]", "?", ":", "self", "::", "buildHttpHandlerAsync", "(", ")", ";", "return", "new", "RestTransport", "(", "$", "requestBuilder", ",", "$", "httpHandler", ")", ";", "}"], + "docstring": "Builds a RestTransport.\n\n@param string $serviceAddress\nThe address of the API remote host, for example \"example.googleapis.com\".\n@param string $restConfigPath\nPath to rest config file.\n@param array $config {\nConfig options used to construct the gRPC transport.\n\n@type callable $httpHandler A handler used to deliver PSR-7 requests.\n}\n@return RestTransport\n@throws ValidationException", + "docstring_tokens": ["Builds", "a", "RestTransport", "."], + "func_name": "RestTransport.build", + "id": 0, + "language": "php", + "original_string": "public static function build($serviceAddress, $restConfigPath, array $config = [])\n {\n $config += [\n 'httpHandler' => null,\n ];\n list($baseUri, $port) = self::normalizeServiceAddress($serviceAddress);\n $requestBuilder = new 
RequestBuilder(\"$baseUri:$port\", $restConfigPath);\n $httpHandler = $config['httpHandler'] ?: self::buildHttpHandlerAsync();\n return new RestTransport($requestBuilder, $httpHandler);\n }", + "path": "src/Transport/RestTransport.php", + "repo": "googleapis/gax-php", + "sha": "48387fb818c6882296710a2302a0aa973b99afb2", + "url": "https://github.com/googleapis/gax-php/blob/48387fb818c6882296710a2302a0aa973b99afb2/src/Transport/RestTransport.php#L85-L94" +} +``` + +#### python + +An example of 'validation' looks as follows. +``` +{ + "code": "def save_act(self, path=None):\n \"\"\"Save model to a pickle located at `path`\"\"\"\n if path is None:\n path = os.path.join(logger.get_dir(), \"model.pkl\")\n\n with tempfile.TemporaryDirectory() as td:\n save_variables(os.path.join(td, \"model\"))\n arc_name = os.path.join(td, \"packed.zip\")\n with zipfile.ZipFile(arc_name, 'w') as zipf:\n for root, dirs, files in os.walk(td):\n for fname in files:\n file_path = os.path.join(root, fname)\n if file_path != arc_name:\n zipf.write(file_path, os.path.relpath(file_path, td))\n with open(arc_name, \"rb\") as f:\n model_data = f.read()\n with open(path, \"wb\") as f:\n cloudpickle.dump((model_data, self._act_params), f)", + "code_tokens": ["def", "save_act", "(", "self", ",", "path", "=", "None", ")", ":", "if", "path", "is", "None", ":", "path", "=", "os", ".", "path", ".", "join", "(", "logger", ".", "get_dir", "(", ")", ",", "\"model.pkl\"", ")", "with", "tempfile", ".", "TemporaryDirectory", "(", ")", "as", "td", ":", "save_variables", "(", "os", ".", "path", ".", "join", "(", "td", ",", "\"model\"", ")", ")", "arc_name", "=", "os", ".", "path", ".", "join", "(", "td", ",", "\"packed.zip\"", ")", "with", "zipfile", ".", "ZipFile", "(", "arc_name", ",", "'w'", ")", "as", "zipf", ":", "for", "root", ",", "dirs", ",", "files", "in", "os", ".", "walk", "(", "td", ")", ":", "for", "fname", "in", "files", ":", "file_path", "=", "os", ".", "path", ".", "join", "(", 
"root", ",", "fname", ")", "if", "file_path", "!=", "arc_name", ":", "zipf", ".", "write", "(", "file_path", ",", "os", ".", "path", ".", "relpath", "(", "file_path", ",", "td", ")", ")", "with", "open", "(", "arc_name", ",", "\"rb\"", ")", "as", "f", ":", "model_data", "=", "f", ".", "read", "(", ")", "with", "open", "(", "path", ",", "\"wb\"", ")", "as", "f", ":", "cloudpickle", ".", "dump", "(", "(", "model_data", ",", "self", ".", "_act_params", ")", ",", "f", ")"], + "docstring": "Save model to a pickle located at `path`", + "docstring_tokens": ["Save", "model", "to", "a", "pickle", "located", "at", "path"], + "func_name": "ActWrapper.save_act", + "id": 0, + "language": "python", + "original_string": "def save_act(self, path=None):\n \"\"\"Save model to a pickle located at `path`\"\"\"\n if path is None:\n path = os.path.join(logger.get_dir(), \"model.pkl\")\n\n with tempfile.TemporaryDirectory() as td:\n save_variables(os.path.join(td, \"model\"))\n arc_name = os.path.join(td, \"packed.zip\")\n with zipfile.ZipFile(arc_name, 'w') as zipf:\n for root, dirs, files in os.walk(td):\n for fname in files:\n file_path = os.path.join(root, fname)\n if file_path != arc_name:\n zipf.write(file_path, os.path.relpath(file_path, td))\n with open(arc_name, \"rb\") as f:\n model_data = f.read()\n with open(path, \"wb\") as f:\n cloudpickle.dump((model_data, self._act_params), f)", + "path": "baselines/deepq/deepq.py", + "repo": "openai/baselines", + "sha": "3301089b48c42b87b396e246ea3f56fa4bfc9678", + "url": "https://github.com/openai/baselines/blob/3301089b48c42b87b396e246ea3f56fa4bfc9678/baselines/deepq/deepq.py#L55-L72" +} +``` + +#### ruby + +An example of 'train' looks as follows. 
+``` +{ + "code": "def render_body(context, options)\n if options.key?(:partial)\n [render_partial(context, options)]\n else\n StreamingTemplateRenderer.new(@lookup_context).render(context, options)\n end\n end", + "code_tokens": ["def", "render_body", "(", "context", ",", "options", ")", "if", "options", ".", "key?", "(", ":partial", ")", "[", "render_partial", "(", "context", ",", "options", ")", "]", "else", "StreamingTemplateRenderer", ".", "new", "(", "@lookup_context", ")", ".", "render", "(", "context", ",", "options", ")", "end", "end"], + "docstring": "Render but returns a valid Rack body. If fibers are defined, we return\n a streaming body that renders the template piece by piece.\n\n Note that partials are not supported to be rendered with streaming,\n so in such cases, we just wrap them in an array.", + "docstring_tokens": ["Render", "but", "returns", "a", "valid", "Rack", "body", ".", "If", "fibers", "are", "defined", "we", "return", "a", "streaming", "body", "that", "renders", "the", "template", "piece", "by", "piece", "."], + "func_name": "ActionView.Renderer.render_body", + "id": 0, + "language": "ruby", + "original_string": "def render_body(context, options)\n if options.key?(:partial)\n [render_partial(context, options)]\n else\n StreamingTemplateRenderer.new(@lookup_context).render(context, options)\n end\n end", + "path": "actionview/lib/action_view/renderer/renderer.rb", + "repo": "rails/rails", + "sha": "85a8bc644be69908f05740a5886ec19cd3679df5", + "url": "https://github.com/rails/rails/blob/85a8bc644be69908f05740a5886ec19cd3679df5/actionview/lib/action_view/renderer/renderer.rb#L38-L44" +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. 
+ +#### go, java, javascript, php, python, ruby + +| field name | type | description | +|----------------|----------------|-----------------------------------------------------------------------------------| +|id |int32 | Index of the sample | +|repo |string | repo: the owner/repo | +|path |string | path: the full path to the original file | +|func_name |string | func_name: the function or method name | +|original_string |string | original_string: the raw string before tokenization or parsing | +|language |string | language: the programming language name | +|code |string | code/function: the part of the original_string that is code | +|code_tokens |Sequence[string]| code_tokens/function_tokens: tokenized version of code | +|docstring |string | docstring: the top-level comment or docstring, if it exists in the original string| +|docstring_tokens|Sequence[string]| docstring_tokens: tokenized version of docstring | +|sha |string | sha of the file | +|url |string | url of the file | + +### Data Splits + +| name |train |validation|test | +|----------|-----:|---------:|----:| +|go |167288| 7325| 8122| +|java |164923| 5183|10955| +|javascript| 58025| 3885| 3291| +|php |241241| 12982|14014| +|python |251820| 13914|14918| +|ruby | 24927| 1400| 1261| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + +#### Initial Data Collection and Normalization + +Data from CodeSearchNet Challenge dataset. +[More Information Needed] + +#### Who are the source language producers? + +Software Engineering developers. + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? 
+ +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. + +### Citation Information + +``` +@article{husain2019codesearchnet, + title={Codesearchnet challenge: Evaluating the state of semantic code search}, + author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal={arXiv preprint arXiv:1909.09436}, + year={2019} +} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. diff --git a/datasets/code_x_glue_ct_code_to_text/code_x_glue_ct_code_to_text.py b/datasets/code_x_glue_ct_code_to_text/code_x_glue_ct_code_to_text.py new file mode 100644 index 00000000000..fdf291164e7 --- /dev/null +++ b/datasets/code_x_glue_ct_code_to_text/code_x_glue_ct_code_to_text.py @@ -0,0 +1,155 @@ +import json +import os +import os.path +from typing import List + +import datasets + +from .common import TrainValidTestChild +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """The dataset we use comes from CodeSearchNet and we filter the dataset as the following: +- Remove examples that codes cannot be parsed into an abstract syntax tree. +- Remove examples that #tokens of documents is < 3 or >256 +- Remove examples that documents contain special tokens (e.g. or https:...) +- Remove examples that documents are not English. 
+""" +_CITATION = """@article{husain2019codesearchnet, +title={Codesearchnet challenge: Evaluating the state of semantic code search}, +author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, +journal={arXiv preprint arXiv:1909.09436}, +year={2019} +}""" + + +class CodeXGlueCtCodeToTextBaseImpl(TrainValidTestChild): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + # For each file, each line in the uncompressed file represents one function. + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "repo": datasets.Value("string"), # repo: the owner/repo + "path": datasets.Value("string"), # path: the full path to the original file + "func_name": datasets.Value("string"), # func_name: the function or method name + "original_string": datasets.Value("string"), # original_string: the raw string before tokenization or parsing + "language": datasets.Value("string"), # language: the programming language name + "code": datasets.Value("string"), # code/function: the part of the original_string that is code + "code_tokens": datasets.features.Sequence( + datasets.Value("string") + ), # code_tokens/function_tokens: tokenized version of code + "docstring": datasets.Value( + "string" + ), # docstring: the top-level comment or docstring, if it exists in the original string + "docstring_tokens": datasets.features.Sequence( + datasets.Value("string") + ), # docstring_tokens: tokenized version of docstring + "sha": datasets.Value("string"), # sha of the file + "url": datasets.Value("string"), # url of the file + } + + _SUPERVISED_KEYS = ["docstring", "docstring_tokens"] + + def generate_urls(self, split_name, language): + yield "language", f"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip" + yield "dataset", "dataset.zip" + + def get_data_files(self, split_name, file_paths, language): + language_specific_path = file_paths["language"] + final_path = 
os.path.join(language_specific_path, language, "final") + # Make some cleanup to save space + for path in os.listdir(final_path): + if path.endswith(".pkl"): + os.unlink(path) + + data_files = [] + for root, dirs, files in os.walk(final_path): + for file in files: + temp = os.path.join(root, file) + if ".jsonl" in temp: + if split_name in temp: + data_files.append(temp) + return data_files + + def post_process(self, split_name, language, js): + return js + + def _generate_examples(self, split_name, file_paths, language): + import gzip + + data_set_path = file_paths["dataset"] + + data_files = self.get_data_files(split_name, file_paths, language) + + urls = {} + f1_path_parts = [data_set_path, "dataset", language, f"{split_name}.txt"] + if self.SINGLE_LANGUAGE: + del f1_path_parts[2] + + f1_path = os.path.join(*f1_path_parts) + with open(f1_path, encoding="utf-8") as f1: + for line in f1: + line = line.strip() + urls[line] = True + + idx = 0 + for file in data_files: + if ".gz" in file: + f = gzip.open(file) + else: + f = open(file, encoding="utf-8") + + for line in f: + line = line.strip() + js = json.loads(line) + if js["url"] in urls: + js["id"] = idx + js = self.post_process(split_name, language, js) + if "partition" in js: + del js["partition"] + yield idx, js + idx += 1 + f.close() + + +class CodeXGlueCtCodeToTextImpl(CodeXGlueCtCodeToTextBaseImpl): + SINGLE_LANGUAGE = False + + def generate_urls(self, split_name): + language = self.info["parameters"]["language"] + for e in super().generate_urls(split_name, language): + yield e + + def _generate_examples(self, split_name, file_paths): + language = self.info["parameters"]["language"] + for e in super()._generate_examples(split_name, file_paths, language): + yield e + + +CLASS_MAPPING = { + "CodeXGlueCtCodeToText": CodeXGlueCtCodeToTextImpl, +} + + +class CodeXGlueCtCodeToText(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = datasets.BuilderConfig + BUILDER_CONFIGS = [ + 
from typing import List

import datasets


# Citation, taken from https://github.com/microsoft/CodeXGLUE
_DEFAULT_CITATION = """@article{CodeXGLUE,
  title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence},
  year={2020},}"""


class Child:
    """Common helper behind every CodeXGLUE dataset builder.

    Subclasses override the class attributes below and ``generate_urls`` /
    ``_generate_examples``; the builder delegates ``_info`` and
    ``_split_generators`` to an instance of this class.
    """

    _DESCRIPTION = None  # task-specific description appended to info["description"]
    _FEATURES = None  # dict of feature name -> datasets feature type
    _CITATION = None  # task-specific citation; falls back to _DEFAULT_CITATION
    SPLITS = {"train": datasets.Split.TRAIN}  # split name -> datasets.Split
    _SUPERVISED_KEYS = None  # optional (input, output) column names

    def __init__(self, info):
        # info: one entry of DEFINITIONS (urls, description, parameters, ...).
        self.info = info

    def homepage(self):
        return self.info["project_url"]

    def _info(self):
        # This is the description that will appear on the datasets page.
        return datasets.DatasetInfo(
            description=self.info["description"] + "\n\n" + self._DESCRIPTION,
            features=datasets.Features(self._FEATURES),
            homepage=self.homepage(),
            citation=self._CITATION or _DEFAULT_CITATION,
            supervised_keys=self._SUPERVISED_KEYS,
        )

    def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
        """Download every split's files and build one SplitGenerator per split."""
        base_url = self.info["raw_url"]
        urls_to_download = {}
        for split in self.SPLITS:
            # IDIOM FIX: the dict is built fresh here and split names are
            # unique, so the original redundant membership check is dropped.
            split_urls = {}
            for key, url in self.generate_urls(split):
                # Relative URLs are resolved against the project's raw_url.
                if not url.startswith("http"):
                    url = base_url + "/" + url
                split_urls[key] = url
            urls_to_download[split] = split_urls

        downloaded_files = {
            split: dl_manager.download_and_extract(urls) for split, urls in urls_to_download.items()
        }

        return [
            datasets.SplitGenerator(
                name=self.SPLITS[split],
                gen_kwargs={"split_name": split, "file_paths": downloaded_files[split]},
            )
            for split in self.SPLITS
        ]

    def check_empty(self, entries):
        """Return True if every value is empty; raise if values are mixed.

        Used by parallel-corpus readers to verify that the parallel files
        stay line-aligned (either all files still have a line, or none do).
        """
        # IDIOM FIX: generator expressions instead of materialized list
        # comprehensions inside all().
        all_empty = all(v == "" for v in entries.values())
        all_non_empty = all(v != "" for v in entries.values())

        if not all_non_empty and not all_empty:
            raise RuntimeError("Parallel data files should have the same number of lines.")

        return all_empty


class TrainValidTestChild(Child):
    """Child variant for datasets shipping train/valid/test splits."""

    SPLITS = {
        "train": datasets.Split.TRAIN,
        "valid": datasets.Split.VALIDATION,
        "test": datasets.Split.TEST,
    }
syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_ct_code_to_text", "config_name": "go", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 342244027, "num_examples": 167288, "dataset_name": "code_x_glue_ct_code_to_text"}, "validation": {"name": "validation", "num_bytes": 13721912, "num_examples": 7325, 
"dataset_name": "code_x_glue_ct_code_to_text"}, "test": {"name": "test", "num_bytes": 16328458, "num_examples": 8122, "dataset_name": "code_x_glue_ct_code_to_text"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/go.zip": {"num_bytes": 487525935, "checksum": "15d23f01dc2796447e1736263e6830079289d5ef41f09988011afdcf8da6b6e5"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text/dataset.zip": {"num_bytes": 12396864, "checksum": "31ec750805302ecd71b278a492d23d2ac916269f7ec645bba4f23b6f7c4bf217"}}, "download_size": 499922799, "post_processing_size": null, "dataset_size": 372294397, "size_in_bytes": 872217196}, "java": {"description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text\n\nThe dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. 
or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_ct_code_to_text", "config_name": "java", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 452554719, "num_examples": 164923, "dataset_name": "code_x_glue_ct_code_to_text"}, "validation": {"name": "validation", "num_bytes": 13366396, "num_examples": 5183, "dataset_name": "code_x_glue_ct_code_to_text"}, "test": {"name": "test", "num_bytes": 29080857, "num_examples": 10955, 
"dataset_name": "code_x_glue_ct_code_to_text"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip": {"num_bytes": 1060569153, "checksum": "05f9204b1808413fab30f0e69229e298f6de4ad468279d53a2aa5797e3a78c17"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text/dataset.zip": {"num_bytes": 12396864, "checksum": "31ec750805302ecd71b278a492d23d2ac916269f7ec645bba4f23b6f7c4bf217"}}, "download_size": 1072966017, "post_processing_size": null, "dataset_size": 495001972, "size_in_bytes": 1567967989}, "javascript": {"description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text\n\nThe dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. 
or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_ct_code_to_text", "config_name": "javascript", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 160860743, "num_examples": 58025, "dataset_name": "code_x_glue_ct_code_to_text"}, "validation": {"name": "validation", "num_bytes": 10337396, "num_examples": 3885, "dataset_name": "code_x_glue_ct_code_to_text"}, "test": {"name": "test", "num_bytes": 10190765, "num_examples": 3291, 
"dataset_name": "code_x_glue_ct_code_to_text"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/javascript.zip": {"num_bytes": 1664713350, "checksum": "fdc743f5af27f90c77584a2d29e2b7f8cecdd00c37b433c385b888ee062936dd"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text/dataset.zip": {"num_bytes": 12396864, "checksum": "31ec750805302ecd71b278a492d23d2ac916269f7ec645bba4f23b6f7c4bf217"}}, "download_size": 1677110214, "post_processing_size": null, "dataset_size": 181388904, "size_in_bytes": 1858499118}, "php": {"description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text\n\nThe dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. 
or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_ct_code_to_text", "config_name": "php", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 614655799, "num_examples": 241241, "dataset_name": "code_x_glue_ct_code_to_text"}, "validation": {"name": "validation", "num_bytes": 33283149, "num_examples": 12982, "dataset_name": "code_x_glue_ct_code_to_text"}, "test": {"name": "test", "num_bytes": 35375097, "num_examples": 14014, 
"dataset_name": "code_x_glue_ct_code_to_text"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/php.zip": {"num_bytes": 851894048, "checksum": "c3bbf0d1b10010f88b058faea876f1f5471758399e30d58c11f78ff53660ce00"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text/dataset.zip": {"num_bytes": 12396864, "checksum": "31ec750805302ecd71b278a492d23d2ac916269f7ec645bba4f23b6f7c4bf217"}}, "download_size": 864290912, "post_processing_size": null, "dataset_size": 683314045, "size_in_bytes": 1547604957}, "python": {"description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text\n\nThe dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. 
or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_ct_code_to_text", "config_name": "python", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 813664500, "num_examples": 251820, "dataset_name": "code_x_glue_ct_code_to_text"}, "validation": {"name": "validation", "num_bytes": 46888668, "num_examples": 13914, "dataset_name": "code_x_glue_ct_code_to_text"}, "test": {"name": "test", "num_bytes": 50659792, "num_examples": 14918, 
"dataset_name": "code_x_glue_ct_code_to_text"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip": {"num_bytes": 940909997, "checksum": "7223c6460bebfa85697b586da91e47bc5d64790a4d60bba5917106458ab6b40e"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text/dataset.zip": {"num_bytes": 12396864, "checksum": "31ec750805302ecd71b278a492d23d2ac916269f7ec645bba4f23b6f7c4bf217"}}, "download_size": 953306861, "post_processing_size": null, "dataset_size": 911212960, "size_in_bytes": 1864519821}, "ruby": {"description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text\n\nThe dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. 
or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_ct_code_to_text", "config_name": "ruby", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 51956595, "num_examples": 24927, "dataset_name": "code_x_glue_ct_code_to_text"}, "validation": {"name": "validation", "num_bytes": 2821089, "num_examples": 1400, "dataset_name": "code_x_glue_ct_code_to_text"}, "test": {"name": "test", "num_bytes": 2671603, "num_examples": 1261, "dataset_name": 
"code_x_glue_ct_code_to_text"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/ruby.zip": {"num_bytes": 111758028, "checksum": "67aee5812d0f994df745c771c7791483f2b060561495747d424e307af4b342e6"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text/dataset.zip": {"num_bytes": 12396864, "checksum": "31ec750805302ecd71b278a492d23d2ac916269f7ec645bba4f23b6f7c4bf217"}}, "download_size": 124154892, "post_processing_size": null, "dataset_size": 57449287, "size_in_bytes": 181604179}} \ No newline at end of file diff --git a/datasets/code_x_glue_ct_code_to_text/dummy/go/0.0.0/dummy_data.zip b/datasets/code_x_glue_ct_code_to_text/dummy/go/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..61754a8a9c2 Binary files /dev/null and b/datasets/code_x_glue_ct_code_to_text/dummy/go/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_ct_code_to_text/dummy/java/0.0.0/dummy_data.zip b/datasets/code_x_glue_ct_code_to_text/dummy/java/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..08bce80910b Binary files /dev/null and b/datasets/code_x_glue_ct_code_to_text/dummy/java/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_ct_code_to_text/dummy/javascript/0.0.0/dummy_data.zip b/datasets/code_x_glue_ct_code_to_text/dummy/javascript/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..758ad50308d Binary files /dev/null and b/datasets/code_x_glue_ct_code_to_text/dummy/javascript/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_ct_code_to_text/dummy/php/0.0.0/dummy_data.zip b/datasets/code_x_glue_ct_code_to_text/dummy/php/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..64b23ac22cd Binary files /dev/null and b/datasets/code_x_glue_ct_code_to_text/dummy/php/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_ct_code_to_text/dummy/python/0.0.0/dummy_data.zip b/datasets/code_x_glue_ct_code_to_text/dummy/python/0.0.0/dummy_data.zip new file mode 100644 
index 00000000000..6fc736cc02e Binary files /dev/null and b/datasets/code_x_glue_ct_code_to_text/dummy/python/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_ct_code_to_text/dummy/ruby/0.0.0/dummy_data.zip b/datasets/code_x_glue_ct_code_to_text/dummy/ruby/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..b7897085f47 Binary files /dev/null and b/datasets/code_x_glue_ct_code_to_text/dummy/ruby/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_ct_code_to_text/generated_definitions.py b/datasets/code_x_glue_ct_code_to_text/generated_definitions.py new file mode 100644 index 00000000000..45fb58aaaf5 --- /dev/null +++ b/datasets/code_x_glue_ct_code_to_text/generated_definitions.py @@ -0,0 +1,68 @@ +DEFINITIONS = { + "go": { + "class_name": "CodeXGlueCtCodeToText", + "dataset_type": "Code-Text", + "description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text", + "dir_name": "code-to-text", + "name": "go", + "parameters": {"language": "go"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text", + "sizes": {"test": 8122, "train": 167288, "validation": 7325}, + }, + "java": { + "class_name": "CodeXGlueCtCodeToText", + "dataset_type": "Code-Text", + "description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text", + "dir_name": "code-to-text", + "name": "java", + "parameters": {"language": "java"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text", + "sizes": {"test": 10955, "train": 164923, "validation": 5183}, + }, + "javascript": { + "class_name": "CodeXGlueCtCodeToText", + 
"dataset_type": "Code-Text", + "description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text", + "dir_name": "code-to-text", + "name": "javascript", + "parameters": {"language": "javascript"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text", + "sizes": {"test": 3291, "train": 58025, "validation": 3885}, + }, + "php": { + "class_name": "CodeXGlueCtCodeToText", + "dataset_type": "Code-Text", + "description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text", + "dir_name": "code-to-text", + "name": "php", + "parameters": {"language": "php"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text", + "sizes": {"test": 14014, "train": 241241, "validation": 12982}, + }, + "python": { + "class_name": "CodeXGlueCtCodeToText", + "dataset_type": "Code-Text", + "description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text", + "dir_name": "code-to-text", + "name": "python", + "parameters": {"language": "python"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text", + "sizes": {"test": 14918, "train": 251820, "validation": 13914}, + }, + "ruby": { + "class_name": "CodeXGlueCtCodeToText", + "dataset_type": "Code-Text", + "description": "CodeXGLUE code-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Code-Text/code-to-text", + "dir_name": 
"code-to-text", + "name": "ruby", + "parameters": {"language": "ruby"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Code-Text/code-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Code-Text/code-to-text", + "sizes": {"test": 1261, "train": 24927, "validation": 1400}, + }, +} diff --git a/datasets/code_x_glue_tc_nl_code_search_adv/README.md b/datasets/code_x_glue_tc_nl_code_search_adv/README.md new file mode 100644 index 00000000000..d5aadd4f859 --- /dev/null +++ b/datasets/code_x_glue_tc_nl_code_search_adv/README.md @@ -0,0 +1,203 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- code +- en +licenses: +- other-C-UDA +multilinguality: +- other-programming-languages +size_categories: +- 100K256 +- Remove examples that documents contain special tokens (e.g. or https:...) +- Remove examples that documents are not English. + +### Supported Tasks and Leaderboards + +- `document-retrieval`: The dataset can be used to train a model for retrieving top-k codes from a given **English** natural language query. + +### Languages + +- Python **programming** language +- English **natural** language + +## Dataset Structure + +### Data Instances + +An example of 'validation' looks as follows. 
+``` +{ + "argument_list": "", + "code": "def Func(arg_0, arg_1='.', arg_2=True, arg_3=False, **arg_4):\n \"\"\"Downloads Dailymotion videos by URL.\n \"\"\"\n\n arg_5 = get_content(rebuilt_url(arg_0))\n arg_6 = json.loads(match1(arg_5, r'qualities\":({.+?}),\"'))\n arg_7 = match1(arg_5, r'\"video_title\"\\s*:\\s*\"([^\"]+)\"') or \\\n match1(arg_5, r'\"title\"\\s*:\\s*\"([^\"]+)\"')\n arg_7 = unicodize(arg_7)\n\n for arg_8 in ['1080','720','480','380','240','144','auto']:\n try:\n arg_9 = arg_6[arg_8][1][\"url\"]\n if arg_9:\n break\n except KeyError:\n pass\n\n arg_10, arg_11, arg_12 = url_info(arg_9)\n\n print_info(site_info, arg_7, arg_10, arg_12)\n if not arg_3:\n download_urls([arg_9], arg_7, arg_11, arg_12, arg_1=arg_1, arg_2=arg_2)", + "code_tokens": ["def", "Func", "(", "arg_0", ",", "arg_1", "=", "'.'", ",", "arg_2", "=", "True", ",", "arg_3", "=", "False", ",", "**", "arg_4", ")", ":", "arg_5", "=", "get_content", "(", "rebuilt_url", "(", "arg_0", ")", ")", "arg_6", "=", "json", ".", "loads", "(", "match1", "(", "arg_5", ",", "r'qualities\":({.+?}),\"'", ")", ")", "arg_7", "=", "match1", "(", "arg_5", ",", "r'\"video_title\"\\s*:\\s*\"([^\"]+)\"'", ")", "or", "match1", "(", "arg_5", ",", "r'\"title\"\\s*:\\s*\"([^\"]+)\"'", ")", "arg_7", "=", "unicodize", "(", "arg_7", ")", "for", "arg_8", "in", "[", "'1080'", ",", "'720'", ",", "'480'", ",", "'380'", ",", "'240'", ",", "'144'", ",", "'auto'", "]", ":", "try", ":", "arg_9", "=", "arg_6", "[", "arg_8", "]", "[", "1", "]", "[", "\"url\"", "]", "if", "arg_9", ":", "break", "except", "KeyError", ":", "pass", "arg_10", ",", "arg_11", ",", "arg_12", "=", "url_info", "(", "arg_9", ")", "print_info", "(", "site_info", ",", "arg_7", ",", "arg_10", ",", "arg_12", ")", "if", "not", "arg_3", ":", "download_urls", "(", "[", "arg_9", "]", ",", "arg_7", ",", "arg_11", ",", "arg_12", ",", "arg_1", "=", "arg_1", ",", "arg_2", "=", "arg_2", ")"], + "docstring": "Downloads Dailymotion videos by URL.", + 
"docstring_summary": "Downloads Dailymotion videos by URL.", + "docstring_tokens": ["Downloads", "Dailymotion", "videos", "by", "URL", "."], + "func_name": "", + "id": 0, + "identifier": "dailymotion_download", + "language": "python", + "nwo": "soimort/you-get", + "original_string": "", + "parameters": "(url, output_dir='.', merge=True, info_only=False, **kwargs)", + "path": "src/you_get/extractors/dailymotion.py", + "repo": "", + "return_statement": "", + "score": 0.9997601509094238, + "sha": "b746ac01c9f39de94cac2d56f665285b0523b974", + "url": "https://github.com/soimort/you-get/blob/b746ac01c9f39de94cac2d56f665285b0523b974/src/you_get/extractors/dailymotion.py#L13-L35" +} +``` + +### Data Fields + +In the following each data field in go is explained for each config. The data fields are the same among all splits. + +#### default + +| field name | type | description | +|-----------------|-----------------------|-----------------------------------------------------------------------------------| +|id |int32 | Index of the sample | +|repo |string | repo: the owner/repo | +|path |string | path: the full path to the original file | +|func_name |string | func_name: the function or method name | +|original_string |string | original_string: the raw string before tokenization or parsing | +|language |string | language: the programming language | +|code |string | code/function: the part of the original_string that is code | +|code_tokens |Sequence[string] | code_tokens/function_tokens: tokenized version of code | +|docstring |string | docstring: the top-level comment or docstring, if it exists in the original string| +|docstring_tokens |Sequence[string] | docstring_tokens: tokenized version of docstring | +|sha |string | sha of the file | +|url |string | url of the file | +|docstring_summary|string | Summary of the docstring | +|parameters |string | parameters of the function | +|return_statement |string | return statement | +|argument_list |string | list 
of arguments of the function | +|identifier |string | identifier | +|nwo |string | nwo | +|score |datasets.Value("float"]| score for this search | + +### Data Splits + +| name |train |validation|test | +|-------|-----:|---------:|----:| +|default|251820| 9604|19210| + +## Dataset Creation + +### Curation Rationale + +[More Information Needed] + +### Source Data + + +#### Initial Data Collection and Normalization + +Data from CodeSearchNet Challenge dataset. +[More Information Needed] + +#### Who are the source language producers? + +Software Engineering developers. + +### Annotations + +#### Annotation process + +[More Information Needed] + +#### Who are the annotators? + +[More Information Needed] + +### Personal and Sensitive Information + +[More Information Needed] + +## Considerations for Using the Data + +### Social Impact of Dataset + +[More Information Needed] + +### Discussion of Biases + +[More Information Needed] + +### Other Known Limitations + +[More Information Needed] + +## Additional Information + +### Dataset Curators + +https://github.com/microsoft, https://github.com/madlag + +### Licensing Information + +Computational Use of Data Agreement (C-UDA) License. + +### Citation Information + +``` +@article{husain2019codesearchnet, + title={Codesearchnet challenge: Evaluating the state of semantic code search}, + author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, + journal={arXiv preprint arXiv:1909.09436}, + year={2019} +} +``` + +### Contributions + +Thanks to @madlag (and partly also @ncoop57) for adding this dataset. 
\ No newline at end of file diff --git a/datasets/code_x_glue_tc_nl_code_search_adv/code_x_glue_tc_nl_code_search_adv.py b/datasets/code_x_glue_tc_nl_code_search_adv/code_x_glue_tc_nl_code_search_adv.py new file mode 100644 index 00000000000..b0ddaaf820f --- /dev/null +++ b/datasets/code_x_glue_tc_nl_code_search_adv/code_x_glue_tc_nl_code_search_adv.py @@ -0,0 +1,206 @@ +import json +import os +import os.path +from typing import List + +import datasets + +from .common import TrainValidTestChild +from .generated_definitions import DEFINITIONS + + +_DESCRIPTION = """The dataset we use comes from CodeSearchNet and we filter the dataset as the following: +- Remove examples that codes cannot be parsed into an abstract syntax tree. +- Remove examples that #tokens of documents is < 3 or >256 +- Remove examples that documents contain special tokens (e.g. or https:...) +- Remove examples that documents are not English. +""" +_CITATION = """@article{husain2019codesearchnet, +title={Codesearchnet challenge: Evaluating the state of semantic code search}, +author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc}, +journal={arXiv preprint arXiv:1909.09436}, +year={2019} +}""" + + +class CodeXGlueCtCodeToTextBaseImpl(TrainValidTestChild): + _DESCRIPTION = _DESCRIPTION + _CITATION = _CITATION + + # For each file, each line in the uncompressed file represents one function. 
+ _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "repo": datasets.Value("string"), # repo: the owner/repo + "path": datasets.Value("string"), # path: the full path to the original file + "func_name": datasets.Value("string"), # func_name: the function or method name + "original_string": datasets.Value("string"), # original_string: the raw string before tokenization or parsing + "language": datasets.Value("string"), # language: the programming language name + "code": datasets.Value("string"), # code/function: the part of the original_string that is code + "code_tokens": datasets.features.Sequence( + datasets.Value("string") + ), # code_tokens/function_tokens: tokenized version of code + "docstring": datasets.Value( + "string" + ), # docstring: the top-level comment or docstring, if it exists in the original string + "docstring_tokens": datasets.features.Sequence( + datasets.Value("string") + ), # docstring_tokens: tokenized version of docstring + "sha": datasets.Value("string"), # sha of the file + "url": datasets.Value("string"), # url of the file + } + + _SUPERVISED_KEYS = ["docstring", "docstring_tokens"] + + def generate_urls(self, split_name, language): + yield "language", f"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip" + yield "dataset", "dataset.zip" + + def get_data_files(self, split_name, file_paths, language): + language_specific_path = file_paths["language"] + final_path = os.path.join(language_specific_path, language, "final") + # Make some cleanup to save space + for path in os.listdir(final_path): + if path.endswith(".pkl"): + os.unlink(path) + + data_files = [] + for root, dirs, files in os.walk(final_path): + for file in files: + temp = os.path.join(root, file) + if ".jsonl" in temp: + if split_name in temp: + data_files.append(temp) + return data_files + + def post_process(self, split_name, language, js): + return js + + def _generate_examples(self, split_name, file_paths, language): + import gzip 
+ + data_set_path = file_paths["dataset"] + + data_files = self.get_data_files(split_name, file_paths, language) + + urls = {} + f1_path_parts = [data_set_path, "dataset", language, f"{split_name}.txt"] + if self.SINGLE_LANGUAGE: + del f1_path_parts[2] + + f1_path = os.path.join(*f1_path_parts) + with open(f1_path, encoding="utf-8") as f1: + for line in f1: + line = line.strip() + urls[line] = True + + idx = 0 + for file in data_files: + if ".gz" in file: + f = gzip.open(file) + else: + f = open(file, encoding="utf-8") + + for line in f: + line = line.strip() + js = json.loads(line) + if js["url"] in urls: + js["id"] = idx + js = self.post_process(split_name, language, js) + if "partition" in js: + del js["partition"] + yield idx, js + idx += 1 + f.close() + + +class CodeXGlueTcNLCodeSearchAdvImpl(CodeXGlueCtCodeToTextBaseImpl): + LANGUAGE = "python" + SINGLE_LANGUAGE = True + + _FEATURES = { + "id": datasets.Value("int32"), # Index of the sample + "repo": datasets.Value("string"), # repo: the owner/repo + "path": datasets.Value("string"), # path: the full path to the original file + "func_name": datasets.Value("string"), # func_name: the function or method name + "original_string": datasets.Value("string"), # original_string: the raw string before tokenization or parsing + "language": datasets.Value("string"), # language: the programming language + "code": datasets.Value("string"), # code/function: the part of the original_string that is code + "code_tokens": datasets.features.Sequence( + datasets.Value("string") + ), # code_tokens/function_tokens: tokenized version of code + "docstring": datasets.Value( + "string" + ), # docstring: the top-level comment or docstring, if it exists in the original string + "docstring_tokens": datasets.features.Sequence( + datasets.Value("string") + ), # docstring_tokens: tokenized version of docstring + "sha": datasets.Value("string"), # sha of the file + "url": datasets.Value("string"), # url of the file + "docstring_summary": 
datasets.Value("string"), # Summary of the docstring + "parameters": datasets.Value("string"), # parameters of the function + "return_statement": datasets.Value("string"), # return statement + "argument_list": datasets.Value("string"), # list of arguments of the function + "identifier": datasets.Value("string"), # identifier + "nwo": datasets.Value("string"), # nwo + "score": datasets.Value("float"), # score for this search + } + + def post_process(self, split_name, language, js): + for suffix in "_tokens", "": + key = "function" + suffix + if key in js: + js["code" + suffix] = js[key] + del js[key] + + for key in self._FEATURES: + if key not in js: + if key == "score": + js[key] = -1 + else: + js[key] = "" + + return js + + def generate_urls(self, split_name): + for e in super().generate_urls(split_name, self.LANGUAGE): + yield e + + def get_data_files(self, split_name, file_paths, language): + if split_name == "train": + return super().get_data_files(split_name, file_paths, language) + else: + data_set_path = file_paths["dataset"] + data_file = os.path.join(data_set_path, "dataset", "test_code.jsonl") + return [data_file] + + def _generate_examples(self, split_name, file_paths): + for e in super()._generate_examples(split_name, file_paths, self.LANGUAGE): + yield e + + +CLASS_MAPPING = { + "CodeXGlueTcNLCodeSearchAdv": CodeXGlueTcNLCodeSearchAdvImpl, +} + + +class CodeXGlueTcNlCodeSearchAdv(datasets.GeneratorBasedBuilder): + BUILDER_CONFIG_CLASS = datasets.BuilderConfig + BUILDER_CONFIGS = [ + datasets.BuilderConfig(name=name, description=info["description"]) for name, info in DEFINITIONS.items() + ] + + def _info(self): + name = self.config.name + info = DEFINITIONS[name] + if info["class_name"] in CLASS_MAPPING: + self.child = CLASS_MAPPING[info["class_name"]](info) + else: + raise RuntimeError(f"Unknown python class for dataset configuration {name}") + ret = self.child._info() + return ret + + def _split_generators(self, dl_manager: datasets.DownloadManager) 
-> List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_tc_nl_code_search_adv/common.py b/datasets/code_x_glue_tc_nl_code_search_adv/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_tc_nl_code_search_adv/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. 
+ return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_tc_nl_code_search_adv/dataset_infos.json b/datasets/code_x_glue_tc_nl_code_search_adv/dataset_infos.json new file mode 100644 index 00000000000..96bfda566b2 --- /dev/null +++ b/datasets/code_x_glue_tc_nl_code_search_adv/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "CodeXGLUE NL-code-search-Adv dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Code/NL-code-search-Adv\n\nThe dataset we use comes from CodeSearchNet and we filter the dataset as the following:\n- Remove examples that codes 
cannot be parsed into an abstract syntax tree.\n- Remove examples that #tokens of documents is < 3 or >256\n- Remove examples that documents contain special tokens (e.g. or https:...)\n- Remove examples that documents are not English.\n", "citation": "@article{husain2019codesearchnet,\ntitle={Codesearchnet challenge: Evaluating the state of semantic code search},\nauthor={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},\njournal={arXiv preprint arXiv:1909.09436},\nyear={2019}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Code/NL-code-search-Adv", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "repo": {"dtype": "string", "id": null, "_type": "Value"}, "path": {"dtype": "string", "id": null, "_type": "Value"}, "func_name": {"dtype": "string", "id": null, "_type": "Value"}, "original_string": {"dtype": "string", "id": null, "_type": "Value"}, "language": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}, "code_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "docstring": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_tokens": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "sha": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "docstring_summary": {"dtype": "string", "id": null, "_type": "Value"}, "parameters": {"dtype": "string", "id": null, "_type": "Value"}, "return_statement": {"dtype": "string", "id": null, "_type": "Value"}, "argument_list": {"dtype": "string", "id": null, "_type": "Value"}, "identifier": {"dtype": "string", "id": null, "_type": "Value"}, "nwo": {"dtype": "string", "id": null, "_type": "Value"}, "score": {"dtype": "float32", "id": null, "_type": 
"Value"}}, "post_processed": null, "supervised_keys": {"input": "docstring", "output": "docstring_tokens"}, "task_templates": null, "builder_name": "code_x_glue_tc_nl_code_search_adv", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 820716084, "num_examples": 251820, "dataset_name": "code_x_glue_tc_nl_code_search_adv"}, "validation": {"name": "validation", "num_bytes": 23468834, "num_examples": 9604, "dataset_name": "code_x_glue_tc_nl_code_search_adv"}, "test": {"name": "test", "num_bytes": 47433760, "num_examples": 19210, "dataset_name": "code_x_glue_tc_nl_code_search_adv"}}, "download_checksums": {"https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip": {"num_bytes": 940909997, "checksum": "7223c6460bebfa85697b586da91e47bc5d64790a4d60bba5917106458ab6b40e"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Code/NL-code-search-Adv/dataset.zip": {"num_bytes": 25115627, "checksum": "b4d5157699ca3bda7a33674f17d7b24294b4c8f36f650cea01d3d0dbcefdc656"}}, "download_size": 966025624, "post_processing_size": null, "dataset_size": 891618678, "size_in_bytes": 1857644302}} \ No newline at end of file diff --git a/datasets/code_x_glue_tc_nl_code_search_adv/dummy/default/0.0.0/dummy_data.zip b/datasets/code_x_glue_tc_nl_code_search_adv/dummy/default/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..75a24fc84fe Binary files /dev/null and b/datasets/code_x_glue_tc_nl_code_search_adv/dummy/default/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_tc_nl_code_search_adv/generated_definitions.py b/datasets/code_x_glue_tc_nl_code_search_adv/generated_definitions.py new file mode 100644 index 00000000000..5ab5821248e --- /dev/null +++ b/datasets/code_x_glue_tc_nl_code_search_adv/generated_definitions.py @@ -0,0 +1,12 @@ +DEFINITIONS = { + "default": { + "class_name": "CodeXGlueTcNLCodeSearchAdv", + "dataset_type": 
"Text-Code", + "description": "CodeXGLUE NL-code-search-Adv dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Code/NL-code-search-Adv", + "dir_name": "NL-code-search-Adv", + "name": "default", + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Code/NL-code-search-Adv", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Code/NL-code-search-Adv", + "sizes": {"test": 19210, "train": 251820, "validation": 9604}, + } +} diff --git a/datasets/code_x_glue_tc_text_to_code/README.md b/datasets/code_x_glue_tc_text_to_code/README.md new file mode 100644 index 00000000000..be080dc5e1a --- /dev/null +++ b/datasets/code_x_glue_tc_text_to_code/README.md @@ -0,0 +1,164 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- en +- code +licenses: +- other-C-UDA +multilinguality: +- other-programming-languages +size_categories: +- 100K List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_tc_text_to_code/common.py b/datasets/code_x_glue_tc_text_to_code/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_tc_text_to_code/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that 
will appear on the datasets page. + return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_tc_text_to_code/dataset_infos.json b/datasets/code_x_glue_tc_text_to_code/dataset_infos.json new file mode 100644 index 00000000000..7933b6e3178 --- /dev/null +++ b/datasets/code_x_glue_tc_text_to_code/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "CodeXGLUE text-to-code dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Code/text-to-code\n\nWe use concode dataset which is a widely used code generation dataset from Iyer's EMNLP 2018 paper Mapping Language to 
Code in Programmatic Context. See paper for details.", "citation": "@article{iyer2018mapping,\ntitle={Mapping language to code in programmatic context},\nauthor={Iyer, Srinivasan and Konstas, Ioannis and Cheung, Alvin and Zettlemoyer, Luke},\njournal={arXiv preprint arXiv:1808.09588},\nyear={2018}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Code/text-to-code", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "nl": {"dtype": "string", "id": null, "_type": "Value"}, "code": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "code", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_tc_text_to_code", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 96225611, "num_examples": 100000, "dataset_name": "code_x_glue_tc_text_to_code"}, "validation": {"name": "validation", "num_bytes": 1749751, "num_examples": 2000, "dataset_name": "code_x_glue_tc_text_to_code"}, "test": {"name": "test", "num_bytes": 1609306, "num_examples": 2000, "dataset_name": "code_x_glue_tc_text_to_code"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Code/text-to-code/dataset/concode/train.json": {"num_bytes": 97365680, "checksum": "a130f375c415932ffe0188e76b3c8aaef92b1b52d228e342328657b9ae97f17f"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Code/text-to-code/dataset/concode/dev.json": {"num_bytes": 1772646, "checksum": "cd4f91cfaa12a886a1d7acaf92eaf8ab066845c37a9221e56634d345958c922a"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Code/text-to-code/dataset/concode/test.json": {"num_bytes": 1631312, "checksum": "3323b1d723c2183a0ef4693413d25ef5a6b988b50b37b2f401de6c2c5c55159f"}}, "download_size": 100769638, "post_processing_size": null, "dataset_size": 99584668, 
"size_in_bytes": 200354306}} \ No newline at end of file diff --git a/datasets/code_x_glue_tc_text_to_code/dummy/default/0.0.0/dummy_data.zip b/datasets/code_x_glue_tc_text_to_code/dummy/default/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..6caa5ea8515 Binary files /dev/null and b/datasets/code_x_glue_tc_text_to_code/dummy/default/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_tc_text_to_code/generated_definitions.py b/datasets/code_x_glue_tc_text_to_code/generated_definitions.py new file mode 100644 index 00000000000..9defb975e19 --- /dev/null +++ b/datasets/code_x_glue_tc_text_to_code/generated_definitions.py @@ -0,0 +1,12 @@ +DEFINITIONS = { + "default": { + "class_name": "CodeXGlueTcTextToCode", + "dataset_type": "Text-Code", + "description": "CodeXGLUE text-to-code dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Code/text-to-code", + "dir_name": "text-to-code", + "name": "default", + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Code/text-to-code", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Code/text-to-code/dataset", + "sizes": {"test": 2000, "train": 100000, "validation": 2000}, + } +} diff --git a/datasets/code_x_glue_tt_text_to_text/README.md b/datasets/code_x_glue_tt_text_to_text/README.md new file mode 100644 index 00000000000..7e324a10927 --- /dev/null +++ b/datasets/code_x_glue_tt_text_to_text/README.md @@ -0,0 +1,202 @@ +--- +annotations_creators: +- found +language_creators: +- found +languages: +- da +- nb +- lv +- zh +- en +licenses: +- other-C-UDA +multilinguality: +- multilingual +size_categories: +- 10K List[datasets.SplitGenerator]: + return self.child._split_generators(dl_manager=dl_manager) + + def _generate_examples(self, split_name, file_paths): + return self.child._generate_examples(split_name, file_paths) diff --git a/datasets/code_x_glue_tt_text_to_text/common.py 
b/datasets/code_x_glue_tt_text_to_text/common.py new file mode 100644 index 00000000000..bbd28960601 --- /dev/null +++ b/datasets/code_x_glue_tt_text_to_text/common.py @@ -0,0 +1,75 @@ +from typing import List + +import datasets + + +# Citation, taken from https://github.com/microsoft/CodeXGLUE +_DEFAULT_CITATION = """@article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020},}""" + + +class Child: + _DESCRIPTION = None + _FEATURES = None + _CITATION = None + SPLITS = {"train": datasets.Split.TRAIN} + _SUPERVISED_KEYS = None + + def __init__(self, info): + self.info = info + + def homepage(self): + return self.info["project_url"] + + def _info(self): + # This is the description that will appear on the datasets page. + return datasets.DatasetInfo( + description=self.info["description"] + "\n\n" + self._DESCRIPTION, + features=datasets.Features(self._FEATURES), + homepage=self.homepage(), + citation=self._CITATION or _DEFAULT_CITATION, + supervised_keys=self._SUPERVISED_KEYS, + ) + + def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]: + SPLITS = self.SPLITS + _URL = self.info["raw_url"] + urls_to_download = {} + for split in SPLITS: + if split not in urls_to_download: + urls_to_download[split] = {} + + for key, url in self.generate_urls(split): + if not url.startswith("http"): + url = _URL + "/" + url + urls_to_download[split][key] = url + + downloaded_files = {} + for k, v in urls_to_download.items(): + downloaded_files[k] = dl_manager.download_and_extract(v) + + return [ + datasets.SplitGenerator( + name=SPLITS[k], + gen_kwargs={"split_name": k, "file_paths": downloaded_files[k]}, + ) + for k in SPLITS + ] + + def check_empty(self, entries): + all_empty = all([v == "" for v in entries.values()]) + all_non_empty = all([v != "" for v in entries.values()]) + + if not all_non_empty and not all_empty: + raise RuntimeError("Parallel data files should 
have the same number of lines.") + + return all_empty + + +class TrainValidTestChild(Child): + SPLITS = { + "train": datasets.Split.TRAIN, + "valid": datasets.Split.VALIDATION, + "test": datasets.Split.TEST, + } diff --git a/datasets/code_x_glue_tt_text_to_text/dataset_infos.json b/datasets/code_x_glue_tt_text_to_text/dataset_infos.json new file mode 100644 index 00000000000..eca45208d59 --- /dev/null +++ b/datasets/code_x_glue_tt_text_to_text/dataset_infos.json @@ -0,0 +1 @@ +{"da_en": {"description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "citation": "@article{DBLP:journals/corr/abs-2102-04664,\n author = {Shuai Lu and\n Daya Guo and\n Shuo Ren and\n Junjie Huang and\n Alexey Svyatkovskiy and\n Ambrosio Blanco and\n Colin B. Clement and\n Dawn Drain and\n Daxin Jiang and\n Duyu Tang and\n Ge Li and\n Lidong Zhou and\n Linjun Shou and\n Long Zhou and\n Michele Tufano and\n Ming Gong and\n Ming Zhou and\n Nan Duan and\n Neel Sundaresan and\n Shao Kun Deng and\n Shengyu Fu and\n Shujie Liu},\n title = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding\n and Generation},\n journal = {CoRR},\n volume = {abs/2102.04664},\n year = {2021}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "target", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_tt_text_to_text", "config_name": "da_en", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, 
"patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8163215, "num_examples": 42701, "dataset_name": "code_x_glue_tt_text_to_text"}, "validation": {"name": "validation", "num_bytes": 190340, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}, "test": {"name": "test", "num_bytes": 190780, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/da-en.train.da": {"num_bytes": 4371929, "checksum": "4b62e847011e2c13126f179f6f75c0ea4692ae24638872fe352af70864e98c6b"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/da-en.train.en": {"num_bytes": 3278834, "checksum": "b9f3253f0f19b036b75637b94c7fd96afd358ea5c104ff97dfd543b8722bef3d"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/da-en.dev.da": {"num_bytes": 101626, "checksum": "522e3896d36d360a2cfe819c680f593f19902fd9d35a7bb5c340e3dfcc637294"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/da-en.dev.en": {"num_bytes": 76706, "checksum": "fe1c304bebe346ee0076e51c9ac77edbad2a82f19c870e8c8d503f1114c9c925"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/da-en.test.da": {"num_bytes": 101374, "checksum": "2bb8a734242c0ee89559a2d9166f74df60259a0ec09b99a6091cff53001bfa94"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/da-en.test.en": {"num_bytes": 77398, "checksum": "850e36c666a8a4adce9bec78065503af8b619e76f8ce29e1be52b7da249b3eac"}}, "download_size": 8007867, "post_processing_size": null, "dataset_size": 8544335, "size_in_bytes": 16552202}, "lv_en": {"description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft 
Documentation, whose document located at https://github.com/MicrosoftDocs/.", "citation": "@article{DBLP:journals/corr/abs-2102-04664,\n author = {Shuai Lu and\n Daya Guo and\n Shuo Ren and\n Junjie Huang and\n Alexey Svyatkovskiy and\n Ambrosio Blanco and\n Colin B. Clement and\n Dawn Drain and\n Daxin Jiang and\n Duyu Tang and\n Ge Li and\n Lidong Zhou and\n Linjun Shou and\n Long Zhou and\n Michele Tufano and\n Ming Gong and\n Ming Zhou and\n Nan Duan and\n Neel Sundaresan and\n Shao Kun Deng and\n Shengyu Fu and\n Shujie Liu},\n title = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding\n and Generation},\n journal = {CoRR},\n volume = {abs/2102.04664},\n year = {2021}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "target", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_tt_text_to_text", "config_name": "lv_en", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3644127, "num_examples": 18749, "dataset_name": "code_x_glue_tt_text_to_text"}, "validation": {"name": "validation", "num_bytes": 192519, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}, "test": {"name": "test", "num_bytes": 190875, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/lv-en.train.lv": {"num_bytes": 1946457, "checksum": "67aded4f9685048ddddcb562247d703ed760893084b7c144982ad997e9ba02e2"}, 
"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/lv-en.train.en": {"num_bytes": 1472666, "checksum": "baadb95d607d4671fb7442443792708ca73b98c4f8b61b6b1a18c028f54fd737"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/lv-en.dev.lv": {"num_bytes": 102750, "checksum": "adb33098479c5b51c18ce03f94f754aec53614b0f4c32aba7b09aad7b7876181"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/lv-en.dev.en": {"num_bytes": 77761, "checksum": "2230457897639bdcea567141eedc2409bde3ba34851ed8436f75ec96ebcc51a5"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/lv-en.test.lv": {"num_bytes": 101858, "checksum": "1624d188ba42298bd7a6c6f941837865682732e5a4f0256c6dad5ccc295b44e9"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/lv-en.test.en": {"num_bytes": 77009, "checksum": "fd70d49126fc5d82ea3850bd17e6940c1a0a6da108fa2e44bc2c5b4c99b9d635"}}, "download_size": 3778501, "post_processing_size": null, "dataset_size": 4027521, "size_in_bytes": 7806022}, "no_en": {"description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "citation": "@article{DBLP:journals/corr/abs-2102-04664,\n author = {Shuai Lu and\n Daya Guo and\n Shuo Ren and\n Junjie Huang and\n Alexey Svyatkovskiy and\n Ambrosio Blanco and\n Colin B. 
Clement and\n Dawn Drain and\n Daxin Jiang and\n Duyu Tang and\n Ge Li and\n Lidong Zhou and\n Linjun Shou and\n Long Zhou and\n Michele Tufano and\n Ming Gong and\n Ming Zhou and\n Nan Duan and\n Neel Sundaresan and\n Shao Kun Deng and\n Shengyu Fu and\n Shujie Liu},\n title = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding\n and Generation},\n journal = {CoRR},\n volume = {abs/2102.04664},\n year = {2021}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "target", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_tt_text_to_text", "config_name": "no_en", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 8761795, "num_examples": 44322, "dataset_name": "code_x_glue_tt_text_to_text"}, "validation": {"name": "validation", "num_bytes": 203823, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}, "test": {"name": "test", "num_bytes": 197135, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/no-en.train.no": {"num_bytes": 4608004, "checksum": "85ab003e4149322091197d233f4f74bd6c2304675e506830fdbc7ce6b38066a8"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/no-en.train.en": {"num_bytes": 3621887, "checksum": "1ae6767caf7eab60b7d95908016c3fd480ef59a4d417e7e58f7f40c25ab92880"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/no-en.dev.no": {"num_bytes": 106174, "checksum": 
"f19c98c23c746764a85dcbfbef68116e8e1d9857e283ce70a502ebdd2bcc2974"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/no-en.dev.en": {"num_bytes": 85641, "checksum": "7990441ef0c0244f556eb7736d550fd599bc778b41889ae1543f98f9329543ce"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/no-en.test.no": {"num_bytes": 103342, "checksum": "30f6201d480969ede3440d00003f735d53f8434351f2304fe6f5b1f2e3e906ae"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/no-en.test.en": {"num_bytes": 81785, "checksum": "86c8a4d2a41deffc88c48a419213fa7dc89fe1a0315788a472a3b91822a191f6"}}, "download_size": 8606833, "post_processing_size": null, "dataset_size": 9162753, "size_in_bytes": 17769586}, "zh_en": {"description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text\n\nThe dataset we use is crawled and filtered from Microsoft Documentation, whose document located at https://github.com/MicrosoftDocs/.", "citation": "@article{DBLP:journals/corr/abs-2102-04664,\n author = {Shuai Lu and\n Daya Guo and\n Shuo Ren and\n Junjie Huang and\n Alexey Svyatkovskiy and\n Ambrosio Blanco and\n Colin B. 
Clement and\n Dawn Drain and\n Daxin Jiang and\n Duyu Tang and\n Ge Li and\n Lidong Zhou and\n Linjun Shou and\n Long Zhou and\n Michele Tufano and\n Ming Gong and\n Ming Zhou and\n Nan Duan and\n Neel Sundaresan and\n Shao Kun Deng and\n Shengyu Fu and\n Shujie Liu},\n title = {CodeXGLUE: {A} Machine Learning Benchmark Dataset for Code Understanding\n and Generation},\n journal = {CoRR},\n volume = {abs/2102.04664},\n year = {2021}\n}", "homepage": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", "license": "", "features": {"id": {"dtype": "int32", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "target", "output": ""}, "task_templates": null, "builder_name": "code_x_glue_tt_text_to_text", "config_name": "zh_en", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 9592196, "num_examples": 50154, "dataset_name": "code_x_glue_tt_text_to_text"}, "validation": {"name": "validation", "num_bytes": 192155, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}, "test": {"name": "test", "num_bytes": 195245, "num_examples": 1000, "dataset_name": "code_x_glue_tt_text_to_text"}}, "download_checksums": {"https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/zh-en.train.zh": {"num_bytes": 4946356, "checksum": "8d6db4cc2ff27aea99410f0b6f674c558cc74cc68a365915e8b89c0cce92e4be"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/train/zh-en.train.en": {"num_bytes": 4043944, "checksum": "ca08892b854a65823abcc63efa23e975894e0be74c54f3593d34395726de6126"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/zh-en.dev.zh": {"num_bytes": 98538, "checksum": 
"5baa187a524e8ff88859e29f6c67226b83a4b31cfadbd309585004a301e1dac5"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/dev/zh-en.dev.en": {"num_bytes": 81609, "checksum": "505b46cba0825aa2f0fda3b03b3746cd6aeb1bf17ccde80bc5a8c3a6b195f32c"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/zh-en.test.zh": {"num_bytes": 100849, "checksum": "247fcf6250c8b3ac6e30968130320e94fa67eb11b8aa8b6ad6a2a770f5f7c5a2"}, "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data/test/zh-en.test.en": {"num_bytes": 82388, "checksum": "2b6af4ebdbd1ed95fa42ee7670d158b03f30458c2c0b6a602a7b2dd25bd7e263"}}, "download_size": 9353684, "post_processing_size": null, "dataset_size": 9979596, "size_in_bytes": 19333280}} \ No newline at end of file diff --git a/datasets/code_x_glue_tt_text_to_text/dummy/da_en/0.0.0/dummy_data.zip b/datasets/code_x_glue_tt_text_to_text/dummy/da_en/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..2db162d3d3d Binary files /dev/null and b/datasets/code_x_glue_tt_text_to_text/dummy/da_en/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_tt_text_to_text/dummy/lv_en/0.0.0/dummy_data.zip b/datasets/code_x_glue_tt_text_to_text/dummy/lv_en/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..ac1d8fd075a Binary files /dev/null and b/datasets/code_x_glue_tt_text_to_text/dummy/lv_en/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_tt_text_to_text/dummy/no_en/0.0.0/dummy_data.zip b/datasets/code_x_glue_tt_text_to_text/dummy/no_en/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..ee3cccd0ac2 Binary files /dev/null and b/datasets/code_x_glue_tt_text_to_text/dummy/no_en/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_tt_text_to_text/dummy/zh_en/0.0.0/dummy_data.zip b/datasets/code_x_glue_tt_text_to_text/dummy/zh_en/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..afe9fbe4c60 Binary files /dev/null 
and b/datasets/code_x_glue_tt_text_to_text/dummy/zh_en/0.0.0/dummy_data.zip differ diff --git a/datasets/code_x_glue_tt_text_to_text/generated_definitions.py b/datasets/code_x_glue_tt_text_to_text/generated_definitions.py new file mode 100644 index 00000000000..00a90e00e60 --- /dev/null +++ b/datasets/code_x_glue_tt_text_to_text/generated_definitions.py @@ -0,0 +1,46 @@ +DEFINITIONS = { + "da_en": { + "class_name": "CodeXGlueTtTextToText", + "dataset_type": "Text-Text", + "description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text", + "dir_name": "text-to-text", + "name": "da_en", + "parameters": {"natural_language_pair": "da-en"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data", + "sizes": {"test": 1000, "train": 42701, "validation": 1000}, + }, + "lv_en": { + "class_name": "CodeXGlueTtTextToText", + "dataset_type": "Text-Text", + "description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text", + "dir_name": "text-to-text", + "name": "lv_en", + "parameters": {"natural_language_pair": "lv-en"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data", + "sizes": {"test": 1000, "train": 18749, "validation": 1000}, + }, + "no_en": { + "class_name": "CodeXGlueTtTextToText", + "dataset_type": "Text-Text", + "description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text", + "dir_name": "text-to-text", + "name": "no_en", + "parameters": {"natural_language_pair": "no-en"}, + "project_url": 
"https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data", + "sizes": {"test": 1000, "train": 44322, "validation": 1000}, + }, + "zh_en": { + "class_name": "CodeXGlueTtTextToText", + "dataset_type": "Text-Text", + "description": "CodeXGLUE text-to-text dataset, available at https://github.com/microsoft/CodeXGLUE/tree/main/Text-Text/text-to-text", + "dir_name": "text-to-text", + "name": "zh_en", + "parameters": {"natural_language_pair": "zh-en"}, + "project_url": "https://github.com/madlag/CodeXGLUE/tree/main/Text-Text/text-to-text", + "raw_url": "https://raw.githubusercontent.com/madlag/CodeXGLUE/main/Text-Text/text-to-text/data", + "sizes": {"test": 1000, "train": 50154, "validation": 1000}, + }, +}