From 475c1fba7e66a34827f8f0210f9517ea04d97d9e Mon Sep 17 00:00:00 2001 From: Ilias Chalkidis Date: Fri, 30 Sep 2022 12:46:01 +0300 Subject: [PATCH 1/4] Update lex_glue.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix for a critical bug in the EURLEX dataset label list to make LexGLUE EURLEX results replicable. In LexGLUE (Chalkidis et al., 2022), the following is mentioned w.r.t. EUR-LEX: "It supports four different label granularities, comprising 21, 127, 567, 7390 EuroVoc concepts, respectively. We use the 100 most frequent concepts from level 2 [...]”. The current label list has all 127 labels, which leads to different (lower) results, as communicated by users. --- datasets/lex_glue/lex_glue.py | 228 +++++++++++++++------------------- 1 file changed, 100 insertions(+), 128 deletions(-) diff --git a/datasets/lex_glue/lex_glue.py b/datasets/lex_glue/lex_glue.py index 27d63700ad1..294866936ec 100644 --- a/datasets/lex_glue/lex_glue.py +++ b/datasets/lex_glue/lex_glue.py @@ -46,134 +46,106 @@ ECTHR_ARTICLES = ["2", "3", "5", "6", "8", "9", "10", "11", "14", "P1-1"] EUROVOC_CONCEPTS = [ - "100163", - "100164", - "100165", - "100166", - "100167", - "100168", - "100169", - "100170", - "100171", - "100172", - "100173", - "100174", - "100175", - "100176", - "100177", - "100178", - "100179", - "100180", - "100181", - "100182", - "100183", - "100184", - "100185", - "100186", - "100187", - "100188", - "100189", - "100190", - "100191", - "100192", - "100193", - "100194", - "100195", - "100196", - "100197", - "100198", - "100199", - "100200", - "100201", - "100202", - "100203", - "100204", - "100205", - "100206", - "100207", - "100208", - "100209", - "100210", - "100211", - "100212", - "100213", - "100214", - "100215", - "100216", - "100217", - "100218", - "100219", - "100220", - "100221", - "100222", - "100223", - "100224", - "100225", - "100226", - "100227", - "100228", - "100229", - "100230", - "100231", - "100232", - "100233", - "100234", - "100235", - "100236", - "100237", - "100238", - "100239", - "100240", - "100241", - "100242", - "100243", - "100244", - "100245", - "100246", - "100247", - "100248", - "100249", - "100250", - "100251", - "100252", - "100253", - "100254", - "100255", - "100256", - "100257", - "100258", - "100259", - "100260", - "100261", - "100262", - "100263", - "100264", - "100265", - "100266", - "100267", - "100268", - "100269", - "100270", - "100271", - "100272", - "100273", - "100274", - "100275", - "100276", - "100277", - "100278", - "100279", - "100280", - "100281", - "100282", - "100283", - "100284", - "100285", - "100286", - "100287", - "100288", - "100289", -] + '100163', + '100168', + '100169', + '100170', + '100171', + '100172', + '100173', + '100174', + '100175', + '100176', + '100177', + '100179', + '100180', + '100183', + '100184', + '100185', + '100186', + '100187', + '100189', + '100190', + '100191', + '100192', + '100193', + '100194', + '100195', + '100196', + '100197', + '100198', + '100199', + '100200', + '100201', + '100202', + '100204', + '100205', + '100206', + '100207', + '100212', + '100214', + '100215', + '100220', + '100221', + '100222', + '100223', + '100224', + '100226', + '100227', + '100229', + '100230', + '100231', + '100232', + '100233', + '100234', + '100235', + '100237', + '100238', + '100239', + '100240', + '100241', + '100242', + '100243', + '100244', + '100245', + '100246', + '100247', + '100248', + '100249', + '100250', + '100252', + '100253', + '100254', + '100255', + '100256', + '100257', + '100258', + '100259', + '100260', + '100261', + '100262', + '100263', + '100264', + '100265', + '100266', + '100268', + '100269', + '100270', + '100271', + '100272', + '100273', + '100274', + '100275', + '100276', + '100277', + '100278', + '100279', + '100280', + '100281', + '100282', + '100283', + '100284', + '100285'] LEDGAR_CATEGORIES = [ "Adjustments", From 0f7750cfb88ecaf15878f6e79221cb5f4edac6c0 Mon Sep 17 00:00:00 2001 From: Ilias Chalkidis Date: Fri, 30 Sep 2022 12:56:11 +0300 Subject: [PATCH 2/4] Update lex_glue.py Update code formatting --- datasets/lex_glue/lex_glue.py | 201 +++++++++++++++++----------------- 1 file changed, 101 insertions(+), 100 deletions(-) diff --git a/datasets/lex_glue/lex_glue.py b/datasets/lex_glue/lex_glue.py index 294866936ec..711a313f02a 100644 --- a/datasets/lex_glue/lex_glue.py +++ b/datasets/lex_glue/lex_glue.py @@ -46,106 +46,107 @@ ECTHR_ARTICLES = ["2", "3", "5", "6", "8", "9", "10", "11", "14", "P1-1"] EUROVOC_CONCEPTS = [ - '100163', - '100168', - '100169', - '100170', - '100171', - '100172', - '100173', - '100174', - '100175', - '100176', - '100177', - '100179', - '100180', - '100183', - '100184', - '100185', - '100186', - '100187', - '100189', - '100190', - '100191', - '100192', - '100193', - '100194', - '100195', - '100196', - '100197', - '100198', - '100199', - '100200', - '100201', - '100202', - '100204', - '100205', - '100206', - '100207', - '100212', - '100214', - '100215', - '100220', - '100221', - '100222', - '100223', - '100224', - '100226', - '100227', - '100229', - '100230', - '100231', - '100232', - '100233', - '100234', - '100235', - '100237', - '100238', - '100239', - '100240', - '100241', - '100242', - '100243', - '100244', - '100245', - '100246', - '100247', - '100248', - '100249', - '100250', - '100252', - '100253', - '100254', - '100255', - '100256', - '100257', - '100258', - '100259', - '100260', - '100261', - '100262', - '100263', - '100264', - '100265', - '100266', - '100268', - '100269', - '100270', - '100271', - '100272', - '100273', - '100274', - '100275', - '100276', - '100277', - '100278', - '100279', - '100280', - '100281', - '100282', - '100283', - '100284', - '100285'] + "100163", + "100168", + "100169", + "100170", + "100171", + "100172", + "100173", + "100174", + "100175", + "100176", + "100177", + "100179", + "100180", + "100183", + "100184", + "100185", + "100186", + "100187", + "100189", + "100190", + "100191", + "100192", + "100193", + "100194", + "100195", + "100196", + "100197", + "100198", + "100199", + "100200", + "100201", + "100202", + "100204", + "100205", + "100206", + "100207", + "100212", + "100214", + "100215", + "100220", + "100221", + "100222", + "100223", + "100224", + "100226", + "100227", + "100229", + "100230", + "100231", + "100232", + "100233", + "100234", + "100235", + "100237", + "100238", + "100239", + "100240", + "100241", + "100242", + "100243", + "100244", + "100245", + "100246", + "100247", + "100248", + "100249", + "100250", + "100252", + "100253", + "100254", + "100255", + "100256", + "100257", + "100258", + "100259", + "100260", + "100261", + "100262", + "100263", + "100264", + "100265", + "100266", + "100268", + "100269", + "100270", + "100271", + "100272", + "100273", + "100274", + "100275", + "100276", + "100277", + "100278", + "100279", + "100280", + "100281", + "100282", + "100283", + "100284", + "100285", +] LEDGAR_CATEGORIES = [ "Adjustments", From cf2fdd41120d2ab60416ba6642ffeb9b5f9ceb00 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Sep 2022 17:35:22 +0200 Subject: [PATCH 3/4] Update dataset card --- datasets/lex_glue/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/lex_glue/README.md b/datasets/lex_glue/README.md index 5f521b69261..213bef9d137 100644 --- a/datasets/lex_glue/README.md +++ b/datasets/lex_glue/README.md @@ -199,7 +199,7 @@ An example of 'train' looks as follows. ```json { "text": "COMMISSION REGULATION (EC) No 1629/96 of 13 August 1996 on an invitation to tender for the refund on export of wholly milled round grain rice to certain third countries ...", - "labels": [2, 42, 72, 76, 86] + "labels": [4, 20, 21, 35, 68] } ``` From 74a0c640736d4262e9eb1a34a5ad2054902a2024 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 30 Sep 2022 17:37:22 +0200 Subject: [PATCH 4/4] Update metadata JSON --- datasets/lex_glue/dataset_infos.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/lex_glue/dataset_infos.json b/datasets/lex_glue/dataset_infos.json index 074448e1dcc..d578109ba13 100644 --- a/datasets/lex_glue/dataset_infos.json +++ b/datasets/lex_glue/dataset_infos.json @@ -1 +1 @@ -{"ecthr_a": {"description": "The European Court of Human Rights (ECtHR) hears allegations that a state has\nbreached human rights provisions of the European Convention of Human Rights (ECHR).\nFor each case, the dataset provides a list of factual paragraphs (facts) from the case description.\nEach case is mapped to articles of the ECHR that were violated (if any).", "citation": "@inproceedings{chalkidis-etal-2021-paragraph,\n title = \"Paragraph-level Rationale Extraction through Regularization: A case study on {E}uropean Court of Human Rights Cases\",\n author = \"Chalkidis, Ilias and\n Fergadiotis, Manos and\n Tsarapatsanis, Dimitrios and\n Aletras, Nikolaos and\n Androutsopoulos, Ion and\n Malakasiotis, Prodromos\",\n booktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\n month = jun,\n year = \"2021\",\n address = \"Online\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://aclanthology.org/2021.naacl-main.22\",\n doi = \"10.18653/v1/2021.naacl-main.22\",\n pages = \"226--241\",\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://archive.org/details/ECtHR-NAACL2021", "license": "", "features": {"text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"num_classes": 10, "names": ["2", "3", "5", "6", "8", "9", "10", "11", "14", "P1-1"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "ecthr_a", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 89637461, "num_examples": 9000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 11884180, "num_examples": 1000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 10985180, "num_examples": 1000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/ecthr.tar.gz": {"num_bytes": 32852475, "checksum": "461c1f6016af3a7ac0bd115c1f9ff65031258bfec39e570fec74a16d8946398e"}}, "download_size": 32852475, "post_processing_size": null, "dataset_size": 112506821, "size_in_bytes": 145359296}, "ecthr_b": {"description": "The European Court of Human Rights (ECtHR) hears allegations that a state has\nbreached human rights provisions of the European Convention of Human Rights (ECHR).\nFor each case, the dataset provides a list of factual paragraphs (facts) from the case description.\nEach case is mapped to articles of ECHR that were allegedly violated (considered by the court).", "citation": "@inproceedings{chalkidis-etal-2021-paragraph,\n title = \"Paragraph-level Rationale Extraction through Regularization: A case study on {E}uropean Court of Human Rights Cases\",\n author = \"Chalkidis, Ilias\n and Fergadiotis, Manos\n and Tsarapatsanis, Dimitrios\n and Aletras, Nikolaos\n and Androutsopoulos, Ion\n and Malakasiotis, Prodromos\",\n booktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\n year = \"2021\",\n address = \"Online\",\n url = \"https://aclanthology.org/2021.naacl-main.22\",\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://archive.org/details/ECtHR-NAACL2021", "license": "", "features": {"text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"num_classes": 10, "names": ["2", "3", "5", "6", "8", "9", "10", "11", "14", "P1-1"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "ecthr_b", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 89657661, "num_examples": 9000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 11886940, "num_examples": 1000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 10987828, "num_examples": 1000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/ecthr.tar.gz": {"num_bytes": 32852475, "checksum": "461c1f6016af3a7ac0bd115c1f9ff65031258bfec39e570fec74a16d8946398e"}}, "download_size": 32852475, "post_processing_size": null, "dataset_size": 112532429, "size_in_bytes": 145384904}, "eurlex": {"description": "European Union (EU) legislation is published in EUR-Lex portal.\nAll EU laws are annotated by EU's Publications Office with multiple concepts from the EuroVoc thesaurus,\na multilingual thesaurus maintained by the Publications Office.\nThe current version of EuroVoc contains more than 7k concepts referring to various activities\nof the EU and its Member States (e.g., economics, health-care, trade).\nGiven a document, the task is to predict its EuroVoc labels (concepts).", "citation": "@inproceedings{chalkidis-etal-2021-multieurlex,\n author = {Chalkidis, Ilias and\n Fergadiotis, Manos and\n Androutsopoulos, Ion},\n title = {MultiEURLEX -- A multi-lingual and multi-label legal document\n classification dataset for zero-shot cross-lingual transfer},\n booktitle = {Proceedings of the 2021 Conference on Empirical Methods\n in Natural Language Processing},\n year = {2021},\n location = {Punta Cana, Dominican Republic},\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://zenodo.org/record/5363165#.YVJOAi8RqaA", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"num_classes": 127, "names": ["100163", "100164", "100165", "100166", "100167", "100168", "100169", "100170", "100171", "100172", "100173", "100174", "100175", "100176", "100177", "100178", "100179", "100180", "100181", "100182", "100183", "100184", "100185", "100186", "100187", "100188", "100189", "100190", "100191", "100192", "100193", "100194", "100195", "100196", "100197", "100198", "100199", "100200", "100201", "100202", "100203", "100204", "100205", "100206", "100207", "100208", "100209", "100210", "100211", "100212", "100213", "100214", "100215", "100216", "100217", "100218", "100219", "100220", "100221", "100222", "100223", "100224", "100225", "100226", "100227", "100228", "100229", "100230", "100231", "100232", "100233", "100234", "100235", "100236", "100237", "100238", "100239", "100240", "100241", "100242", "100243", "100244", "100245", "100246", "100247", "100248", "100249", "100250", "100251", "100252", "100253", "100254", "100255", "100256", "100257", "100258", "100259", "100260", "100261", "100262", "100263", "100264", "100265", "100266", "100267", "100268", "100269", "100270", "100271", "100272", "100273", "100274", "100275", "100276", "100277", "100278", "100279", "100280", "100281", "100282", "100283", "100284", "100285", "100286", "100287", "100288", "100289"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "eurlex", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 390789505, "num_examples": 55000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 59742502, "num_examples": 5000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 41546764, "num_examples": 5000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/eurlex.tar.gz": {"num_bytes": 125413277, "checksum": "82376ff55c3812632d8a21ad0d7e515e2e7ec6431ca7673a454cdd41a3a7bf46"}}, "download_size": 125413277, "post_processing_size": null, "dataset_size": 492078771, "size_in_bytes": 617492048}, "scotus": {"description": "The US Supreme Court (SCOTUS) is the highest federal court in the United States of America\nand generally hears only the most controversial or otherwise complex cases which have not\nbeen sufficiently well solved by lower courts. This is a single-label multi-class classification\ntask, where given a document (court opinion), the task is to predict the relevant issue areas.\nThe 14 issue areas cluster 278 issues whose focus is on the subject matter of the controversy (dispute).", "citation": "@misc{spaeth2020,\n author = {Harold J. Spaeth and Lee Epstein and Andrew D. Martin, Jeffrey A. Segal\n and Theodore J. Ruger and Sara C. Benesh},\n year = {2020},\n title ={{Supreme Court Database, Version 2020 Release 01}},\n url= {http://Supremecourtdatabase.org},\n howpublished={Washington University Law}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "http://scdb.wustl.edu/data.php", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 13, "names": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "scotus", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 178959320, "num_examples": 5000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 76213283, "num_examples": 1400, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 75600247, "num_examples": 1400, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/scotus.tar.gz": {"num_bytes": 104763335, "checksum": "d53cc99aaf60b24ca7e4cf634f08a2572b5b3532f83aecdfc2c4257050dc9d0a"}}, "download_size": 104763335, "post_processing_size": null, "dataset_size": 330772850, "size_in_bytes": 435536185}, "ledgar": {"description": "LEDGAR dataset aims contract provision (paragraph) classification.\nThe contract provisions come from contracts obtained from the US Securities and Exchange Commission (SEC)\nfilings, which are publicly available from EDGAR. Each label represents the single main topic\n(theme) of the corresponding contract provision.", "citation": "@inproceedings{tuggener-etal-2020-ledgar,\n title = \"{LEDGAR}: A Large-Scale Multi-label Corpus for Text Classification of Legal Provisions in Contracts\",\n author = {Tuggener, Don and\n von D{\"a}niken, Pius and\n Peetz, Thomas and\n Cieliebak, Mark},\n booktitle = \"Proceedings of the 12th Language Resources and Evaluation Conference\",\n year = \"2020\",\n address = \"Marseille, France\",\n url = \"https://aclanthology.org/2020.lrec-1.155\",\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://metatext.io/datasets/ledgar", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 100, "names": ["Adjustments", "Agreements", "Amendments", "Anti-Corruption Laws", "Applicable Laws", "Approvals", "Arbitration", "Assignments", "Assigns", "Authority", "Authorizations", "Base Salary", "Benefits", "Binding Effects", "Books", "Brokers", "Capitalization", "Change In Control", "Closings", "Compliance With Laws", "Confidentiality", "Consent To Jurisdiction", "Consents", "Construction", "Cooperation", "Costs", "Counterparts", "Death", "Defined Terms", "Definitions", "Disability", "Disclosures", "Duties", "Effective Dates", "Effectiveness", "Employment", "Enforceability", "Enforcements", "Entire Agreements", "Erisa", "Existence", "Expenses", "Fees", "Financial Statements", "Forfeitures", "Further Assurances", "General", "Governing Laws", "Headings", "Indemnifications", "Indemnity", "Insurances", "Integration", "Intellectual Property", "Interests", "Interpretations", "Jurisdictions", "Liens", "Litigations", "Miscellaneous", "Modifications", "No Conflicts", "No Defaults", "No Waivers", "Non-Disparagement", "Notices", "Organizations", "Participations", "Payments", "Positions", "Powers", "Publicity", "Qualifications", "Records", "Releases", "Remedies", "Representations", "Sales", "Sanctions", "Severability", "Solvency", "Specific Performance", "Submission To Jurisdiction", "Subsidiaries", "Successors", "Survival", "Tax Withholdings", "Taxes", "Terminations", "Terms", "Titles", "Transactions With Affiliates", "Use Of Proceeds", "Vacations", "Venues", "Vesting", "Waiver Of Jury Trials", "Waivers", "Warranties", "Withholdings"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "ledgar", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43358315, "num_examples": 60000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 6845585, "num_examples": 10000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 7143592, "num_examples": 10000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/ledgar.tar.gz": {"num_bytes": 16255623, "checksum": "f7507bcce46ce03e3e91b8aaa1b84ddf6e8f1d628c0d7fa351f97ce45366d5d8"}}, "download_size": 16255623, "post_processing_size": null, "dataset_size": 57347492, "size_in_bytes": 73603115}, "unfair_tos": {"description": "The UNFAIR-ToS dataset contains 50 Terms of Service (ToS) from on-line platforms (e.g., YouTube,\nEbay, Facebook, etc.). The dataset has been annotated on the sentence-level with 8 types of\nunfair contractual terms (sentences), meaning terms that potentially violate user rights\naccording to the European consumer law.", "citation": "@article{lippi-etal-2019-claudette,\n title = \"{CLAUDETTE}: an automated detector of potentially unfair clauses in online terms of service\",\n author = {Lippi, Marco\n and Pa\u0142ka, Przemys\u0142aw\n and Contissa, Giuseppe\n and Lagioia, Francesca\n and Micklitz, Hans-Wolfgang\n and Sartor, Giovanni\n and Torroni, Paolo},\n journal = \"Artificial Intelligence and Law\",\n year = \"2019\",\n publisher = \"Springer\",\n url = \"https://doi.org/10.1007/s10506-019-09243-2\",\n pages = \"117--139\",\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "http://claudette.eui.eu", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"num_classes": 8, "names": ["Limitation of liability", "Unilateral termination", "Unilateral change", "Content removal", "Contract by using", "Choice of law", "Jurisdiction", "Arbitration"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "unfair_tos", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1041790, "num_examples": 5532, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 303107, "num_examples": 1607, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 452119, "num_examples": 2275, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/unfair_tos.tar.gz": {"num_bytes": 511342, "checksum": "934470d74b62139dfbfad4a13b75a32e4a4d26a680ab12eedfb7659cdf669d53"}}, "download_size": 511342, "post_processing_size": null, "dataset_size": 1797016, "size_in_bytes": 2308358}, "case_hold": {"description": "The CaseHOLD (Case Holdings on Legal Decisions) dataset contains approx. 53k multiple choice\nquestions about holdings of US court cases from the Harvard Law Library case law corpus.\nHoldings are short summaries of legal rulings accompany referenced decisions relevant for the present case.\nThe input consists of an excerpt (or prompt) from a court decision, containing a reference\nto a particular case, while the holding statement is masked out. The model must identify\nthe correct (masked) holding statement from a selection of five choices.", "citation": "@inproceedings{Zheng2021,\n author = {Lucia Zheng and\n Neel Guha and\n Brandon R. Anderson and\n Peter Henderson and\n Daniel E. Ho},\n title = {When Does Pretraining Help? Assessing Self-Supervised Learning for\n Law and the CaseHOLD Dataset},\n year = {2021},\n booktitle = {International Conference on Artificial Intelligence and Law},\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://github.com/reglab/casehold", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "endings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"num_classes": 5, "names": ["0", "1", "2", "3", "4"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "case_hold", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 74781766, "num_examples": 45000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 5989964, "num_examples": 3600, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 6474615, "num_examples": 3900, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/casehold.tar.gz": {"num_bytes": 30422703, "checksum": "728827dae0019880fe6be609e23f8c47fa2b49a2f0814a36687ace8db1c32d5e"}}, "download_size": 30422703, "post_processing_size": null, "dataset_size": 87246345, "size_in_bytes": 117669048}} \ No newline at end of file +{"ecthr_a": {"description": "The European Court of Human Rights (ECtHR) hears allegations that a state has\nbreached human rights provisions of the European Convention of Human Rights (ECHR).\nFor each case, the dataset provides a list of factual paragraphs (facts) from the case description.\nEach case is mapped to articles of the ECHR that were violated (if any).", "citation": "@inproceedings{chalkidis-etal-2021-paragraph,\n title = \"Paragraph-level Rationale Extraction through Regularization: A case study on {E}uropean Court of Human Rights Cases\",\n author = \"Chalkidis, Ilias and\n Fergadiotis, Manos and\n Tsarapatsanis, Dimitrios and\n Aletras, Nikolaos and\n Androutsopoulos, Ion and\n Malakasiotis, Prodromos\",\n booktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\n month = jun,\n year = \"2021\",\n address = \"Online\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://aclanthology.org/2021.naacl-main.22\",\n doi = \"10.18653/v1/2021.naacl-main.22\",\n pages = \"226--241\",\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://archive.org/details/ECtHR-NAACL2021", "license": "", "features": {"text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"num_classes": 10, "names": ["2", "3", "5", "6", "8", "9", "10", "11", "14", "P1-1"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "ecthr_a", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 89637461, "num_examples": 9000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 11884180, "num_examples": 1000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 10985180, "num_examples": 1000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/ecthr.tar.gz": {"num_bytes": 32852475, "checksum": "461c1f6016af3a7ac0bd115c1f9ff65031258bfec39e570fec74a16d8946398e"}}, "download_size": 32852475, "post_processing_size": null, "dataset_size": 112506821, "size_in_bytes": 145359296}, "ecthr_b": {"description": "The European Court of Human Rights (ECtHR) hears allegations that a state has\nbreached human rights provisions of the European Convention of Human Rights (ECHR).\nFor each case, the dataset provides a list of factual paragraphs (facts) from the case description.\nEach case is mapped to articles of ECHR that were allegedly violated (considered by the court).", "citation": "@inproceedings{chalkidis-etal-2021-paragraph,\n title = \"Paragraph-level Rationale Extraction through Regularization: A case study on {E}uropean Court of Human Rights Cases\",\n author = \"Chalkidis, Ilias\n and Fergadiotis, Manos\n and Tsarapatsanis, Dimitrios\n and Aletras, Nikolaos\n and Androutsopoulos, Ion\n and Malakasiotis, Prodromos\",\n booktitle = \"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies\",\n year = \"2021\",\n address = \"Online\",\n url = \"https://aclanthology.org/2021.naacl-main.22\",\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://archive.org/details/ECtHR-NAACL2021", "license": "", "features": {"text": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "labels": {"feature": {"num_classes": 10, "names": ["2", "3", "5", "6", "8", "9", "10", "11", "14", "P1-1"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "ecthr_b", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 89657661, "num_examples": 9000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 11886940, "num_examples": 1000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 10987828, "num_examples": 1000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/ecthr.tar.gz": {"num_bytes": 32852475, "checksum": "461c1f6016af3a7ac0bd115c1f9ff65031258bfec39e570fec74a16d8946398e"}}, "download_size": 32852475, "post_processing_size": null, "dataset_size": 112532429, "size_in_bytes": 145384904}, "eurlex": {"description": "European Union (EU) legislation is published in EUR-Lex portal.\nAll EU laws are annotated by EU's Publications Office with multiple concepts from the EuroVoc thesaurus,\na multilingual thesaurus maintained by the Publications Office.\nThe current version of EuroVoc contains more than 7k concepts referring to various activities\nof the EU and its Member States (e.g., economics, health-care, trade).\nGiven a document, the task is to predict its EuroVoc labels (concepts).", "citation": "@inproceedings{chalkidis-etal-2021-multieurlex,\n author = {Chalkidis, Ilias and\n Fergadiotis, Manos and\n Androutsopoulos, Ion},\n title = {MultiEURLEX -- A multi-lingual and multi-label legal document\n classification dataset for zero-shot cross-lingual transfer},\n booktitle = {Proceedings of the 2021 Conference on Empirical Methods\n in Natural Language Processing},\n year = {2021},\n location = {Punta Cana, Dominican Republic},\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://zenodo.org/record/5363165#.YVJOAi8RqaA", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"num_classes": 100, "names": ["100163", "100168", "100169", "100170", "100171", "100172", "100173", "100174", "100175", "100176", "100177", "100179", "100180", "100183", "100184", "100185", "100186", "100187", "100189", "100190", "100191", "100192", "100193", "100194", "100195", "100196", "100197", "100198", "100199", "100200", "100201", "100202", "100204", "100205", "100206", "100207", "100212", "100214", "100215", "100220", "100221", "100222", "100223", "100224", "100226", "100227", "100229", "100230", "100231", "100232", "100233", "100234", "100235", "100237", "100238", "100239", "100240", "100241", "100242", "100243", "100244", "100245", "100246", "100247", "100248", "100249", "100250", "100252", "100253", "100254", "100255", "100256", "100257", "100258", "100259", "100260", "100261", "100262", "100263", "100264", "100265", "100266", "100268", "100269", "100270", "100271", "100272", "100273", "100274", "100275", "100276", "100277", "100278", "100279", "100280", "100281", "100282", "100283", "100284", "100285"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "eurlex", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 390770289, "num_examples": 55000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 59739102, "num_examples": 5000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 41544484, "num_examples": 5000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/eurlex.tar.gz": {"num_bytes": 125413277, "checksum": "82376ff55c3812632d8a21ad0d7e515e2e7ec6431ca7673a454cdd41a3a7bf46"}}, "download_size": 125413277, "post_processing_size": null, "dataset_size": 492053875, "size_in_bytes": 617467152}, "scotus": {"description": "The US Supreme Court (SCOTUS) is the highest federal court in the United States of America\nand generally hears only the most controversial or otherwise complex cases which have not\nbeen sufficiently well solved by lower courts. This is a single-label multi-class classification\ntask, where given a document (court opinion), the task is to predict the relevant issue areas.\nThe 14 issue areas cluster 278 issues whose focus is on the subject matter of the controversy (dispute).", "citation": "@misc{spaeth2020,\n author = {Harold J. Spaeth and Lee Epstein and Andrew D. Martin, Jeffrey A. Segal\n and Theodore J. Ruger and Sara C. Benesh},\n year = {2020},\n title ={{Supreme Court Database, Version 2020 Release 01}},\n url= {http://Supremecourtdatabase.org},\n howpublished={Washington University Law}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "http://scdb.wustl.edu/data.php", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 13, "names": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "scotus", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 178959320, "num_examples": 5000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 76213283, "num_examples": 1400, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 75600247, "num_examples": 1400, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/scotus.tar.gz": {"num_bytes": 104763335, "checksum": "d53cc99aaf60b24ca7e4cf634f08a2572b5b3532f83aecdfc2c4257050dc9d0a"}}, "download_size": 104763335, "post_processing_size": null, "dataset_size": 330772850, "size_in_bytes": 435536185}, "ledgar": {"description": "LEDGAR dataset aims contract provision (paragraph) classification.\nThe contract provisions come from contracts obtained from the US Securities and Exchange Commission (SEC)\nfilings, which are publicly available from EDGAR. Each label represents the single main topic\n(theme) of the corresponding contract provision.", "citation": "@inproceedings{tuggener-etal-2020-ledgar,\n title = \"{LEDGAR}: A Large-Scale Multi-label Corpus for Text Classification of Legal Provisions in Contracts\",\n author = {Tuggener, Don and\n von D{\"a}niken, Pius and\n Peetz, Thomas and\n Cieliebak, Mark},\n booktitle = \"Proceedings of the 12th Language Resources and Evaluation Conference\",\n year = \"2020\",\n address = \"Marseille, France\",\n url = \"https://aclanthology.org/2020.lrec-1.155\",\n}\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://metatext.io/datasets/ledgar", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 100, "names": ["Adjustments", "Agreements", "Amendments", "Anti-Corruption Laws", "Applicable Laws", "Approvals", "Arbitration", "Assignments", "Assigns", "Authority", "Authorizations", "Base Salary", "Benefits", "Binding Effects", "Books", "Brokers", "Capitalization", "Change In Control", "Closings", "Compliance With Laws", "Confidentiality", "Consent To Jurisdiction", "Consents", "Construction", "Cooperation", "Costs", "Counterparts", "Death", "Defined Terms", "Definitions", "Disability", "Disclosures", "Duties", "Effective Dates", "Effectiveness", "Employment", "Enforceability", "Enforcements", "Entire Agreements", "Erisa", "Existence", "Expenses", "Fees", "Financial Statements", "Forfeitures", "Further Assurances", "General", "Governing Laws", "Headings", "Indemnifications", "Indemnity", "Insurances", "Integration", "Intellectual Property", "Interests", "Interpretations", "Jurisdictions", "Liens", "Litigations", "Miscellaneous", "Modifications", "No Conflicts", "No Defaults", "No Waivers", "Non-Disparagement", "Notices", "Organizations", "Participations", "Payments", "Positions", "Powers", "Publicity", "Qualifications", "Records", "Releases", "Remedies", "Representations", "Sales", "Sanctions", "Severability", "Solvency", "Specific Performance", "Submission To Jurisdiction", "Subsidiaries", "Successors", "Survival", "Tax Withholdings", "Taxes", "Terminations", "Terms", "Titles", "Transactions With Affiliates", "Use Of Proceeds", "Vacations", "Venues", "Vesting", "Waiver Of Jury Trials", "Waivers", "Warranties", "Withholdings"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "ledgar", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43358315, "num_examples": 60000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 6845585, "num_examples": 10000, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 7143592, "num_examples": 10000, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/ledgar.tar.gz": {"num_bytes": 16255623, "checksum": "f7507bcce46ce03e3e91b8aaa1b84ddf6e8f1d628c0d7fa351f97ce45366d5d8"}}, "download_size": 16255623, "post_processing_size": null, "dataset_size": 57347492, "size_in_bytes": 73603115}, "unfair_tos": {"description": "The UNFAIR-ToS dataset contains 50 Terms of Service (ToS) from on-line platforms (e.g., YouTube,\nEbay, Facebook, etc.). The dataset has been annotated on the sentence-level with 8 types of\nunfair contractual terms (sentences), meaning terms that potentially violate user rights\naccording to the European consumer law.", "citation": "@article{lippi-etal-2019-claudette,\n title = \"{CLAUDETTE}: an automated detector of potentially unfair clauses in online terms of service\",\n author = {Lippi, Marco\n and Pa\u0142ka, Przemys\u0142aw\n and Contissa, Giuseppe\n and Lagioia, Francesca\n and Micklitz, Hans-Wolfgang\n and Sartor, Giovanni\n and Torroni, Paolo},\n journal = \"Artificial Intelligence and Law\",\n year = \"2019\",\n publisher = \"Springer\",\n url = \"https://doi.org/10.1007/s10506-019-09243-2\",\n pages = \"117--139\",\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "http://claudette.eui.eu", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"num_classes": 8, "names": ["Limitation of liability", "Unilateral termination", "Unilateral change", "Content removal", "Contract by using", "Choice of law", "Jurisdiction", "Arbitration"], "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "unfair_tos", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1041790, "num_examples": 5532, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 303107, "num_examples": 1607, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 452119, "num_examples": 2275, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/unfair_tos.tar.gz": {"num_bytes": 511342, "checksum": "934470d74b62139dfbfad4a13b75a32e4a4d26a680ab12eedfb7659cdf669d53"}}, "download_size": 511342, "post_processing_size": null, "dataset_size": 1797016, "size_in_bytes": 2308358}, "case_hold": {"description": "The CaseHOLD (Case Holdings on Legal Decisions) dataset contains approx. 53k multiple choice\nquestions about holdings of US court cases from the Harvard Law Library case law corpus.\nHoldings are short summaries of legal rulings accompany referenced decisions relevant for the present case.\nThe input consists of an excerpt (or prompt) from a court decision, containing a reference\nto a particular case, while the holding statement is masked out. The model must identify\nthe correct (masked) holding statement from a selection of five choices.", "citation": "@inproceedings{Zheng2021,\n author = {Lucia Zheng and\n Neel Guha and\n Brandon R. Anderson and\n Peter Henderson and\n Daniel E. Ho},\n title = {When Does Pretraining Help? Assessing Self-Supervised Learning for\n Law and the CaseHOLD Dataset},\n year = {2021},\n booktitle = {International Conference on Artificial Intelligence and Law},\n}\n@article{chalkidis-etal-2021-lexglue,\n title={{LexGLUE}: A Benchmark Dataset for Legal Language Understanding in English},\n author={Chalkidis, Ilias and\n Jana, Abhik and\n Hartung, Dirk and\n Bommarito, Michael and\n Androutsopoulos, Ion and\n Katz, Daniel Martin and\n Aletras, Nikolaos},\n year={2021},\n eprint={2110.00976},\n archivePrefix={arXiv},\n primaryClass={cs.CL},\n note = {arXiv: 2110.00976},\n}", "homepage": "https://github.com/reglab/casehold", "license": "", "features": {"context": {"dtype": "string", "id": null, "_type": "Value"}, "endings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "label": {"num_classes": 5, "names": ["0", "1", "2", "3", "4"], "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "lex_glue", "config_name": "case_hold", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 74781766, "num_examples": 45000, "dataset_name": "lex_glue"}, "test": {"name": "test", "num_bytes": 5989964, "num_examples": 3600, "dataset_name": "lex_glue"}, "validation": {"name": "validation", "num_bytes": 6474615, "num_examples": 3900, "dataset_name": "lex_glue"}}, "download_checksums": {"https://zenodo.org/record/5532997/files/casehold.tar.gz": {"num_bytes": 30422703, "checksum": "728827dae0019880fe6be609e23f8c47fa2b49a2f0814a36687ace8db1c32d5e"}}, "download_size": 30422703, "post_processing_size": null, "dataset_size": 87246345, "size_in_bytes": 117669048}} \ No newline at end of file