| Task Name | Train | Val | Test | Val/Test Docs | Metrics |
|---|---|---|---|---|---|
| anagrams1 | ✓ | 10000 | acc | ||
| anagrams2 | ✓ | 10000 | acc | ||
| anli_r1 | ✓ | ✓ | ✓ | 1000 | acc |
| anli_r2 | ✓ | ✓ | ✓ | 1000 | acc |
| anli_r3 | ✓ | ✓ | ✓ | 1200 | acc |
| arc_challenge | ✓ | ✓ | ✓ | 1172 | acc, acc_norm |
| arc_easy | ✓ | ✓ | ✓ | 2376 | acc, acc_norm |
| arithmetic_1dc | ✓ | 2000 | acc | ||
| arithmetic_2da | ✓ | 2000 | acc | ||
| arithmetic_2dm | ✓ | 2000 | acc | ||
| arithmetic_2ds | ✓ | 2000 | acc | ||
| arithmetic_3da | ✓ | 2000 | acc | ||
| arithmetic_3ds | ✓ | 2000 | acc | ||
| arithmetic_4da | ✓ | 2000 | acc | ||
| arithmetic_4ds | ✓ | 2000 | acc | ||
| arithmetic_5da | ✓ | 2000 | acc | ||
| arithmetic_5ds | ✓ | 2000 | acc | ||
| bigbench_causal_judgement | ✓ | 190 | multiple_choice_grade, exact_str_match | ||
| bigbench_date_understanding | ✓ | 369 | multiple_choice_grade, exact_str_match | ||
| bigbench_disambiguation_qa | ✓ | 258 | multiple_choice_grade, exact_str_match | ||
| bigbench_dyck_languages | ✓ | 1000 | multiple_choice_grade, exact_str_match | ||
| bigbench_formal_fallacies_syllogisms_negation | ✓ | 14200 | multiple_choice_grade, exact_str_match | ||
| bigbench_geometric_shapes | ✓ | 359 | multiple_choice_grade, exact_str_match | ||
| bigbench_hyperbaton | ✓ | 50000 | multiple_choice_grade, exact_str_match | ||
| bigbench_logical_deduction_five_objects | ✓ | 500 | multiple_choice_grade, exact_str_match | ||
| bigbench_logical_deduction_seven_objects | ✓ | 700 | multiple_choice_grade, exact_str_match | ||
| bigbench_logical_deduction_three_objects | ✓ | 300 | multiple_choice_grade, exact_str_match | ||
| bigbench_movie_recommendation | ✓ | 500 | multiple_choice_grade, exact_str_match | ||
| bigbench_navigate | ✓ | 1000 | multiple_choice_grade, exact_str_match | ||
| bigbench_reasoning_about_colored_objects | ✓ | 2000 | multiple_choice_grade, exact_str_match | ||
| bigbench_ruin_names | ✓ | 448 | multiple_choice_grade, exact_str_match | ||
| bigbench_salient_translation_error_detection | ✓ | 998 | multiple_choice_grade, exact_str_match | ||
| bigbench_snarks | ✓ | 181 | multiple_choice_grade, exact_str_match | ||
| bigbench_sports_understanding | ✓ | 986 | multiple_choice_grade, exact_str_match | ||
| bigbench_temporal_sequences | ✓ | 1000 | multiple_choice_grade, exact_str_match | ||
| bigbench_tracking_shuffled_objects_five_objects | ✓ | 1250 | multiple_choice_grade, exact_str_match | ||
| bigbench_tracking_shuffled_objects_seven_objects | ✓ | 1750 | multiple_choice_grade, exact_str_match | ||
| bigbench_tracking_shuffled_objects_three_objects | ✓ | 300 | multiple_choice_grade, exact_str_match | ||
| blimp_adjunct_island | ✓ | 1000 | acc | ||
| blimp_anaphor_gender_agreement | ✓ | 1000 | acc | ||
| blimp_anaphor_number_agreement | ✓ | 1000 | acc | ||
| blimp_animate_subject_passive | ✓ | 1000 | acc | ||
| blimp_animate_subject_trans | ✓ | 1000 | acc | ||
| blimp_causative | ✓ | 1000 | acc | ||
| blimp_complex_NP_island | ✓ | 1000 | acc | ||
| blimp_coordinate_structure_constraint_complex_left_branch | ✓ | 1000 | acc | ||
| blimp_coordinate_structure_constraint_object_extraction | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_1 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_2 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_irregular_1 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_irregular_2 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_with_adj_2 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_with_adj_irregular_1 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_with_adj_irregular_2 | ✓ | 1000 | acc | ||
| blimp_determiner_noun_agreement_with_adjective_1 | ✓ | 1000 | acc | ||
| blimp_distractor_agreement_relational_noun | ✓ | 1000 | acc | ||
| blimp_distractor_agreement_relative_clause | ✓ | 1000 | acc | ||
| blimp_drop_argument | ✓ | 1000 | acc | ||
| blimp_ellipsis_n_bar_1 | ✓ | 1000 | acc | ||
| blimp_ellipsis_n_bar_2 | ✓ | 1000 | acc | ||
| blimp_existential_there_object_raising | ✓ | 1000 | acc | ||
| blimp_existential_there_quantifiers_1 | ✓ | 1000 | acc | ||
| blimp_existential_there_quantifiers_2 | ✓ | 1000 | acc | ||
| blimp_existential_there_subject_raising | ✓ | 1000 | acc | ||
| blimp_expletive_it_object_raising | ✓ | 1000 | acc | ||
| blimp_inchoative | ✓ | 1000 | acc | ||
| blimp_intransitive | ✓ | 1000 | acc | ||
| blimp_irregular_past_participle_adjectives | ✓ | 1000 | acc | ||
| blimp_irregular_past_participle_verbs | ✓ | 1000 | acc | ||
| blimp_irregular_plural_subject_verb_agreement_1 | ✓ | 1000 | acc | ||
| blimp_irregular_plural_subject_verb_agreement_2 | ✓ | 1000 | acc | ||
| blimp_left_branch_island_echo_question | ✓ | 1000 | acc | ||
| blimp_left_branch_island_simple_question | ✓ | 1000 | acc | ||
| blimp_matrix_question_npi_licensor_present | ✓ | 1000 | acc | ||
| blimp_npi_present_1 | ✓ | 1000 | acc | ||
| blimp_npi_present_2 | ✓ | 1000 | acc | ||
| blimp_only_npi_licensor_present | ✓ | 1000 | acc | ||
| blimp_only_npi_scope | ✓ | 1000 | acc | ||
| blimp_passive_1 | ✓ | 1000 | acc | ||
| blimp_passive_2 | ✓ | 1000 | acc | ||
| blimp_principle_A_c_command | ✓ | 1000 | acc | ||
| blimp_principle_A_case_1 | ✓ | 1000 | acc | ||
| blimp_principle_A_case_2 | ✓ | 1000 | acc | ||
| blimp_principle_A_domain_1 | ✓ | 1000 | acc | ||
| blimp_principle_A_domain_2 | ✓ | 1000 | acc | ||
| blimp_principle_A_domain_3 | ✓ | 1000 | acc | ||
| blimp_principle_A_reconstruction | ✓ | 1000 | acc | ||
| blimp_regular_plural_subject_verb_agreement_1 | ✓ | 1000 | acc | ||
| blimp_regular_plural_subject_verb_agreement_2 | ✓ | 1000 | acc | ||
| blimp_sentential_negation_npi_licensor_present | ✓ | 1000 | acc | ||
| blimp_sentential_negation_npi_scope | ✓ | 1000 | acc | ||
| blimp_sentential_subject_island | ✓ | 1000 | acc | ||
| blimp_superlative_quantifiers_1 | ✓ | 1000 | acc | ||
| blimp_superlative_quantifiers_2 | ✓ | 1000 | acc | ||
| blimp_tough_vs_raising_1 | ✓ | 1000 | acc | ||
| blimp_tough_vs_raising_2 | ✓ | 1000 | acc | ||
| blimp_transitive | ✓ | 1000 | acc | ||
| blimp_wh_island | ✓ | 1000 | acc | ||
| blimp_wh_questions_object_gap | ✓ | 1000 | acc | ||
| blimp_wh_questions_subject_gap | ✓ | 1000 | acc | ||
| blimp_wh_questions_subject_gap_long_distance | ✓ | 1000 | acc | ||
| blimp_wh_vs_that_no_gap | ✓ | 1000 | acc | ||
| blimp_wh_vs_that_no_gap_long_distance | ✓ | 1000 | acc | ||
| blimp_wh_vs_that_with_gap | ✓ | 1000 | acc | ||
| blimp_wh_vs_that_with_gap_long_distance | ✓ | 1000 | acc | ||
| boolq | ✓ | ✓ | 3270 | acc | |
| cb | ✓ | ✓ | 56 | acc, f1 | |
| cola | ✓ | ✓ | 1043 | mcc | |
| copa | ✓ | ✓ | 100 | acc | |
| coqa | ✓ | ✓ | 500 | f1, em | |
| crows_pairs_english | ✓ | 1677 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_age | ✓ | 91 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_autre | ✓ | 11 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_disability | ✓ | 65 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_gender | ✓ | 320 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_nationality | ✓ | 216 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_physical_appearance | ✓ | 72 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_race_color | ✓ | 508 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_religion | ✓ | 111 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_sexual_orientation | ✓ | 93 | likelihood_difference, pct_stereotype | ||
| crows_pairs_english_socioeconomic | ✓ | 190 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french | ✓ | 1677 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_age | ✓ | 90 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_autre | ✓ | 13 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_disability | ✓ | 66 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_gender | ✓ | 321 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_nationality | ✓ | 253 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_physical_appearance | ✓ | 72 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_race_color | ✓ | 460 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_religion | ✓ | 115 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_sexual_orientation | ✓ | 91 | likelihood_difference, pct_stereotype | ||
| crows_pairs_french_socioeconomic | ✓ | 196 | likelihood_difference, pct_stereotype | ||
| cycle_letters | ✓ | 10000 | acc | ||
| drop | ✓ | ✓ | 9536 | em, f1 | |
| ethics_cm | ✓ | ✓ | 3885 | acc | |
| ethics_deontology | ✓ | ✓ | 3596 | acc, em | |
| ethics_justice | ✓ | ✓ | 2704 | acc, em | |
| ethics_utilitarianism | ✓ | ✓ | 4808 | acc | |
| ethics_utilitarianism_original | ✓ | 4808 | acc | ||
| ethics_virtue | ✓ | ✓ | 4975 | acc, em | |
| gsm8k | ✓ | ✓ | 1319 | acc | |
| headqa | ✓ | ✓ | ✓ | 2742 | acc, acc_norm |
| headqa_en | ✓ | ✓ | ✓ | 2742 | acc, acc_norm |
| headqa_es | ✓ | ✓ | ✓ | 2742 | acc, acc_norm |
| hellaswag | ✓ | ✓ | 10042 | acc, acc_norm | |
| hendrycksTest-abstract_algebra | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-anatomy | ✓ | ✓ | 135 | acc, acc_norm | |
| hendrycksTest-astronomy | ✓ | ✓ | 152 | acc, acc_norm | |
| hendrycksTest-business_ethics | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-clinical_knowledge | ✓ | ✓ | 265 | acc, acc_norm | |
| hendrycksTest-college_biology | ✓ | ✓ | 144 | acc, acc_norm | |
| hendrycksTest-college_chemistry | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-college_computer_science | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-college_mathematics | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-college_medicine | ✓ | ✓ | 173 | acc, acc_norm | |
| hendrycksTest-college_physics | ✓ | ✓ | 102 | acc, acc_norm | |
| hendrycksTest-computer_security | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-conceptual_physics | ✓ | ✓ | 235 | acc, acc_norm | |
| hendrycksTest-econometrics | ✓ | ✓ | 114 | acc, acc_norm | |
| hendrycksTest-electrical_engineering | ✓ | ✓ | 145 | acc, acc_norm | |
| hendrycksTest-elementary_mathematics | ✓ | ✓ | 378 | acc, acc_norm | |
| hendrycksTest-formal_logic | ✓ | ✓ | 126 | acc, acc_norm | |
| hendrycksTest-global_facts | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-high_school_biology | ✓ | ✓ | 310 | acc, acc_norm | |
| hendrycksTest-high_school_chemistry | ✓ | ✓ | 203 | acc, acc_norm | |
| hendrycksTest-high_school_computer_science | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-high_school_european_history | ✓ | ✓ | 165 | acc, acc_norm | |
| hendrycksTest-high_school_geography | ✓ | ✓ | 198 | acc, acc_norm | |
| hendrycksTest-high_school_government_and_politics | ✓ | ✓ | 193 | acc, acc_norm | |
| hendrycksTest-high_school_macroeconomics | ✓ | ✓ | 390 | acc, acc_norm | |
| hendrycksTest-high_school_mathematics | ✓ | ✓ | 270 | acc, acc_norm | |
| hendrycksTest-high_school_microeconomics | ✓ | ✓ | 238 | acc, acc_norm | |
| hendrycksTest-high_school_physics | ✓ | ✓ | 151 | acc, acc_norm | |
| hendrycksTest-high_school_psychology | ✓ | ✓ | 545 | acc, acc_norm | |
| hendrycksTest-high_school_statistics | ✓ | ✓ | 216 | acc, acc_norm | |
| hendrycksTest-high_school_us_history | ✓ | ✓ | 204 | acc, acc_norm | |
| hendrycksTest-high_school_world_history | ✓ | ✓ | 237 | acc, acc_norm | |
| hendrycksTest-human_aging | ✓ | ✓ | 223 | acc, acc_norm | |
| hendrycksTest-human_sexuality | ✓ | ✓ | 131 | acc, acc_norm | |
| hendrycksTest-international_law | ✓ | ✓ | 121 | acc, acc_norm | |
| hendrycksTest-jurisprudence | ✓ | ✓ | 108 | acc, acc_norm | |
| hendrycksTest-logical_fallacies | ✓ | ✓ | 163 | acc, acc_norm | |
| hendrycksTest-machine_learning | ✓ | ✓ | 112 | acc, acc_norm | |
| hendrycksTest-management | ✓ | ✓ | 103 | acc, acc_norm | |
| hendrycksTest-marketing | ✓ | ✓ | 234 | acc, acc_norm | |
| hendrycksTest-medical_genetics | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-miscellaneous | ✓ | ✓ | 783 | acc, acc_norm | |
| hendrycksTest-moral_disputes | ✓ | ✓ | 346 | acc, acc_norm | |
| hendrycksTest-moral_scenarios | ✓ | ✓ | 895 | acc, acc_norm | |
| hendrycksTest-nutrition | ✓ | ✓ | 306 | acc, acc_norm | |
| hendrycksTest-philosophy | ✓ | ✓ | 311 | acc, acc_norm | |
| hendrycksTest-prehistory | ✓ | ✓ | 324 | acc, acc_norm | |
| hendrycksTest-professional_accounting | ✓ | ✓ | 282 | acc, acc_norm | |
| hendrycksTest-professional_law | ✓ | ✓ | 1534 | acc, acc_norm | |
| hendrycksTest-professional_medicine | ✓ | ✓ | 272 | acc, acc_norm | |
| hendrycksTest-professional_psychology | ✓ | ✓ | 612 | acc, acc_norm | |
| hendrycksTest-public_relations | ✓ | ✓ | 110 | acc, acc_norm | |
| hendrycksTest-security_studies | ✓ | ✓ | 245 | acc, acc_norm | |
| hendrycksTest-sociology | ✓ | ✓ | 201 | acc, acc_norm | |
| hendrycksTest-us_foreign_policy | ✓ | ✓ | 100 | acc, acc_norm | |
| hendrycksTest-virology | ✓ | ✓ | 166 | acc, acc_norm | |
| hendrycksTest-world_religions | ✓ | ✓ | 171 | acc, acc_norm | |
| iwslt17-ar-en | ✓ | 1460 | bleu, chrf, ter | ||
| iwslt17-en-ar | ✓ | 1460 | bleu, chrf, ter | ||
| lambada_openai | ✓ | 5153 | ppl, acc | ||
| lambada_openai_cloze | ✓ | 5153 | ppl, acc | ||
| lambada_openai_mt_de | ✓ | 5153 | ppl, acc | ||
| lambada_openai_mt_en | ✓ | 5153 | ppl, acc | ||
| lambada_openai_mt_es | ✓ | 5153 | ppl, acc | ||
| lambada_openai_mt_fr | ✓ | 5153 | ppl, acc | ||
| lambada_openai_mt_it | ✓ | 5153 | ppl, acc | ||
| lambada_standard | ✓ | ✓ | 5153 | ppl, acc | |
| lambada_standard_cloze | ✓ | ✓ | 5153 | ppl, acc | |
| logiqa | ✓ | ✓ | ✓ | 651 | acc, acc_norm |
| math_algebra | ✓ | ✓ | 1187 | acc | |
| math_asdiv | ✓ | 2305 | acc | ||
| math_counting_and_prob | ✓ | ✓ | 474 | acc | |
| math_geometry | ✓ | ✓ | 479 | acc | |
| math_intermediate_algebra | ✓ | ✓ | 903 | acc | |
| math_num_theory | ✓ | ✓ | 540 | acc | |
| math_prealgebra | ✓ | ✓ | 871 | acc | |
| math_precalc | ✓ | ✓ | 546 | acc | |
| mathqa | ✓ | ✓ | ✓ | 2985 | acc, acc_norm |
| mc_taco | ✓ | ✓ | 9442 | f1, em | |
| mgsm_bn | ✓ | ✓ | 250 | acc | |
| mgsm_de | ✓ | ✓ | 250 | acc | |
| mgsm_en | ✓ | ✓ | 250 | acc | |
| mgsm_es | ✓ | ✓ | 250 | acc | |
| mgsm_fr | ✓ | ✓ | 250 | acc | |
| mgsm_ja | ✓ | ✓ | 250 | acc | |
| mgsm_ru | ✓ | ✓ | 250 | acc | |
| mgsm_sw | ✓ | ✓ | 250 | acc | |
| mgsm_te | ✓ | ✓ | 250 | acc | |
| mgsm_th | ✓ | ✓ | 250 | acc | |
| mgsm_zh | ✓ | ✓ | 250 | acc | |
| mnli | ✓ | ✓ | 9815 | acc | |
| mnli_mismatched | ✓ | ✓ | 9832 | acc | |
| mrpc | ✓ | ✓ | 408 | acc, f1 | |
| multirc | ✓ | ✓ | 4848 | acc | |
| mutual | ✓ | ✓ | 886 | r@1, r@2, mrr | |
| mutual_plus | ✓ | ✓ | 886 | r@1, r@2, mrr | |
| openbookqa | ✓ | ✓ | ✓ | 500 | acc, acc_norm |
| pawsx_de | ✓ | ✓ | ✓ | 2000 | acc |
| pawsx_en | ✓ | ✓ | ✓ | 2000 | acc |
| pawsx_es | ✓ | ✓ | ✓ | 2000 | acc |
| pawsx_fr | ✓ | ✓ | ✓ | 2000 | acc |
| pawsx_ja | ✓ | ✓ | ✓ | 2000 | acc |
| pawsx_ko | ✓ | ✓ | ✓ | 2000 | acc |
| pawsx_zh | ✓ | ✓ | ✓ | 2000 | acc |
| pile_arxiv | ✓ | ✓ | 2407 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_bookcorpus2 | ✓ | ✓ | 28 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_books3 | ✓ | ✓ | 269 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_dm-mathematics | ✓ | ✓ | 1922 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_enron | ✓ | ✓ | 1010 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_europarl | ✓ | ✓ | 157 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_freelaw | ✓ | ✓ | 5101 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_github | ✓ | ✓ | 18195 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_gutenberg | ✓ | ✓ | 80 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_hackernews | ✓ | ✓ | 1632 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_nih-exporter | ✓ | ✓ | 1884 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_opensubtitles | ✓ | ✓ | 642 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_openwebtext2 | ✓ | ✓ | 32925 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_philpapers | ✓ | ✓ | 68 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_pile-cc | ✓ | ✓ | 52790 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_pubmed-abstracts | ✓ | ✓ | 29895 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_pubmed-central | ✓ | ✓ | 5911 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_stackexchange | ✓ | ✓ | 30378 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_ubuntu-irc | ✓ | ✓ | 22 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_uspto | ✓ | ✓ | 11415 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_wikipedia | ✓ | ✓ | 17511 | word_perplexity, byte_perplexity, bits_per_byte | |
| pile_youtubesubtitles | ✓ | ✓ | 342 | word_perplexity, byte_perplexity, bits_per_byte | |
| piqa | ✓ | ✓ | 1838 | acc, acc_norm | |
| prost | ✓ | 18736 | acc, acc_norm | ||
| pubmedqa | ✓ | 1000 | acc | ||
| qa4mre_2011 | ✓ | 120 | acc, acc_norm | ||
| qa4mre_2012 | ✓ | 160 | acc, acc_norm | ||
| qa4mre_2013 | ✓ | 284 | acc, acc_norm | ||
| qasper | ✓ | ✓ | 1764 | f1_yesno, f1_abstractive | |
| qnli | ✓ | ✓ | 5463 | acc | |
| qqp | ✓ | ✓ | 40430 | acc, f1 | |
| race | ✓ | ✓ | ✓ | 1045 | acc |
| random_insertion | ✓ | 10000 | acc | ||
| record | ✓ | ✓ | 10000 | f1, em | |
| reversed_words | ✓ | 10000 | acc | ||
| rte | ✓ | ✓ | 277 | acc | |
| sciq | ✓ | ✓ | ✓ | 1000 | acc, acc_norm |
| squad2 | ✓ | ✓ | 11873 | exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 | |
| sst | ✓ | ✓ | 872 | acc | |
| swag | ✓ | ✓ | 20006 | acc, acc_norm | |
| toxigen | ✓ | ✓ | 940 | acc, acc_norm | |
| triviaqa | ✓ | ✓ | 11313 | acc | |
| truthfulqa_gen | ✓ | 817 | bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff | ||
| truthfulqa_mc | ✓ | 817 | mc1, mc2 | ||
| webqs | ✓ | ✓ | 2032 | acc | |
| wic | ✓ | ✓ | 638 | acc | |
| wikitext | ✓ | ✓ | ✓ | 62 | word_perplexity, byte_perplexity, bits_per_byte |
| winogrande | ✓ | ✓ | 1267 | acc | |
| wmt14-en-fr | ✓ | 3003 | bleu, chrf, ter | ||
| wmt14-fr-en | ✓ | 3003 | bleu, chrf, ter | ||
| wmt16-de-en | ✓ | 2999 | bleu, chrf, ter | ||
| wmt16-en-de | ✓ | 2999 | bleu, chrf, ter | ||
| wmt16-en-ro | ✓ | 1999 | bleu, chrf, ter | ||
| wmt16-ro-en | ✓ | 1999 | bleu, chrf, ter | ||
| wmt20-cs-en | ✓ | 664 | bleu, chrf, ter | ||
| wmt20-de-en | ✓ | 785 | bleu, chrf, ter | ||
| wmt20-de-fr | ✓ | 1619 | bleu, chrf, ter | ||
| wmt20-en-cs | ✓ | 1418 | bleu, chrf, ter | ||
| wmt20-en-de | ✓ | 1418 | bleu, chrf, ter | ||
| wmt20-en-iu | ✓ | 2971 | bleu, chrf, ter | ||
| wmt20-en-ja | ✓ | 1000 | bleu, chrf, ter | ||
| wmt20-en-km | ✓ | 2320 | bleu, chrf, ter | ||
| wmt20-en-pl | ✓ | 1000 | bleu, chrf, ter | ||
| wmt20-en-ps | ✓ | 2719 | bleu, chrf, ter | ||
| wmt20-en-ru | ✓ | 2002 | bleu, chrf, ter | ||
| wmt20-en-ta | ✓ | 1000 | bleu, chrf, ter | ||
| wmt20-en-zh | ✓ | 1418 | bleu, chrf, ter | ||
| wmt20-fr-de | ✓ | 1619 | bleu, chrf, ter | ||
| wmt20-iu-en | ✓ | 2971 | bleu, chrf, ter | ||
| wmt20-ja-en | ✓ | 993 | bleu, chrf, ter | ||
| wmt20-km-en | ✓ | 2320 | bleu, chrf, ter | ||
| wmt20-pl-en | ✓ | 1001 | bleu, chrf, ter | ||
| wmt20-ps-en | ✓ | 2719 | bleu, chrf, ter | ||
| wmt20-ru-en | ✓ | 991 | bleu, chrf, ter | ||
| wmt20-ta-en | ✓ | 997 | bleu, chrf, ter | ||
| wmt20-zh-en | ✓ | 2000 | bleu, chrf, ter | ||
| wnli | ✓ | ✓ | 71 | acc | |
| wsc | ✓ | ✓ | 104 | acc | |
| wsc273 | ✓ | 273 | acc | ||
| xcopa_et | ✓ | ✓ | 500 | acc | |
| xcopa_ht | ✓ | ✓ | 500 | acc | |
| xcopa_id | ✓ | ✓ | 500 | acc | |
| xcopa_it | ✓ | ✓ | 500 | acc | |
| xcopa_qu | ✓ | ✓ | 500 | acc | |
| xcopa_sw | ✓ | ✓ | 500 | acc | |
| xcopa_ta | ✓ | ✓ | 500 | acc | |
| xcopa_th | ✓ | ✓ | 500 | acc | |
| xcopa_tr | ✓ | ✓ | 500 | acc | |
| xcopa_vi | ✓ | ✓ | 500 | acc | |
| xcopa_zh | ✓ | ✓ | 500 | acc | |
| xnli_ar | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_bg | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_de | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_el | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_en | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_es | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_fr | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_hi | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_ru | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_sw | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_th | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_tr | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_ur | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_vi | ✓ | ✓ | ✓ | 5010 | acc |
| xnli_zh | ✓ | ✓ | ✓ | 5010 | acc |
| xstory_cloze_ar | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_en | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_es | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_eu | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_hi | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_id | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_my | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_ru | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_sw | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_te | ✓ | ✓ | 1511 | acc | |
| xstory_cloze_zh | ✓ | ✓ | 1511 | acc | |
| xwinograd_en | ✓ | 2325 | acc | ||
| xwinograd_fr | ✓ | 83 | acc | ||
| xwinograd_jp | ✓ | 959 | acc | ||
| xwinograd_pt | ✓ | 263 | acc | ||
| xwinograd_ru | ✓ | 315 | acc | ||
| xwinograd_zh | ✓ | 504 | acc |

This repository was archived by the owner on Nov 26, 2024. It is now read-only.