From be074e3122a162f550b4b04822bea97564e01b87 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Tue, 17 May 2022 15:51:52 +0200 Subject: [PATCH 1/6] Update dataset_infos for UDHN/udhr dataset --- datasets/udhr/dataset_infos.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/udhr/dataset_infos.json b/datasets/udhr/dataset_infos.json index 128da9af288..8d275a48db5 100644 --- a/datasets/udhr/dataset_infos.json +++ b/datasets/udhr/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464 languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.\n", "citation": "", "homepage": "https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "lang_key": {"dtype": "string", "id": null, "_type": "Value"}, "lang_name": {"dtype": "string", "id": null, "_type": "Value"}, "iso639-3": {"dtype": "string", "id": null, "_type": "Value"}, "iso15924": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "udhn", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6453404, "num_examples": 464, "dataset_name": "udhn"}}, "download_checksums": {"https://unicode.org/udhr/assemblies/udhr_xml.zip": {"num_bytes": 2273633, "checksum": "0565fa62c2ff155b84123198bcc967edd8c5eb9679eadc01e6fb44a5cf730fee"}, "https://unicode.org/udhr/assemblies/udhr_txt.zip": {"num_bytes": 2107471, "checksum": "087b474a070dd4096ae3028f9ee0b30dcdcb030cc85a1ca02e143be46327e5e5"}}, "download_size": 4381104, "post_processing_size": null, "dataset_size": 6453404, "size_in_bytes": 10834508}} \ No newline at end of file +{"default": {"description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464 languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.\n", "citation": "", "homepage": "https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "lang_key": {"dtype": "string", "id": null, "_type": "Value"}, "lang_name": {"dtype": "string", "id": null, "_type": "Value"}, "iso639-3": {"dtype": "string", "id": null, "_type": "Value"}, "iso15924": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "udhn", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6749909, "num_examples": 488, "dataset_name": "udhn"}}, "download_checksums": {"https://unicode.org/udhr/assemblies/udhr_xml.zip": {"num_bytes": 2389690, "checksum": "a3350912790196c6e1b26bfd1c8a50e8575f5cf185922ecd9bd15713d7d21438"}, "https://unicode.org/udhr/assemblies/udhr_txt.zip": {"num_bytes": 2215441, "checksum": "cb87ecb25b56f34e4fd6f22b323000524fd9c06ae2a29f122b048789cf17e9fe"}}, "download_size": 4605131, "post_processing_size": null, "dataset_size": 6749909, "size_in_bytes": 11355040}} \ No newline at end of file From 373ca2f4b12bac5e029d9fd54da96589040d39b8 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Thu, 19 May 2022 11:39:38 +0200 Subject: [PATCH 2/6] Update languages list, Add versioning --- datasets/udhr/README.md | 246 +++++++++++++++++++++++++++---- datasets/udhr/dataset_infos.json | 2 +- datasets/udhr/udhr.py | 2 + 3 files changed, 217 insertions(+), 33 deletions(-) diff --git a/datasets/udhr/README.md b/datasets/udhr/README.md index c8596fb15c5..d1f3da3f84e 100644 --- a/datasets/udhr/README.md +++ b/datasets/udhr/README.md @@ -8,12 +8,17 @@ languages: - ab - ace - acu +- acu - ada - ady - af - agr - aii - ajg +- ak +- ak +- ak-akuapem +- ak-asante - als - alt - am @@ -21,60 +26,84 @@ languages: - ame - ami - amr -- arb +- ar - arl - arn +- as - ast - auc -- ayr -- azj +- awa +- ay +- ayo +- az +- az-Cyrl +- az-Latn - ban +- bap - bax - bba - bci -- bcl - be - bem - bfa - bg +- bgp - bho - bi +- bik - bin +- bjj - blt - bm +- bmj - bn - bo - boa +- bpy - br +- brd +- brx - bs +- bs-Cyrl +- bs-Latn - buc - bug - bum +- bvi +- byh - ca - cab - cak +- cas +- cax - cbi - cbr - cbs - cbt - cbu - ccp +- cdm - ceb - cfm - ch - chj - chk - chr +- chr +- chx - cic - cjk +- cjk - cjs - cjy - ckb -- cmn +- ckb-Latn - cnh - cni +- cnr - co - cof +- con - cot - cpu - crh @@ -84,30 +113,38 @@ languages: - csa - csw - ctd +- cv - cy - da - dag - ddn - de +- de-1901 +- de-1996 - dga +- dhi +- dhw - dip +- dry - duu - dv - dyo - dyu - dz - ee -- ekk - el -- emk +- el-monoton +- el-polyton - en - eo - es - ese +- et - eu - eve - evn -- fat +- fa +- fa-AF - fi - fj - fkv @@ -115,95 +152,134 @@ languages: - fon - fr - fuf +- fuf-Adlm - fur - fuv +- fuv +- fvr - fy - ga - gaa - gag - gan -- gaz +- gbm - gd - gjn - gkp - gl - gld +- gn +- gno +- gnw +- gsw - gu - guc -- gug - guu - gv +- gvr - gyr - ha +- ha-NE +- ha-NG - hak - haw - he -- hea - hi - hil - hlt +- hmn - hms - hna +- hne - hni - hnj - hns +- hoc - hr - hsb +- hsn +- ht - ht - hu - hus +- hus +- hus - huu - hy - ia - ibb - id +- idu - ig - ii -- ike +- ijs - ilo - io - is - it +- iu +- ja +- ja - ja - jiv +- jul - jv +- jv-Java - ka +- kaa - kbd - kbp - kde - kdh - kea - kek +- kfa +- kfx +- kg +- kg-AO +- kgg - kha -- khk +- khr - kjh - kk - kkh +- kkh-Lana - kl - km - kmb -- kmr +- kmb - kn -- knc -- kng - ko - koi - koo - kqn - kqs +- kr +- kra - kri - krl +- ks +- ksw - ktu +- ku +- ku-Arab - kwi +- kxl - ky - la +- la - lad +- lah - lb +- lep - lg +- lhm - lia +- lif - lij - lld - ln +- ln - lns - lo - lob @@ -214,28 +290,41 @@ languages: - lue - lun - lus -- lvs +- lv - mad - mag - mai +- mai - mam +- man - maz - mcd - mcf - men - mfq +- mg +- mgp - mh - mi - mic - min - miq +- mjz - mk - ml +- ml +- mn +- mn-Cyrl +- mn-Mong - mnw +- mor - mos - mr - mt - mto +- mtp +- mvf +- mvf-Mong - mxi - mxv - my @@ -244,27 +333,39 @@ languages: - nb - nba - nds +- ne +- new - ng - nhn - nio - niu +- niv - njo - nku - nl - nn - not -- npi - nr - nso +- nus - nv - ny +- ny - nym - nyn - nzi - oaa - oc +- oc +- oc +- oc +- oc +- oc +- oc - ojb - oki +- om +- or - orh - os - ote @@ -273,27 +374,28 @@ languages: - pap - pau - pbb -- pbu - pcd - pcm -- pes - pis - piu - pl -- plt -- pnb - pon - pov +- pov - ppl - prq -- prs +- ps - pt +- pt-BR +- pt-PT +- pwo +- qu - qu - quc - qug - quh +- qul - quy -- quz - qva - qvc - qvh @@ -302,18 +404,39 @@ languages: - qwh - qxn - qxu +- rab +- raj - rar - rgn +- rhg +- rji +- rjs - rm +- rm-puter +- rm-rumgr +- rm-surmiran +- rm-sursilv +- rm-sutsilv +- rm-vallader +- rmn - rmn - rn - ro +- ro +- ro - ru - rup - rw +- rwr - sa +- sa-Gran - sah +- sat +- sc +- sck - sco +- scp +- sd - se - sey - sg @@ -321,27 +444,37 @@ languages: - shn - shp - si +- sja - sk +- skj - skr - sl +- slr - sm - sn - snk - snn - so - sr -- src +- sr-Cyrl +- sr-Latn +- srq - srr - ss - st - su - suk - sus +- suz +- suz - sv +- sw - swb -- swh - ta +- ta-LK - taj +- taq +- taq-Tfng - tbz - tca - tdt @@ -350,12 +483,21 @@ languages: - tet - tg - th +- th +- thf +- thl +- ths - ti - tiv +- tji - tk +- tk-Cyrl +- tk-Latn - tl +- tl-Tglg - tly - tn +- tna - to - tob - toi @@ -363,27 +505,54 @@ languages: - top - tpi - tr +- trn +- ts - ts - tsz - tt -- tw - ty - tyv - tzh - tzm +- tzm-Tfng - tzo +- udu - ug +- ug-Arab +- ug-Latn - uk - umb +- umb +- und +- und +- und +- und +- und +- und - und +- und +- und +- und +- und +- und +- und +- und +- und +- unr +- ur - ur - ura -- uzn +- uz +- uz-Cyrl +- uz-Latn - vai +- vay +- ve - ve - vec - vep - vi +- vi-Hani - vmw - wa - war @@ -392,22 +561,35 @@ languages: - wwa - xh - xsm +- xsr - yad - yao - yap -- ydd +- ybh +- yi - ykg - yo +- yrk - yua - yue +- yuz +- za - zam - zdj - zgh +- zh +- zh +- zh +- zh +- zh +- zh +- zh-Hant - zlm +- zlm-Arab +- zlm-Latn - zro - ztu - zu -- zyb licenses: - unknown multilinguality: @@ -474,7 +656,7 @@ This plain text version prepared by the “UDHR in Unicode” project, https://w ### Languages -The dataset includes translations of the document in 467 languages and dialects. The list of languages can be found +The dataset includes translations of the document in over 400 languages and dialects. The list of languages can be found [here](https://unicode.org/udhr/translations.html). ## Dataset Structure @@ -555,7 +737,7 @@ The txt/xml data files used here were compiled by The Unicode Consortium, which ### Licensing Information -© 1996 – 2009 The Office of the High Commissioner for Human Rights +© 1996 – 2022 The Office of the High Commissioner for Human Rights [More Information Needed] diff --git a/datasets/udhr/dataset_infos.json b/datasets/udhr/dataset_infos.json index 8d275a48db5..418fb0214b5 100644 --- a/datasets/udhr/dataset_infos.json +++ b/datasets/udhr/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464 languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.\n", "citation": "", "homepage": "https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "lang_key": {"dtype": "string", "id": null, "_type": "Value"}, "lang_name": {"dtype": "string", "id": null, "_type": "Value"}, "iso639-3": {"dtype": "string", "id": null, "_type": "Value"}, "iso15924": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "udhn", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6749909, "num_examples": 488, "dataset_name": "udhn"}}, "download_checksums": {"https://unicode.org/udhr/assemblies/udhr_xml.zip": {"num_bytes": 2389690, "checksum": "a3350912790196c6e1b26bfd1c8a50e8575f5cf185922ecd9bd15713d7d21438"}, "https://unicode.org/udhr/assemblies/udhr_txt.zip": {"num_bytes": 2215441, "checksum": "cb87ecb25b56f34e4fd6f22b323000524fd9c06ae2a29f122b048789cf17e9fe"}}, "download_size": 4605131, "post_processing_size": null, "dataset_size": 6749909, "size_in_bytes": 11355040}} \ No newline at end of file +{"default": {"description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464 languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.\n", "citation": "", "homepage": "https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "lang_key": {"dtype": "string", "id": null, "_type": "Value"}, "lang_name": {"dtype": "string", "id": null, "_type": "Value"}, "iso639-3": {"dtype": "string", "id": null, "_type": "Value"}, "iso15924": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "udhn", "config_name": "default", "version": {"version_str": "1.2022.0519", "description": null, "major": 1, "minor": 2022, "patch": 519}, "splits": {"train": {"name": "train", "num_bytes": 6749909, "num_examples": 488, "dataset_name": "udhn"}}, "download_checksums": {"https://unicode.org/udhr/assemblies/udhr_xml.zip": {"num_bytes": 2389690, "checksum": "a3350912790196c6e1b26bfd1c8a50e8575f5cf185922ecd9bd15713d7d21438"}, "https://unicode.org/udhr/assemblies/udhr_txt.zip": {"num_bytes": 2215441, "checksum": "cb87ecb25b56f34e4fd6f22b323000524fd9c06ae2a29f122b048789cf17e9fe"}}, "download_size": 4605131, "post_processing_size": null, "dataset_size": 6749909, "size_in_bytes": 11355040}} \ No newline at end of file diff --git a/datasets/udhr/udhr.py b/datasets/udhr/udhr.py index da4126fe9a5..e47224ff9f7 100644 --- a/datasets/udhr/udhr.py +++ b/datasets/udhr/udhr.py @@ -46,6 +46,8 @@ class UDHN(datasets.GeneratorBasedBuilder): """Universal Declaration of Human Rights""" + VERSION = datasets.Version("1.2022.0519") + def _info(self): return datasets.DatasetInfo( description=_DESCRIPTION, From 106253aff4bd8e05a5900c651977334342ff0912 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Fri, 20 May 2022 16:30:12 +0200 Subject: [PATCH 3/6] Reset version; remove stage1+2 languages; add licensing and repo information --- datasets/udhr/README.md | 147 ++----------------------------- datasets/udhr/dataset_infos.json | 2 +- datasets/udhr/udhr.py | 4 +- 3 files changed, 11 insertions(+), 142 deletions(-) diff --git a/datasets/udhr/README.md b/datasets/udhr/README.md index d1f3da3f84e..6480bcd55bb 100644 --- a/datasets/udhr/README.md +++ b/datasets/udhr/README.md @@ -8,7 +8,6 @@ languages: - ab - ace - acu -- acu - ada - ady - af @@ -16,7 +15,6 @@ languages: - aii - ajg - ak -- ak - ak-akuapem - ak-asante - als @@ -29,17 +27,13 @@ languages: - ar - arl - arn -- as - ast - auc -- awa - ay -- ayo - az - az-Cyrl - az-Latn - ban -- bap - bax - bba - bci @@ -47,53 +41,39 @@ languages: - bem - bfa - bg -- bgp - bho - bi - bik - bin -- bjj - blt - bm -- bmj - bn - bo - boa -- bpy - br -- brd -- brx - bs - bs-Cyrl - bs-Latn - buc - bug - bum -- bvi -- byh - ca - cab - cak -- cas -- cax - cbi - cbr - cbs - cbt - cbu - ccp -- cdm - ceb - cfm - ch - chj - chk - chr -- chr -- chx - cic - cjk -- cjk - cjs - cjy - ckb @@ -103,7 +83,6 @@ languages: - cnr - co - cof -- con - cot - cpu - crh @@ -113,7 +92,6 @@ languages: - csa - csw - ctd -- cv - cy - da - dag @@ -122,10 +100,7 @@ languages: - de-1901 - de-1996 - dga -- dhi -- dhw - dip -- dry - duu - dv - dyo @@ -155,33 +130,28 @@ languages: - fuf-Adlm - fur - fuv -- fuv - fvr - fy - ga - gaa - gag - gan -- gbm - gd - gjn - gkp - gl - gld - gn -- gno -- gnw - gsw - gu - guc - guu - gv -- gvr - gyr - ha +- hak - ha-NE - ha-NG -- hak - haw - he - hi @@ -190,20 +160,15 @@ languages: - hmn - hms - hna -- hne - hni - hnj - hns -- hoc - hr - hsb - hsn - ht -- ht - hu - hus -- hus -- hus - huu - hy - ia @@ -219,10 +184,7 @@ languages: - it - iu - ja -- ja -- ja - jiv -- jul - jv - jv-Java - ka @@ -233,13 +195,9 @@ languages: - kdh - kea - kek -- kfa -- kfx - kg - kg-AO -- kgg - kha -- khr - kjh - kk - kkh @@ -247,7 +205,6 @@ languages: - kl - km - kmb -- kmb - kn - ko - koi @@ -255,31 +212,21 @@ languages: - kqn - kqs - kr -- kra - kri - krl -- ks -- ksw - ktu - ku -- ku-Arab - kwi -- kxl - ky - la -- la - lad - lah - lb -- lep - lg -- lhm - lia -- lif - lij - lld - ln -- ln - lns - lo - lob @@ -294,7 +241,6 @@ languages: - mad - mag - mai -- mai - mam - man - maz @@ -303,28 +249,21 @@ languages: - men - mfq - mg -- mgp - mh - mi - mic - min - miq -- mjz - mk - ml -- ml - mn - mn-Cyrl -- mn-Mong - mnw - mor - mos - mr - mt - mto -- mtp -- mvf -- mvf-Mong - mxi - mxv - my @@ -334,7 +273,6 @@ languages: - nba - nds - ne -- new - ng - nhn - nio @@ -347,25 +285,16 @@ languages: - not - nr - nso -- nus - nv - ny -- ny - nym - nyn - nzi - oaa - oc -- oc -- oc -- oc -- oc -- oc -- oc - ojb - oki - om -- or - orh - os - ote @@ -381,20 +310,16 @@ languages: - pl - pon - pov -- pov - ppl - prq - ps - pt - pt-BR - pt-PT -- pwo -- qu - qu - quc - qug - quh -- qul - quy - qva - qvc @@ -404,39 +329,26 @@ languages: - qwh - qxn - qxu -- rab -- raj - rar - rgn -- rhg -- rji -- rjs - rm +- rmn - rm-puter - rm-rumgr - rm-surmiran - rm-sursilv - rm-sutsilv - rm-vallader -- rmn -- rmn - rn - ro -- ro -- ro - ru - rup - rw -- rwr - sa - sa-Gran - sah -- sat - sc -- sck - sco -- scp -- sd - se - sey - sg @@ -444,9 +356,7 @@ languages: - shn - shp - si -- sja - sk -- skj - skr - sl - slr @@ -458,23 +368,18 @@ languages: - sr - sr-Cyrl - sr-Latn -- srq - srr - ss - st - su - suk - sus -- suz -- suz - sv - sw - swb - ta -- ta-LK - taj -- taq -- taq-Tfng +- ta-LK - tbz - tca - tdt @@ -483,21 +388,14 @@ languages: - tet - tg - th -- th -- thf -- thl -- ths - ti - tiv -- tji - tk - tk-Cyrl - tk-Latn - tl -- tl-Tglg - tly - tn -- tna - to - tob - toi @@ -505,8 +403,6 @@ languages: - top - tpi - tr -- trn -- ts - ts - tsz - tt @@ -514,7 +410,6 @@ languages: - tyv - tzh - tzm -- tzm-Tfng - tzo - udu - ug @@ -522,32 +417,13 @@ languages: - ug-Latn - uk - umb -- umb -- und -- und -- und -- und -- und -- und -- und -- und -- und - und -- und -- und -- und -- und -- und -- unr -- ur - ur - ura - uz - uz-Cyrl - uz-Latn - vai -- vay -- ve - ve - vec - vep @@ -561,28 +437,20 @@ languages: - wwa - xh - xsm -- xsr - yad - yao - yap -- ybh - yi - ykg - yo - yrk - yua - yue -- yuz - za - zam - zdj - zgh - zh -- zh -- zh -- zh -- zh -- zh - zh-Hant - zlm - zlm-Arab @@ -634,7 +502,7 @@ pretty_name: The Universal Declaration of Human Rights (UDHR) ## Dataset Description - **Homepage:** https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx, https://unicode.org/udhr/index.html -- **Repository:** +- **Repository:** https://github.com/unicode-org/udhr - **Paper:** - **Leaderboard:** - **Point of Contact:** @@ -737,9 +605,10 @@ The txt/xml data files used here were compiled by The Unicode Consortium, which ### Licensing Information -© 1996 – 2022 The Office of the High Commissioner for Human Rights +Source text © 1996 – 2022 The Office of the High Commissioner for Human Rights + +The [Unicode license](https://www.unicode.org/license.txt) applies to these translations. -[More Information Needed] ### Citation Information @@ -747,4 +616,4 @@ The txt/xml data files used here were compiled by The Unicode Consortium, which ### Contributions -Thanks to [@joeddav](https://github.com/joeddav) for adding this dataset. +Thanks to [@joeddav](https://github.com/joeddav) for adding this dataset. Updated May 2022 [@leondz](https://github.com/leondz). diff --git a/datasets/udhr/dataset_infos.json b/datasets/udhr/dataset_infos.json index 418fb0214b5..4bcca0c85e4 100644 --- a/datasets/udhr/dataset_infos.json +++ b/datasets/udhr/dataset_infos.json @@ -1 +1 @@ -{"default": {"description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464 languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.\n", "citation": "", "homepage": "https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "lang_key": {"dtype": "string", "id": null, "_type": "Value"}, "lang_name": {"dtype": "string", "id": null, "_type": "Value"}, "iso639-3": {"dtype": "string", "id": null, "_type": "Value"}, "iso15924": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "udhn", "config_name": "default", "version": {"version_str": "1.2022.0519", "description": null, "major": 1, "minor": 2022, "patch": 519}, "splits": {"train": {"name": "train", "num_bytes": 6749909, "num_examples": 488, "dataset_name": "udhn"}}, "download_checksums": {"https://unicode.org/udhr/assemblies/udhr_xml.zip": {"num_bytes": 2389690, "checksum": "a3350912790196c6e1b26bfd1c8a50e8575f5cf185922ecd9bd15713d7d21438"}, "https://unicode.org/udhr/assemblies/udhr_txt.zip": {"num_bytes": 2215441, "checksum": "cb87ecb25b56f34e4fd6f22b323000524fd9c06ae2a29f122b048789cf17e9fe"}}, "download_size": 4605131, "post_processing_size": null, "dataset_size": 6749909, "size_in_bytes": 11355040}} \ No newline at end of file +{"default": {"description": "The Universal Declaration of Human Rights (UDHR) is a milestone document in the history of human rights. Drafted by\nrepresentatives with different legal and cultural backgrounds from all regions of the world, it set out, for the\nfirst time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General\nAssembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the\ndocument in 464 languages and dialects.\n\n\u00a9 1996 \u2013 2009 The Office of the High Commissioner for Human Rights\n\nThis plain text version prepared by the \u201cUDHR in Unicode\u201d project, https://www.unicode.org/udhr.\n", "citation": "", "homepage": "https://www.ohchr.org/EN/UDHR/Pages/UDHRIndex.aspx", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "lang_key": {"dtype": "string", "id": null, "_type": "Value"}, "lang_name": {"dtype": "string", "id": null, "_type": "Value"}, "iso639-3": {"dtype": "string", "id": null, "_type": "Value"}, "iso15924": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "udhn", "config_name": "default", "version": {"version_str": "1.0.0", "description": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 6749909, "num_examples": 488, "dataset_name": "udhn"}}, "download_checksums": {"https://unicode.org/udhr/assemblies/udhr_xml.zip": {"num_bytes": 2389690, "checksum": "a3350912790196c6e1b26bfd1c8a50e8575f5cf185922ecd9bd15713d7d21438"}, "https://unicode.org/udhr/assemblies/udhr_txt.zip": {"num_bytes": 2215441, "checksum": "cb87ecb25b56f34e4fd6f22b323000524fd9c06ae2a29f122b048789cf17e9fe"}}, "download_size": 4605131, "post_processing_size": null, "dataset_size": 6749909, "size_in_bytes": 11355040}} \ No newline at end of file diff --git a/datasets/udhr/udhr.py b/datasets/udhr/udhr.py index e47224ff9f7..28b5f8e781b 100644 --- a/datasets/udhr/udhr.py +++ b/datasets/udhr/udhr.py @@ -29,7 +29,7 @@ representatives with different legal and cultural backgrounds from all regions of the world, it set out, for the first time, fundamental human rights to be universally protected. The Declaration was adopted by the UN General Assembly in Paris on 10 December 1948 during its 183rd plenary meeting. The dataset includes translations of the -document in 464 languages and dialects. +document in 464+ languages and dialects. © 1996 – 2009 The Office of the High Commissioner for Human Rights @@ -46,7 +46,7 @@ class UDHN(datasets.GeneratorBasedBuilder): """Universal Declaration of Human Rights""" - VERSION = datasets.Version("1.2022.0519") + VERSION = datasets.Version("1.0.0") def _info(self): return datasets.DatasetInfo( From 4bc4d38925d8841381210238795eda690374d0cc Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Fri, 20 May 2022 19:51:25 +0200 Subject: [PATCH 4/6] bump dummy version --- datasets/udhr/dummy/{0.0.0 => 1.0.0}/dummy_data.zip | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename datasets/udhr/dummy/{0.0.0 => 1.0.0}/dummy_data.zip (100%) diff --git a/datasets/udhr/dummy/0.0.0/dummy_data.zip b/datasets/udhr/dummy/1.0.0/dummy_data.zip similarity index 100% rename from datasets/udhr/dummy/0.0.0/dummy_data.zip rename to datasets/udhr/dummy/1.0.0/dummy_data.zip From 0299c1fc1d5021593f1a5ba60cd2b4ec5e233ddf Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 8 Jun 2022 14:27:32 +0200 Subject: [PATCH 5/6] Fix language tags --- datasets/udhr/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/udhr/README.md b/datasets/udhr/README.md index 6480bcd55bb..cb87f9f8ac1 100644 --- a/datasets/udhr/README.md +++ b/datasets/udhr/README.md @@ -14,9 +14,6 @@ languages: - agr - aii - ajg -- ak -- ak-akuapem -- ak-asante - als - alt - am @@ -120,6 +117,7 @@ languages: - evn - fa - fa-AF +- fat - fi - fj - fkv @@ -406,6 +404,8 @@ languages: - ts - tsz - tt +- tw-akuapem +- tw-asante - ty - tyv - tzh From f8d72c33880ef1d6c7112af8a3fc7aa408f59f4c Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 8 Jun 2022 20:14:46 +0200 Subject: [PATCH 6/6] Add citation to dataset card --- datasets/udhr/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/udhr/README.md b/datasets/udhr/README.md index cb87f9f8ac1..a6737d8c59c 100644 --- a/datasets/udhr/README.md +++ b/datasets/udhr/README.md @@ -612,7 +612,7 @@ The [Unicode license](https://www.unicode.org/license.txt) applies to these tran ### Citation Information -[More Information Needed] +United Nations. (1998). The Universal Declaration of Human Rights, 1948-1998. New York: United Nations Dept. of Public Information. ### Contributions