Skip to content

Commit 2ab8879

Browse files
committed
fix detectLangsFromName and add a test set, including tests from PR #700
1 parent 82fb3fb commit 2ab8879

File tree

3 files changed

+307
-3
lines changed

3 files changed

+307
-3
lines changed

pyglossary/glossary_info.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,18 +192,18 @@ def detectLangsFromName(self) -> None:
192192
if self._info.get(c_sourceLang):
193193
return
194194

195+
name = name.lower().replace("_", " ")
196+
195197
langNames = []
196198

197199
def checkPart(part: str) -> None:
198200
for match in re.findall(r"\w\w\w*", part):
199-
# print(f"{match = }")
200201
lang = langDict[match]
201202
if lang is None:
202203
continue
203204
langNames.append(lang.name)
204205

205-
for part in re.split("-|_| to ", name):
206-
# print(f"{part = }")
206+
for part in re.split("-| to ", name):
207207
checkPart(part)
208208
if len(langNames) >= 2: # noqa: PLR2004
209209
break

tests/glossary_v2_test.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pyglossary.glossary_v2 import ConvertArgs, Glossary
2424
from pyglossary.os_utils import rmtree
2525
from pyglossary.text_utils import crc32hex
26+
from tests.name_langs_test_data import nameLangsTestData
2627

2728
if TYPE_CHECKING:
2829
from collections.abc import Callable
@@ -539,6 +540,17 @@ def test_lang_getObj_target(self):
539540
if glos.targetLang is not None:
540541
self.assertEqual(glos.targetLang.name, "Malay")
541542

543+
def test_lang_detect_dataset_1(self):
    """Check language detection against every entry of nameLangsTestData.

    Each entry is a ``(name, sourceLang, targetLang)`` tuple. A falsy
    expected language (``None`` or ``""``) is compared as ``""``, i.e. the
    glossary is expected to report an empty language name for it.
    """
    for name, sourceLang, targetLang in nameLangsTestData:
        # subTest lets the run continue past a failing name, so every
        # mismatch in the dataset is reported instead of only the first.
        with self.subTest(name=name):
            glos = self.glos = Glossary()
            glos.setInfo("name", name)
            glos.detectLangsFromName()
            self.assertEqual(
                (glos.sourceLangName, glos.targetLangName),
                (sourceLang or "", targetLang or ""),
                f"{name=}",
            )
553+
542554
def test_lang_detect_1(self):
543555
glos = self.glos = Glossary()
544556
glos.setInfo("name", "en-fa")
@@ -584,6 +596,27 @@ def test_lang_detect_5(self):
584596
("English", "German"),
585597
)
586598

599+
def test_lang_detect_6(self):
    """Pin the current result for a name with a multi-word source language.

    With the name "Church Slavonic-deu.index" neither the source nor the
    target language is expected to be set.
    """
    glos = self.glos = Glossary()
    glos.setInfo("name", "Church Slavonic-deu.index")
    glos.detectLangsFromName()
    # NOTE(review): presumably the ideal result would be
    # ("Church Slavonic", "German"); this assertion pins what is
    # detected today -- confirm before changing the detector.
    self.assertEqual(
        (glos.sourceLangName, glos.targetLangName),
        ("", ""),
    )
609+
610+
# FIXME: should be either ("", "") or ("Fijian", "German")
611+
def test_lang_detect_7(self):
    """Pin the languages currently detected for "Na vosa vaka-Viti-deu.index"."""
    self.glos = Glossary()
    self.glos.setInfo("name", "Na vosa vaka-Viti-deu.index")
    self.glos.detectLangsFromName()
    # Compare source and target together so a failure shows both values.
    detected = (self.glos.sourceLangName, self.glos.targetLangName)
    self.assertEqual(detected, ("Nauruan", "German"))
619+
587620
def convert_to_txtZip(
588621
self,
589622
fname, # input file with extension

tests/name_langs_test_data.py

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
nameLangsTestData = [
2+
# multi-word language names that we currently detect incorrectly:
3+
# ("Eurfa Cymraeg, Welsh-English Eurfa/Freedict dictionary", "Welsh", "English"), # cym-eng
4+
# ("Norwegian Nynorsk-Norwegian Bokmål FreeDict Dictionary", "Norwegian Nynorsk", "Norwegian"), # nno-nob
5+
# cases that are detected badly and for which detection should be skipped:
6+
# ("Farajbeik Farsi ( Windows Farsi )", None, None),
7+
# ("Computer And IT Dictionary for Persian v4.01", None, None),
8+
# (
9+
# "French (and/or English) to Pârsi (Persian) epistemological Dict. (Latin chars)",
10+
# "French",
11+
# "Persian",
12+
# ),
13+
("Castellano >Turko Diccionario", None, None),
14+
("LeXiCoN castelán-galego", None, None),
15+
("Aragonés-Castellán", None, None),
16+
("Arab2English", None, None),
17+
("The Romanization of Korean", None, None),
18+
("Deutschland auf Google Maps", None, None),
19+
("Glossary of Latin-Genus-Names", None, None),
20+
("Dictionario Interlingua - Nederlandese", None, None),
21+
("Papiamento - Dutch", None, None),
22+
("Loghatname(Alef)", None, None),
23+
("Lunfardo (Argentina)", None, None),
24+
("Medciclopedia", None, None),
25+
("spanis learner's dictionary", None, None),
26+
("Dutch - Papiamento", None, None),
27+
("Dahl's Russian Dictionary", None, None),
28+
("Great Encyclopedic Glossary", None, None),
29+
("Efremova (Russian Explanatory Dictionary)", None, None),
30+
("RUSSIAN LEARNER'S DICTIONARY", None, None),
31+
("One-Click Engllish-Urdu Dictionary v1.3", None, None),
32+
("Acronyms from A - Z", None, None),
33+
("hFarsi - advanced version", None, None),
34+
("Mathematics Glossary - Mohammad Reza Majidee", None, None),
35+
("Industrial Engineering Version 2.0", None, None),
36+
("Geology Science (M.M.Ma'leki)", None, None),
37+
("Farsi Aviation Dictionary", None, None),
38+
("Persian Computer Encyclopedia", None, None),
39+
("Mokhtari Law Dict. (v1.0)", None, None),
40+
("Hafez Poems", None, None),
41+
("INGLESPANISH", None, None),
42+
("Mehran - All about Computer", None, None),
43+
("Surinaams-Nederlands Trafasi", None, None),
44+
("Technisch E-NL Woordenboek", None, None),
45+
("Nederlands - Surinaams Trafasi", None, None),
46+
("Collins Cobuild 5", None, None),
47+
("Glossary of Computer and Internet Terms", None, None),
48+
("Customs and Excise Glossary", None, None),
49+
("Currency In Each Country", None, None),
50+
("9300+ Computer Acronyms", None, None),
51+
("Legal Systems of All Countries", None, None),
52+
("QURAN", None, None),
53+
("Britannica Concise Encyclopedia", None, None),
54+
("Collins English Dictionary", None, None),
55+
("XML Acronym Demystifier", None, None),
56+
("Solar Physics Glossary", None, None),
57+
("English Phonetics", None, None),
58+
("Flavours of Malaysia/ Malaysian delights", None, None),
59+
("Wordset.org", None, None),
60+
("Astronomy and Physics Terms by ExploreSpace.com", None, None),
61+
("Sorani-Kurmanji Ferheng/FreeDict Dictionary", None, None), # ckb-kmr
62+
("Aryanpour (en-fa, fa-en)", "English", "Persian"),
63+
("Castellano-Catalán", "Spanish", "Catalan"),
64+
("MB_Dictionary Spanish_to_Persian", "Spanish", "Persian"),
65+
("Deutsch-English FreeDict+WikDict dictionary (de-en)", "German", "English"),
66+
("Persisch-Deutsch; Deutsch-Persisch (Alefbâye 2om)", "Persian", "German"),
67+
("Indonesia-Nederlands", "Indonesian", "Dutch"),
68+
("Lexin Svensk-Spanskt Lexikon", "Swedish", "Spanish"),
69+
("Azhdari :: German To Persian Glossary 1.1", "German", "Persian"),
70+
("Latvian-Russian Dictionary", "Latvian", "Russian"),
71+
("AACS Mongolian-English", "Mongolian", "English"),
72+
("Urdu to English Gloassry", "Urdu", "English"),
73+
("ADO'S WOORDENBOEK TURKS-NEDERLANDS", "Turkish", "Dutch"),
74+
("Babylon Turkish-English", "Turkish", "English"),
75+
("German to Persian", "German", "Persian"),
76+
("deutsch-spanisch", "German", "Spanish"),
77+
("Schiffahrtsausdrücke Deutsch - Holländisch", "German", "Dutch"),
78+
("ADO's Deutsch-Niederländisch", "German", "Dutch"),
79+
("Elif - German / English Tourist Dic.", "German", "English"),
80+
("Babylon German-English", "German", "English"),
81+
("WinCept Glass Dictionary (GER>ENG)", "German", "English"),
82+
("Babylon German-English", "German", "English"),
83+
("technical terms German-English", "German", "English"),
84+
("Runasimi (Quechua) - Español", "Quechua", "Quechua"),
85+
("Persisch-Deutsch (Alefbā-ye 2om, 2. persisches Alphabet)", "Persian", "German"),
86+
("Azhdari ::: Persian To German Glossary version 1.1", "Persian", "German"),
87+
("Farsi to arabic", "Persian", "Arabic"),
88+
("Persian Italian Glossary", "Persian", "Italian"),
89+
("Ourstat - Farsi to English Dictionary", "Persian", "English"),
90+
("HmT - Persian to English Glossary", "Persian", "English"),
91+
("Arianpour Persian-English (OpenDictionary)", "Persian", "English"),
92+
("Malay to English", "Malay", "English"),
93+
("Korean-English Dictionary", "Korean", "English"),
94+
("Babylon Korean-English", "Korean", "English"),
95+
("Babylon Chinese(S)-English", "Chinese", "English"),
96+
("euskera-español", "Basque", "Spanish"),
97+
("Babylon Japanese-English", "Japanese", "English"),
98+
("Animal names in Latin and English", "Latin", "English"),
99+
("Dictionary Portuguese - Dutch", "Portuguese", "Dutch"),
100+
("Finnish To farsi", "Finnish", "Persian"),
101+
("Català-Castellà", "Catalan", "Spanish"),
102+
("Esp-Deu Wörterbuch", "Spanish", "German"),
103+
("ADO's SPANISCH-DEUTSCH", "Spanish", "German"),
104+
("Spanish To Farsi", "Spanish", "Persian"),
105+
("JM Spanish-Danish Dictionary", "Spanish", "Danish"),
106+
("Spa-Fin", "Spanish", "Finnish"),
107+
("Spanish-Bulgarian", "Spanish", "Bulgarian"),
108+
("Babylon Spanish-English", "Spanish", "English"),
109+
(
110+
"A Spanish-English Dictionary (Granada University, Spain), 14.4",
111+
"Spanish",
112+
"English",
113+
),
114+
("Babylon Spanish-English", "Spanish", "English"),
115+
("Spanish-English Online Dictionaries", "Spanish", "English"),
116+
("Arabic to Farsi", "Arabic", "Persian"),
117+
("ADO's FRENCH-GERMAN", "French", "German"),
118+
("French to Farsi", "French", "Persian"),
119+
("MB_Dictionary French_to_Persian", "French", "Persian"),
120+
("French-Chinese GBK", "French", "Chinese"),
121+
("French-Chinese", "French", "Chinese"),
122+
("French-Bulgarian", "French", "Bulgarian"),
123+
("Babylon French-English", "French", "English"),
124+
("Nederlands-Indonesia", "Dutch", "Indonesian"),
125+
("Nederlands-Duits", "Dutch", "German"),
126+
("Nederlands - Kroatisch woordenboek", "Dutch", "Croatian"),
127+
("Woordenboek Dutch - Portugees", "Dutch", "Portuguese"),
128+
("Néerlandais-Français", "Dutch", "French"),
129+
("Bahasa Indonesia-Nederlands Adaptasi", "Indonesian", "Dutch"),
130+
("Nederlands-Bahasa Indonesia Adaptasi", "Dutch", "Indonesian"),
131+
("Dutch-English Online Dictionay", "Dutch", "English"),
132+
("Dutch_English 22000", "Dutch", "English"),
133+
("Babylon Dutch-English", "Dutch", "English"),
134+
("Russian-Latvian Dictionary", "Russian", "Latvian"),
135+
("Russian-Turkish Dictionary", "Russian", "Turkish"),
136+
("Russisch-Deutsch Woerterbuch", "Russian", "German"),
137+
("MHM Russian > Persian Dictionary", "Russian", "Persian"),
138+
("Babylon Russian-English", "Russian", "English"),
139+
("Italian to farsi", "Italian", "Persian"),
140+
("Italian Persian glossary", "Italian", "Persian"),
141+
("Italian>Farsi(Persian) Advanced V 3.0", "Italian", "Persian"),
142+
("Italian Persian glossary", "Italian", "Persian"),
143+
("Italiano - Español (GI)", "Italian", "Spanish"),
144+
("Babylon Italian-English", "Italian", "English"),
145+
("English To Urdu Lughat", "English", "Urdu"),
146+
("English-Urdu dictionary", "English", "Urdu"),
147+
("Babylon English-Turkish", "English", "Turkish"),
148+
("English-Turkish", "English", "Turkish"),
149+
("Eng-Tur_Computer/Electronics Terms", "English", "Turkish"),
150+
("WinCept Glass Dictionary (ENG>GER)", "English", "German"),
151+
("PONS Universelles Wörterbuch Englisch-Deutsch", "English", "German"),
152+
("Babylon English-German", "English", "German"),
153+
("English to Malay", "English", "Malay"),
154+
("Salaty English-Farsi Dict. (Text ver.)", "English", "Persian"),
155+
("HmT - English to Persian Glossary", "English", "Persian"),
156+
("Morteza English > Farsi", "English", "Persian"),
157+
("Dr. ALLI Malay - Farsi Dictionary", "Malay", "Persian"),
158+
("Arianpour English-Persian (OpenDictionary)", "English", "Persian"),
159+
("Salaty English-Farsi Dict. (Graphical ver.)", "English", "Persian"),
160+
("Accounting English-Persian", "English", "Persian"),
161+
("PAKcw English-Korean Dictionary", "English", "Korean"),
162+
("Babylon English-Korean", "English", "Korean"),
163+
("English-Spanish Online Dictionaries", "English", "Spanish"),
164+
("Babylon English-Spanish", "English", "Spanish"),
165+
("English_Spanish by Jaime Aguirre", "English", "Spanish"),
166+
(
167+
"An English-Spanish Dictionary (Granada University, Spain), 14.4",
168+
"English",
169+
"Spanish",
170+
),
171+
("Wadan English-Arabic Auditing Terms", "English", "Arabic"),
172+
("English 2 Arabic Glossary", "English", "Arabic"),
173+
("English 2 Arabic", "English", "Arabic"),
174+
("Babylon English-French", "English", "French"),
175+
("Babylon English-Dutch", "English", "Dutch"),
176+
("English-Dutch Online Dictionary", "English", "Dutch"),
177+
("Morteza English > Russian", "English", "Russian"),
178+
("english-russian", "English", "Russian"),
179+
("English/Russian - Mueller24", "English", "Russian"),
180+
("English-Russian Lingvistica'98 dictionary", "English", "Russian"),
181+
("Babylon English-Russian", "English", "Russian"),
182+
("Babylon English-English", "English", "English"),
183+
("Afrikaans-German FreeDict Dictionary", "Afrikaans", "German"), # afr-deu
184+
("Afrikaans-English FreeDict Dictionary", "Afrikaans", "English"), # afr-eng
185+
("Arabic-English FreeDict Dictionary", "Arabic", "English"), # ara-eng
186+
("Breton-French FreeDict Dictionary (Geriadur Tomaz)", "Breton", "French"), # bre-fra
187+
("Czech-English FreeDict Dictionary", "Czech", "English"), # ces-eng
188+
("Danish-English FreeDict Dictionary", "Danish", "English"), # dan-eng
189+
("German-Italian FreeDict Dictionary", "German", "Italian"), # deu-ita
190+
("German-Kurdish Ferheng/FreeDict Dictionary", "German", "Kurdish"), # deu-kur
191+
("German-Dutch FreeDict Dictionary", "German", "Dutch"), # deu-nld
192+
("German-Portuguese FreeDict Dictionary", "German", "Portuguese"), # deu-por
193+
("German-Turkish Ferheng/FreeDict Dictionary", "German", "Turkish"), # deu-tur
194+
("English-Afrikaans FreeDict Dictionary", "English", "Afrikaans"), # eng-afr
195+
("English-Arabic FreeDict Dictionary", "English", "Arabic"), # eng-ara
196+
("English-Czech dicts.info/FreeDict Dictionary", "English", "Czech"), # eng-ces
197+
(
198+
"Eurfa Saesneg, English-Welsh Eurfa/Freedict dictionary",
199+
"English",
200+
"Welsh",
201+
), # eng-cym
202+
("English-Danish FreeDict Dictionary", "English", "Danish"), # eng-dan
203+
("English - Modern Greek XDXF/FreeDict dictionary", "English", "Greek"), # eng-ell
204+
("English-French FreeDict Dictionary", "English", "French"), # eng-fra
205+
("English-Irish FreeDict Dictionary", "English", "Irish"), # eng-gle
206+
("English-Hindi FreeDict Dictionary", "English", "Hindi"), # eng-hin
207+
("English-Croatian FreeDict Dictionary", "English", "Croatian"), # eng-hrv
208+
("English-Hungarian FreeDict Dictionary", "English", "Hungarian"), # eng-hun
209+
("English-Italian FreeDict Dictionary", "English", "Italian"), # eng-ita
210+
("English-Latin FreeDict Dictionary", "English", "Latin"), # eng-lat
211+
("English-Lithuanian FreeDict Dictionary", "English", "Lithuanian"), # eng-lit
212+
("English-Dutch FreeDict Dictionary", "English", "Dutch"), # eng-nld
213+
(
214+
"English - Polish Piotrowski+Saloni/FreeDict dictionary",
215+
"English",
216+
"Polish",
217+
), # eng-pol
218+
("English-Portuguese FreeDict Dictionary", "English", "Portuguese"), # eng-por
219+
("English-Romanian FreeDict Dictionary", "English", "Romanian"), # eng-rom
220+
("English-Serbian FreeDict Dictionary", "English", "Serbian"), # eng-srp
221+
("English-Swahili xFried/FreeDict Dictionary", "English", "Swahili"), # eng-swh
222+
("English-Turkish FreeDict Dictionary", "English", "Turkish"), # eng-tur
223+
("French-Breton FreeDict Dictionary (Geriadur Tomaz)", "French", "Breton"), # fra-bre
224+
("French-English FreeDict Dictionary", "French", "English"), # fra-eng
225+
("French-Dutch FreeDict Dictionary", "French", "Dutch"), # fra-nld
226+
(
227+
"Scottish Gaelic-German FreeDict Dictionary",
228+
"Scottish Gaelic",
229+
"German",
230+
), # gla-deu
231+
("Irish-English FreeDict Dictionary", "Irish", "English"), # gle-eng
232+
("Irish-Polish FreeDict Dictionary", "Irish", "Polish"), # gle-pol
233+
("Croatian-English FreeDict Dictionary", "Croatian", "English"), # hrv-eng
234+
("Hungarian-English FreeDict Dictionary", "Hungarian", "English"), # hun-eng
235+
("íslenska - English FreeDict Dictionary", "Icelandic", "English"), # isl-eng
236+
("Italian-German FreeDict Dictionary", "Italian", "German"), # ita-deu
237+
("Italian-English FreeDict Dictionary", "Italian", "English"), # ita-eng
238+
("Japanese-German FreeDict Dictionary", "Japanese", "German"), # jpn-deu
239+
("Japanese-English FreeDict Dictionary", "Japanese", "English"), # jpn-eng
240+
("Japanese-French FreeDict Dictionary", "Japanese", "French"), # jpn-fra
241+
("Japanese-Russian FreeDict Dictionary", "Japanese", "Russian"), # jpn-rus
242+
("Khasi - German FreeDict Dictionary", "Khasi", "German"), # kha-deu
243+
("Khasi-English FreeDict Dictionary", "Khasi", "English"), # kha-eng
244+
("Kurdish-German Ferheng/FreeDict Dictionary", "Kurdish", "German"), # kur-deu
245+
("Kurdish-English Ferheng/FreeDict Dictionary", "Kurdish", "English"), # kur-eng
246+
("Kurdish-Turkish Ferheng/FreeDict Dictionary", "Kurdish", "Turkish"), # kur-tur
247+
("Lateinisch-Deutsch FreeDict-Wörterbuch", "Latin", "German"), # lat-deu
248+
("Latin-English FreeDict Dictionary", "Latin", "English"), # lat-eng
249+
("Lithuanian-English FreeDict Dictionary", "Lithuanian", "English"), # lit-eng
250+
("Macedonian - Bulgarian FreeDict Dictionary", "Macedonian", "Bulgarian"), # mkd-bul
251+
("Dutch-German FreeDict Dictionary", "Dutch", "German"), # nld-deu
252+
("Dutch-English Freedict Dictionary", "Dutch", "English"), # nld-eng
253+
("Nederlands-French FreeDict Dictionary", "Dutch", "French"), # nld-fra
254+
("Lenga d'òc - Català FreeDict Dictionary", "", ""), # oci-cat
255+
("Polish-Irish FreeDict Dictionary", "Polish", "Irish"), # pol-gle
256+
("Portuguese-German FreeDict Dictionary", "Portuguese", "German"), # por-deu
257+
("Portuguese-English FreeDict Dictionary", "Portuguese", "English"), # por-eng
258+
("Sanskrit-German FreeDict Dictionary", "Sanskrit", "German"), # san-deu
259+
("Slovak-English FreeDict Dictionary", "Slovak", "English"), # slk-eng
260+
("Slovenian-English FreeDict Dictionary", "Slovene", "English"), # slv-eng
261+
("Spanish - Asturian FreeDict Dictionary", "Spanish", "Asturian"), # spa-ast
262+
("Spanish-English FreeDict Dictionary", "Spanish", "English"), # spa-eng
263+
("Spanish-Portuguese FreeDict Dictionary", "Spanish", "Portuguese"), # spa-por
264+
("Serbian - English FreeDict Dictionary", "Serbian", "English"), # srp-eng
265+
("Swedish-English FreeDict Dictionary", "Swedish", "English"), # swe-eng
266+
("Swahili-English xFried/FreeDict Dictionary", "Swahili", "English"), # swh-eng
267+
("Swahili-Polish SSSP/FreeDict Dictionary", "Swahili", "Polish"), # swh-pol
268+
("Turkish-German FreeDict Dictionary", "Turkish", "German"), # tur-deu
269+
("Turkish-English FreeDict Dictionary", "Turkish", "English"), # tur-eng
270+
("Wolof - French FreeDict dictionary", "Wolof", "French"), # wol-fra
271+
]

0 commit comments

Comments
 (0)