Skip to content

Commit 6d5ac2b

Browse files
committed
fix detectLangsFromName and add a test set, including tests from PR #700
1 parent 6f318d7 commit 6d5ac2b

3 files changed

Lines changed: 213 additions & 1 deletion

File tree

pyglossary/glossary_info.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,7 @@ def detectLangsFromName(self) -> None:
189189
name = self._info.get(c_name)
190190
if not name:
191191
return
192+
name = name.lower()
192193
if self._info.get(c_sourceLang):
193194
return
194195

@@ -202,7 +203,9 @@ def checkPart(part: str) -> None:
202203
continue
203204
langNames.append(lang.name)
204205

205-
for part in re.split("-|_| to ", name):
206+
# name = name.replace("_to_", " to ")
207+
name = name.replace("_", " ")
208+
for part in re.split("-| to ", name):
206209
# print(f"{part = }")
207210
checkPart(part)
208211
if len(langNames) >= 2: # noqa: PLR2004

tests/glossary_v2_test.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from pyglossary.glossary_v2 import ConvertArgs, Glossary
2424
from pyglossary.os_utils import rmtree
2525
from pyglossary.text_utils import crc32hex
26+
from tests.name_langs_test_data import nameLangsTestData
2627

2728
if TYPE_CHECKING:
2829
from collections.abc import Callable
@@ -539,6 +540,17 @@ def test_lang_getObj_target(self):
539540
if glos.targetLang is not None:
540541
self.assertEqual(glos.targetLang.name, "Malay")
541542

543+
def test_lang_detect_dataset_1(self):
    """Run detectLangsFromName over every case in nameLangsTestData.

    Each case is a ``(name, sourceLang, targetLang)`` tuple; a ``None``
    language is compared against the empty string.
    """
    for name, sourceLang, targetLang in nameLangsTestData:
        # subTest keeps the loop going after a failure, so a single bad
        # name does not hide the results of all the following cases.
        with self.subTest(name=name):
            glos = self.glos = Glossary()
            glos.setInfo("name", name)
            glos.detectLangsFromName()
            self.assertEqual(
                (glos.sourceLangName, glos.targetLangName),
                (sourceLang or "", targetLang or ""),
                f"{name=}",
            )
553+
542554
def test_lang_detect_1(self):
543555
glos = self.glos = Glossary()
544556
glos.setInfo("name", "en-fa")
@@ -584,6 +596,27 @@ def test_lang_detect_5(self):
584596
("English", "German"),
585597
)
586598

599+
def test_lang_detect_6(self):
    """Pin the detection result for the name "Church Slavonic-deu.index"."""
    glos = self.glos = Glossary()
    glos.setInfo("name", "Church Slavonic-deu.index")
    glos.detectLangsFromName()
    # NOTE(review): presumably the ideal result would be
    # ("Church Slavonic", "German"); the assertion below pins the
    # empty pair instead -- confirm whether that is intended.
    self.assertEqual(
        (glos.sourceLangName, glos.targetLangName),
        ("", ""),
    )
609+
610+
# FIXME: should be either ("", "") or ("Fijian", "German")
def test_lang_detect_7(self):
    """Pin the detection result for "Na vosa vaka-Viti-deu.index"."""
    self.glos = Glossary()
    self.glos.setInfo("name", "Na vosa vaka-Viti-deu.index")
    self.glos.detectLangsFromName()
    detected = (self.glos.sourceLangName, self.glos.targetLangName)
    self.assertEqual(detected, ("Nauruan", "German"))
619+
587620
def convert_to_txtZip(
588621
self,
589622
fname, # input file with extension

tests/name_langs_test_data.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
# Fixture for test_lang_detect_dataset_1 (tests/glossary_v2_test.py).
# Each entry is a 3-tuple:
#   (glossary name, expected source language name, expected target language name)
# A None language is compared against "" by that test.
# NOTE(review): the commented-out entries at the top are presumably cases
# that are not handled yet -- confirm before re-enabling them.
nameLangsTestData = [
2+
# ("Castellano-Catalán", "Spanish", "Catalan"),
3+
# ("Persisch-Deutsch; Deutsch-Persisch (Alefbâye 2om)", "German", "Persian"),
4+
# ("Castellano >Turko Diccionario", "Spanish", "Turkish"),
5+
# ("MB_Dictionary Spanish_to_Persian", "Spanish", "Finnish"),
6+
# (
7+
# "French (and/or English) to Pârsi (Persian) epistemological Dict. (Latin chars)",
8+
# "French",
9+
# "Persian",
10+
# ),
11+
# ("Farajbeik Farsi ( Windows Farsi )", None, None),
12+
# ("Computer And IT Dictionary for Persian v4.01", None, None),
13+
("LeXiCoN castelán-galego", None, None),
14+
("Aragonés-Castellán", None, None),
15+
("Arab2English", None, None),
16+
("The Romanization of Korean", None, None),
17+
("Deutschland auf Google Maps", None, None),
18+
("Glossary of Latin-Genus-Names", None, None),
19+
("Dictionario Interlingua - Nederlandese", None, None),
20+
("Papiamento - Dutch", None, None),
21+
("Loghatname(Alef)", None, None),
22+
("Indonesia-Nederlands", "Indonesian", "Dutch"),
23+
("Lexin Svensk-Spanskt Lexikon", "Swedish", "Spanish"),
24+
("Azhdari :: German To Persian Glossary 1.1", "German", "Persian"),
25+
("Latvian-Russian Dictionary", "Latvian", "Russian"),
26+
("AACS Mongolian-English", "Mongolian", "English"),
27+
("Urdu to English Gloassry", "Urdu", "English"),
28+
("ADO'S WOORDENBOEK TURKS-NEDERLANDS", "Turkish", "Dutch"),
29+
("Babylon Turkish-English", "Turkish", "English"),
30+
("German to Persian", "German", "Persian"),
31+
("deutsch-spanisch", "German", "Spanish"),
32+
("Schiffahrtsausdrücke Deutsch - Holländisch", "German", "Dutch"),
33+
("ADO's Deutsch-Niederländisch", "German", "Dutch"),
34+
("Elif - German / English Tourist Dic.", "German", "English"),
35+
("Babylon German-English", "German", "English"),
36+
("WinCept Glass Dictionary (GER>ENG)", "German", "English"),
37+
("Babylon German-English", "German", "English"),
38+
("technical terms German-English", "German", "English"),
39+
("Runasimi (Quechua) - Español", "Quechua", "Quechua"),
40+
("Persisch-Deutsch (Alefbā-ye 2om, 2. persisches Alphabet)", "Persian", "German"),
41+
("Azhdari ::: Persian To German Glossary version 1.1", "Persian", "German"),
42+
("Farsi to arabic", "Persian", "Arabic"),
43+
("Persian Italian Glossary", "Persian", "Italian"),
44+
("Ourstat - Farsi to English Dictionary", "Persian", "English"),
45+
("HmT - Persian to English Glossary", "Persian", "English"),
46+
("Arianpour Persian-English (OpenDictionary)", "Persian", "English"),
47+
("Malay to English", "Malay", "English"),
48+
("Korean-English Dictionary", "Korean", "English"),
49+
("Babylon Korean-English", "Korean", "English"),
50+
("Babylon Chinese(S)-English", "Chinese", "English"),
51+
("euskera-español", "Basque", "Spanish"),
52+
("Babylon Japanese-English", "Japanese", "English"),
53+
("Animal names in Latin and English", "Latin", "English"),
54+
("Dictionary Portuguese - Dutch", "Portuguese", "Dutch"),
55+
("Finnish To farsi", "Finnish", "Persian"),
56+
("Català-Castellà", "Catalan", "Spanish"),
57+
("Esp-Deu Wörterbuch", "Spanish", "German"),
58+
("ADO's SPANISCH-DEUTSCH", "Spanish", "German"),
59+
("Spanish To Farsi", "Spanish", "Persian"),
60+
("JM Spanish-Danish Dictionary", "Spanish", "Danish"),
61+
("Spa-Fin", "Spanish", "Finnish"),
62+
("Lunfardo (Argentina)", None, None),
63+
("Medciclopedia", None, None),
64+
("Spanish-Bulgarian", "Spanish", "Bulgarian"),
65+
("spanis learner's dictionary", None, None),
66+
("Babylon Spanish-English", "Spanish", "English"),
67+
(
68+
"A Spanish-English Dictionary (Granada University, Spain), 14.4",
69+
"Spanish",
70+
"English",
71+
),
72+
("Babylon Spanish-English", "Spanish", "English"),
73+
("Spanish-English Online Dictionaries", "Spanish", "English"),
74+
("Arabic to Farsi", "Arabic", "Persian"),
75+
("ADO's FRENCH-GERMAN", "French", "German"),
76+
("French to Farsi", "French", "Persian"),
77+
("MB_Dictionary French_to_Persian", "French", "Persian"),
78+
("French-Chinese GBK", "French", "Chinese"),
79+
("French-Chinese", "French", "Chinese"),
80+
("French-Bulgarian", "French", "Bulgarian"),
81+
("Babylon French-English", "French", "English"),
82+
("Nederlands-Indonesia", "Dutch", "Indonesian"),
83+
("Nederlands-Duits", "Dutch", "German"),
84+
("Dutch - Papiamento", None, None),
85+
("Nederlands - Kroatisch woordenboek", "Dutch", "Croatian"),
86+
("Woordenboek Dutch - Portugees", "Dutch", "Portuguese"),
87+
("Néerlandais-Français", "Dutch", "French"),
88+
("Bahasa Indonesia-Nederlands Adaptasi", "Indonesian", "Dutch"),
89+
("Nederlands-Bahasa Indonesia Adaptasi", "Dutch", "Indonesian"),
90+
("Dutch-English Online Dictionay", "Dutch", "English"),
91+
("Dutch_English 22000", "Dutch", "English"),
92+
("Babylon Dutch-English", "Dutch", "English"),
93+
("Russian-Latvian Dictionary", "Russian", "Latvian"),
94+
("Russian-Turkish Dictionary", "Russian", "Turkish"),
95+
("Russisch-Deutsch Woerterbuch", "Russian", "German"),
96+
("MHM Russian > Persian Dictionary", "Russian", "Persian"),
97+
("Dahl's Russian Dictionary", None, None),
98+
("Great Encyclopedic Glossary", None, None),
99+
("Efremova (Russian Explanatory Dictionary)", None, None),
100+
("Babylon Russian-English", "Russian", "English"),
101+
("RUSSIAN LEARNER'S DICTIONARY", None, None),
102+
("Italian to farsi", "Italian", "Persian"),
103+
("Italian Persian glossary", "Italian", "Persian"),
104+
("Italian>Farsi(Persian) Advanced V 3.0", "Italian", "Persian"),
105+
("Italian Persian glossary", "Italian", "Persian"),
106+
("Italiano - Español (GI)", "Italian", "Spanish"),
107+
("Babylon Italian-English", "Italian", "English"),
108+
("English To Urdu Lughat", "English", "Urdu"),
109+
("English-Urdu dictionary", "English", "Urdu"),
110+
("One-Click Engllish-Urdu Dictionary v1.3", None, None),
111+
("Babylon English-Turkish", "English", "Turkish"),
112+
("English-Turkish", "English", "Turkish"),
113+
("Eng-Tur_Computer/Electronics Terms", "English", "Turkish"),
114+
("WinCept Glass Dictionary (ENG>GER)", "English", "German"),
115+
("Acronyms from A - Z", None, None),
116+
("PONS Universelles Wörterbuch Englisch-Deutsch", "English", "German"),
117+
("Babylon English-German", "English", "German"),
118+
("English to Malay", "English", "Malay"),
119+
("hFarsi - advanced version", None, None),
120+
("Mathematics Glossary - Mohammad Reza Majidee", None, None),
121+
("Salaty English-Farsi Dict. (Text ver.)", "English", "Persian"),
122+
("Industrial Engineering Version 2.0", None, None),
123+
("HmT - English to Persian Glossary", "English", "Persian"),
124+
("Morteza English > Farsi", "English", "Persian"),
125+
("Dr. ALLI Malay - Farsi Dictionary", "Malay", "Persian"),
126+
("Geology Science (M.M.Ma'leki)", None, None),
127+
("Arianpour English-Persian (OpenDictionary)", "English", "Persian"),
128+
("Farsi Aviation Dictionary", None, None),
129+
("Persian Computer Encyclopedia", None, None),
130+
("Salaty English-Farsi Dict. (Graphical ver.)", "English", "Persian"),
131+
("Accounting English-Persian", "English", "Persian"),
132+
("Mokhtari Law Dict. (v1.0)", None, None),
133+
("Hafez Poems", None, None),
134+
("PAKcw English-Korean Dictionary", "English", "Korean"),
135+
("Babylon English-Korean", "English", "Korean"),
136+
("English-Spanish Online Dictionaries", "English", "Spanish"),
137+
("Babylon English-Spanish", "English", "Spanish"),
138+
("English_Spanish by Jaime Aguirre", "English", "Spanish"),
139+
(
140+
"An English-Spanish Dictionary (Granada University, Spain), 14.4",
141+
"English",
142+
"Spanish",
143+
),
144+
("INGLESPANISH", None, None),
145+
("Mehran - All about Computer", None, None),
146+
("Wadan English-Arabic Auditing Terms", "English", "Arabic"),
147+
("English 2 Arabic Glossary", "English", "Arabic"),
148+
("English 2 Arabic", "English", "Arabic"),
149+
("Babylon English-French", "English", "French"),
150+
("Babylon English-Dutch", "English", "Dutch"),
151+
("Surinaams-Nederlands Trafasi", None, None),
152+
("Technisch E-NL Woordenboek", None, None),
153+
("English-Dutch Online Dictionary", "English", "Dutch"),
154+
("Nederlands - Surinaams Trafasi", None, None),
155+
("Morteza English > Russian", "English", "Russian"),
156+
("english-russian", "English", "Russian"),
157+
("English/Russian - Mueller24", "English", "Russian"),
158+
("English-Russian Lingvistica'98 dictionary", "English", "Russian"),
159+
("Babylon English-Russian", "English", "Russian"),
160+
("Collins Cobuild 5", None, None),
161+
("Glossary of Computer and Internet Terms", None, None),
162+
("Customs and Excise Glossary", None, None),
163+
("Currency In Each Country", None, None),
164+
("Babylon English-English", "English", "English"),
165+
("9300+ Computer Acronyms", None, None),
166+
("Legal Systems of All Countries", None, None),
167+
("QURAN", None, None),
168+
("Britannica Concise Encyclopedia", None, None),
169+
("Collins English Dictionary", None, None),
170+
("XML Acronym Demystifier", None, None),
171+
("Solar Physics Glossary", None, None),
172+
("English Phonetics", None, None),
173+
("Flavours of Malaysia/ Malaysian delights", None, None),
174+
("Wordset.org", None, None),
175+
("Astronomy and Physics Terms by ExploreSpace.com", None, None),
176+
]

0 commit comments

Comments
 (0)