Skip to content

Commit 0d65f00

Browse files
authored
Inflection-72 Improve multi-language handling (#73)
1 parent c3b1057 commit 0d65f00

File tree

8 files changed

+192
-81
lines changed

8 files changed

+192
-81
lines changed

inflection/src/inflection/analysis/RussianExposableMorphology.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ RussianExposableMorphology::RussianExposableMorphology()
3333
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryAdjective,{u"adjective"}));
3434
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryAdverb,{u"adverb"}));
3535
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryVerb,{u"verb"}));
36-
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryNumber,{u"number"}));
36+
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryNumber,{u"numeral"}));
3737
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryProperNoun,{u"proper-noun"}));
3838
dictionaryPOSMask = dictionaryNoun | dictionaryAdjective | dictionaryNumber | dictionaryProperNoun | dictionaryVerb | dictionaryAdverb;
3939

inflection/tools/dictionary-parser/ParseWiktionary

Lines changed: 0 additions & 9 deletions
This file was deleted.

inflection/tools/dictionary-parser/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ These tools generate files that describes the grammatical properties of words fr
99
## Usage for Wikidata
1010

1111
1) Download a copy of Wikidata from https://dumps.wikimedia.org/wikidatawiki/entities/ (e.g. https://dumps.wikimedia.org/wikidatawiki/entities/20250115/wikidata-20250115-lexemes.json.bz2)
12-
2) Optionally decompress the file. This tool runs faster if it's decompressed.
12+
2) Optionally decompress the file. If this tool will be run more than once, decompressing the file will make the tool run faster.
1313
3) Check what options were used for your language. They are at the end of the generated dictionary_XX.lst, look for "generated with options"
1414
- Run `grep 'generated with options' ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
1515
- If the above command prints nothing, no additional options were used to generate the file, or it was generated with a different tool.
1616
- To see what options are available run `./ParseWikidata`
17-
- At minimum use the `--locale` option to specify the ISO-639 code for the language to extract.
17+
- At minimum use the `--language` option to specify the ISO-639 code for the language to extract.
1818
4) Run
1919
```
2020
./ParseWikidata <THE_OPTIONS_FROM_STEP_3> \

inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ClaimsJsonDeserializer.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import java.io.IOException;
1212
import java.util.ArrayList;
13+
import java.util.Collections;
1314
import java.util.List;
1415
import java.util.Map;
1516
import java.util.TreeMap;
@@ -18,9 +19,30 @@
1819
* Converts the claims in LexemeForm from Wikidata in the JSON structure to usable Java classes.
1920
*/
2021
public class ClaimsJsonDeserializer extends JsonDeserializer<Map<String, List<String>>> {
22+
23+
/**
24+
* If there are no lemmas due to a language mismatch, there is no point in deserializing.
25+
*/
26+
private static boolean isIgnorable(JsonParser jsonParser) {
27+
var parent = jsonParser.getParsingContext().getParent();
28+
while (parent != null) {
29+
if (parent.getCurrentValue() instanceof Lexeme lexeme) {
30+
return lexeme.lemmas.isEmpty();
31+
}
32+
parent = parent.getParent();
33+
}
34+
return false;
35+
}
36+
2137
@Override
2238
public Map<String, List<String>> deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException {
2339
Map<String, List<String>> result = null;
40+
if (isIgnorable(jsonParser)) {
41+
// If there are no lemmas matching the current language, then don't bother extracting this data.
42+
jsonParser.skipChildren();
43+
return Collections.emptyMap();
44+
}
45+
2446
JsonNode node = jsonParser.getCodec().readTree(jsonParser);
2547
var nodeItr = node.fields();
2648
while (nodeItr.hasNext()) {

inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ public String toString() {
335335
enum Aspect {
336336
HABITUAL,
337337
IMPERFECTIVE,
338+
IMPERFECT,
338339
PERFECT, // Not to be confused with Perfective aspect. See https://en.wikipedia.org/wiki/Perfect_(grammar)
339340
PERFECTIVE,
340341
PLUPERFECT,
@@ -568,6 +569,7 @@ public String toString() {
568569
TYPEMAP.put("Q12262560", EnumSet.of(PartOfSpeech.CONJUNCTION)); // adversative linker
569570
TYPEMAP.put("Q12564489", EnumSet.of(PartOfSpeech.CONJUNCTION)); // disjunctive conjunction, which we don't need to differentiate.
570571
TYPEMAP.put("Q55965516", EnumSet.of(PartOfSpeech.CONJUNCTION)); // alias of disjunctive conjunction, which we don't need to differentiate.
572+
TYPEMAP.put("Q11655558", EnumSet.of(PartOfSpeech.CONJUNCTION)); // subordinating conjunction
571573
TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER));
572574
TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner
573575
TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION));
@@ -712,6 +714,7 @@ public String toString() {
712714
TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE)));
713715
TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE)));
714716
TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE)));
717+
TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle
715718
TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense
716719
TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future
717720
TYPEMAP.put("Q1994301", EnumSet.of(Tense.PAST)); // past tense
@@ -797,7 +800,7 @@ public String toString() {
797800
TYPEMAP.put("Q371427", EnumSet.of(Aspect.IMPERFECTIVE));
798801
TYPEMAP.put("Q54556033", EnumSet.of(Aspect.IMPERFECTIVE)); // imperfective verb
799802
TYPEMAP.put("Q2898727", EnumSet.of(Aspect.IMPERFECTIVE)); // imperfective form for Japanese verb
800-
// TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
803+
TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
801804
TYPEMAP.put("Q7240943", new HashSet<>(Arrays.asList(Tense.PRESENT, Aspect.IMPERFECTIVE))); // present continuous/present imperfect
802805
TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Tense.PAST, Aspect.IMPERFECTIVE))); // past continuous/past imperfect
803806
TYPEMAP.put("Q623742", EnumSet.of(Aspect.PLUPERFECT));
@@ -831,6 +834,9 @@ public String toString() {
831834
TYPEMAP.put("rieul-end", EnumSet.of(Sound.RIEUL_END));
832835
TYPEMAP.put("vowel-end", EnumSet.of(Sound.VOWEL_END));
833836
TYPEMAP.put("vowel-start", EnumSet.of(Sound.VOWEL_START));
837+
// TYPEMAP.put("Q650250", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // elision, omission of one or more sounds in a word or phrase
838+
// TYPEMAP.put("Q114092330", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
839+
// TYPEMAP.put("Q112154", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // apocope, loss of word-final sounds
834840
TYPEMAP.put("Q101252532", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // where consonant is unmutated
835841
TYPEMAP.put("Q56648699", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // soft mutation, where consonant becomes more sonorous
836842
TYPEMAP.put("Q117262361", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // pausal form, form of a word realised in hiatus between prosodic units
@@ -931,7 +937,7 @@ public String toString() {
931937
TYPEMAP.put("Q1122269", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // collocation, frequent occurrence of words next to each other
932938
TYPEMAP.put("Q18915698", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // established collocation
933939
TYPEMAP.put("Q1428334", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // paradigm, an inflection table instead of actual words
934-
TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // chemical symbol
940+
TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // chemical symbol
935941
TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // symbol
936942
TYPEMAP.put("Q308229", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // currency sign
937943
TYPEMAP.put("Q31963", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // emoticon

inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Lexeme.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
@JsonIgnoreProperties(ignoreUnknown = true)
1616
public class Lexeme {
1717
public String id;
18+
@JsonDeserialize(using = LexemesJsonDeserializer.class)
1819
public Map<String, LexemeRepresentation> lemmas;
1920
public String lexicalCategory;
2021
@JsonDeserialize(using = ClaimsJsonDeserializer.class)
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
3+
*/
4+
package org.unicode.wikidata;
5+
6+
import com.fasterxml.jackson.core.JsonParser;
7+
import com.fasterxml.jackson.core.JsonToken;
8+
import com.fasterxml.jackson.databind.DeserializationContext;
9+
import com.fasterxml.jackson.databind.JsonDeserializer;
10+
11+
import java.io.IOException;
12+
import java.util.List;
13+
import java.util.Map;
14+
import java.util.TreeMap;
15+
16+
/**
17+
* Converts the map of language identifiers to lexeme representations from Wikidata in the JSON structure to usable Java classes.
18+
*/
19+
public class LexemesJsonDeserializer extends JsonDeserializer<Map<String, LexemeRepresentation>> {
20+
private static String[] languages;
21+
22+
public static void setLanguage(List<String> languagesArray) {
23+
languages = languagesArray.toArray(new String[0]);
24+
}
25+
26+
/**
27+
* Is the base desired language contained in the variant language?
28+
* en, en true
29+
* en, en-us true
30+
* en-us, en-us true
31+
* ko, kok false
32+
*/
33+
public static boolean isContained(String currentLanguage) {
34+
var dashIdx = currentLanguage.indexOf('-');
35+
if (dashIdx < 0) {
36+
dashIdx = currentLanguage.length();
37+
}
38+
for (var language : languages) {
39+
if (language.regionMatches(0, currentLanguage, 0, dashIdx)) {
40+
return true;
41+
}
42+
}
43+
return false;
44+
}
45+
46+
@Override
47+
public Map<String, LexemeRepresentation> deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException {
48+
Map<String, LexemeRepresentation> result = new TreeMap<>();
49+
var objectCodec = jsonParser.getCodec();
50+
51+
while (jsonParser.nextToken() != JsonToken.END_OBJECT) {
52+
String languageKey = jsonParser.currentName();
53+
if (jsonParser.nextToken() != JsonToken.START_OBJECT) {
54+
throw new IOException("Start of LexemeRepresentation not found for " + languageKey);
55+
}
56+
// Only decode for known languages.
57+
if (isContained(languageKey)) {
58+
LexemeRepresentation lexemeRepresentation = objectCodec.readValue(jsonParser, LexemeRepresentation.class);
59+
if (result.put(languageKey, lexemeRepresentation) != null) {
60+
throw new IOException("Duplicate language " + languageKey);
61+
}
62+
}
63+
else {
64+
jsonParser.skipChildren();
65+
}
66+
}
67+
68+
return result;
69+
}
70+
}

0 commit comments

Comments
 (0)