Inflection-72 Improve multi-language handling (#73)

grhoten · web-flow · commit 0d65f00badf2 · 2025-01-27T23:23:30.000-08:00
diff --git a/inflection/src/inflection/analysis/RussianExposableMorphology.cpp b/inflection/src/inflection/analysis/RussianExposableMorphology.cpp
@@ -33,7 +33,7 @@ RussianExposableMorphology::RussianExposableMorphology()
     ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryAdjective,{u"adjective"}));
     ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryAdverb,{u"adverb"}));
     ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryVerb,{u"verb"}));
-    ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryNumber,{u"number"}));
+    ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryNumber,{u"numeral"}));
     ::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryProperNoun,{u"proper-noun"}));
     dictionaryPOSMask = dictionaryNoun | dictionaryAdjective | dictionaryNumber | dictionaryProperNoun | dictionaryVerb | dictionaryAdverb;
 
diff --git a/inflection/tools/dictionary-parser/ParseWiktionary b/inflection/tools/dictionary-parser/ParseWiktionary
diff --git a/inflection/tools/dictionary-parser/README.md b/inflection/tools/dictionary-parser/README.md
@@ -9,12 +9,12 @@ These tools generate files that describes the grammatical properties of words fr
 ## Usage for Wikidata
 
 1) Download a copy of Wikidata from https://dumps.wikimedia.org/wikidatawiki/entities/ (e.g. https://dumps.wikimedia.org/wikidatawiki/entities/20250115/wikidata-20250115-lexemes.json.bz2)
-2) Optionally decompress the file. This tool runs faster if it's decompressed.
+2) Optionally decompress the file. If this tool will be run more than once, decompressing the file will make the tool run faster.
 3) Check what options were used for your language. They are at the end of the generated dictionary_XX.lst, look for "generated with options"
    - Run `grep 'generated with options' ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
    - If the above command prints nothing, no additional options were used to generate the file, or it was generated with a different tool.
    - To see what options are available run `./ParseWikidata`
-   - At minimum use the `--locale` option to specify the ISO-639 code for the language to extract.
+   - At minimum use the `--language` option to specify the ISO-639 code for the language to extract.
 4) Run
 ```
     ./ParseWikidata <THE_OPTIONS_FROM_STEP_3> \
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ClaimsJsonDeserializer.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ClaimsJsonDeserializer.java
@@ -10,6 +10,7 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
@@ -18,9 +19,30 @@
  * Converts the claims in LexemeForm from Wikidata in the JSON structure to usable Java classes.
  */
 public class ClaimsJsonDeserializer extends JsonDeserializer<Map<String, List<String>>> {
+
+    /**
+     * Returns true when the enclosing Lexeme already has an empty lemma map (language mismatch), so deserializing is pointless.
+     */
+    private static boolean isIgnorable(JsonParser jsonParser) {
+        var parent = jsonParser.getParsingContext().getParent();
+        while (parent != null) {
+            if (parent.getCurrentValue() instanceof Lexeme lexeme) {
+                return lexeme.lemmas != null && lexeme.lemmas.isEmpty(); // lemmas not set yet means unknown, so keep the data
+            }
+            parent = parent.getParent();
+        }
+        return false;
+    }
+
     @Override
     public Map<String, List<String>> deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException {
         Map<String, List<String>> result = null;
+        if (isIgnorable(jsonParser)) {
+            // If there are no lemmas matching the current language, then don't bother extracting this data.
+            jsonParser.skipChildren();
+            return Collections.emptyMap();
+        }
+
         JsonNode node = jsonParser.getCodec().readTree(jsonParser);
         var nodeItr = node.fields();
         while (nodeItr.hasNext()) {
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java
@@ -335,6 +335,7 @@ public String toString() {
     enum Aspect {
         HABITUAL,
         IMPERFECTIVE,
+        IMPERFECT,
         PERFECT, // Not to be confused with Perfective aspect. See https://en.wikipedia.org/wiki/Perfect_(grammar)
         PERFECTIVE,
         PLUPERFECT,
@@ -568,6 +569,7 @@ public String toString() {
         TYPEMAP.put("Q12262560", EnumSet.of(PartOfSpeech.CONJUNCTION)); // adversative linker
         TYPEMAP.put("Q12564489", EnumSet.of(PartOfSpeech.CONJUNCTION)); // disjunctive conjunction, which we don't need to differentiate.
         TYPEMAP.put("Q55965516", EnumSet.of(PartOfSpeech.CONJUNCTION)); // alias of disjunctive conjunction, which we don't need to differentiate.
+        TYPEMAP.put("Q11655558", EnumSet.of(PartOfSpeech.CONJUNCTION)); // subordinating conjunction
         TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER));
         TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner
         TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION));
@@ -712,6 +714,7 @@ public String toString() {
         TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE)));
         TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE)));
         TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE)));
+        TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle
         TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense
         TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future
         TYPEMAP.put("Q1994301", EnumSet.of(Tense.PAST)); // past tense
@@ -797,7 +800,7 @@ public String toString() {
         TYPEMAP.put("Q371427", EnumSet.of(Aspect.IMPERFECTIVE));
         TYPEMAP.put("Q54556033", EnumSet.of(Aspect.IMPERFECTIVE)); // imperfective verb
         TYPEMAP.put("Q2898727", EnumSet.of(Aspect.IMPERFECTIVE)); // imperfective form for Japanese verb
-//        TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
+        TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
         TYPEMAP.put("Q7240943", new HashSet<>(Arrays.asList(Tense.PRESENT, Aspect.IMPERFECTIVE))); // present continuous/present imperfect
         TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Tense.PAST, Aspect.IMPERFECTIVE))); // past continuous/present imperfect
         TYPEMAP.put("Q623742", EnumSet.of(Aspect.PLUPERFECT));
@@ -831,6 +834,9 @@ public String toString() {
         TYPEMAP.put("rieul-end", EnumSet.of(Sound.RIEUL_END));
         TYPEMAP.put("vowel-end", EnumSet.of(Sound.VOWEL_END));
         TYPEMAP.put("vowel-start", EnumSet.of(Sound.VOWEL_START));
+//         TYPEMAP.put("Q650250", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // elision, omission of one or more sounds in a word or phrase
+//         TYPEMAP.put("Q114092330", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
+//         TYPEMAP.put("Q112154", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // apocope, loss of word-final sounds
         TYPEMAP.put("Q101252532", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // where consonant is unmutated
         TYPEMAP.put("Q56648699", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // soft mutation, where consonant becomes more sonorous
         TYPEMAP.put("Q117262361", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // pausal form, form of a word realised in hiatus between prosodic units
@@ -931,7 +937,7 @@ public String toString() {
         TYPEMAP.put("Q1122269", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // collocation, frequent occurrence of words next to each other
         TYPEMAP.put("Q18915698", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // established collocation
         TYPEMAP.put("Q1428334", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // paradigm, an inflection table instead of actual words
-        TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // chemical symbol
+        TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // chemical symbol
         TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // symbol
         TYPEMAP.put("Q308229", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // currency sign
         TYPEMAP.put("Q31963", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // emoticon
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Lexeme.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Lexeme.java
@@ -15,6 +15,7 @@
 @JsonIgnoreProperties(ignoreUnknown = true)
 public class Lexeme {
     public String id;
+    @JsonDeserialize(using = LexemesJsonDeserializer.class)
     public Map<String, LexemeRepresentation> lemmas;
     public String lexicalCategory;
     @JsonDeserialize(using = ClaimsJsonDeserializer.class)
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/LexemesJsonDeserializer.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/LexemesJsonDeserializer.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2025 Unicode Incorporated and others. All rights reserved.
+ */
+package org.unicode.wikidata;
+
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Converts the map of language identifiers to lexeme representations from Wikidata in the JSON structure to usable Java classes.
+ * Only entries whose language key is accepted by {@link #isContained(String)} are decoded; all others are skipped.
+ */
+public class LexemesJsonDeserializer extends JsonDeserializer<Map<String, LexemeRepresentation>> {
+    /** Desired language codes; null until {@link #setLanguage(List)} has been called. */
+    private static String[] languages;
+
+    /**
+     * Sets the language codes whose lexeme representations should be decoded.
+     *
+     * @param languagesArray the desired language codes; copied, so later changes to the list have no effect.
+     */
+    public static void setLanguage(List<String> languagesArray) {
+        languages = languagesArray.toArray(new String[0]);
+    }
+
+    /**
+     * Is the base desired language contained in the variant language?
+     * The desired language must match the current language completely, or up to a '-' subtag boundary.
+     * <pre>
+     * desired, current
+     * en, en true
+     * en, en-us true
+     * en-us, en-us true
+     * ko, kok false
+     * kok, ko false
+     * en-us, en-gb false
+     * </pre>
+     *
+     * @param currentLanguage the language key found in the data.
+     * @return true when any desired language equals currentLanguage or is a prefix of it ending at a '-'.
+     * @throws IllegalStateException if {@link #setLanguage(List)} has not been called yet.
+     */
+    public static boolean isContained(String currentLanguage) {
+        if (languages == null) {
+            throw new IllegalStateException("setLanguage must be called before checking a language");
+        }
+        for (var language : languages) {
+            var desiredLength = language.length();
+            // A plain prefix test is not enough: "ko" must not accept "kok", and "kok" must not accept "ko".
+            if (currentLanguage.startsWith(language)
+                    && (currentLanguage.length() == desiredLength || currentLanguage.charAt(desiredLength) == '-')) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Reads a JSON object keyed by language code and keeps only the values for the desired languages.
+     *
+     * @return the decoded representations, sorted by language key; empty when no key matches.
+     * @throws IOException if parsing fails, a value is not a JSON object, or a language key occurs twice.
+     */
+    @Override
+    public Map<String, LexemeRepresentation> deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException {
+        Map<String, LexemeRepresentation> result = new TreeMap<>();
+        var objectCodec = jsonParser.getCodec();
+
+        while (jsonParser.nextToken() != JsonToken.END_OBJECT) {
+            String languageKey = jsonParser.currentName();
+            if (jsonParser.nextToken() != JsonToken.START_OBJECT) {
+                throw new IOException("Start of LexemeRepresentation not found for " + languageKey);
+            }
+            // Only decode for known languages.
+            if (isContained(languageKey)) {
+                LexemeRepresentation lexemeRepresentation = objectCodec.readValue(jsonParser, LexemeRepresentation.class);
+                if (result.put(languageKey, lexemeRepresentation) != null) {
+                    throw new IOException("Duplicate language " + languageKey);
+                }
+            }
+            else {
+                // Advance past the whole unwanted object without building it.
+                jsonParser.skipChildren();
+            }
+        }
+
+        return result;
+    }
+}
diff --git a/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java b/inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java