Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ RussianExposableMorphology::RussianExposableMorphology()
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryAdjective,{u"adjective"}));
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryAdverb,{u"adverb"}));
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryVerb,{u"verb"}));
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryNumber,{u"number"}));
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryNumber,{u"numeral"}));
::inflection::util::Validate::notNull(dictionary.getBinaryProperties(&dictionaryProperNoun,{u"proper-noun"}));
dictionaryPOSMask = dictionaryNoun | dictionaryAdjective | dictionaryNumber | dictionaryProperNoun | dictionaryVerb | dictionaryAdverb;

Expand Down
9 changes: 0 additions & 9 deletions inflection/tools/dictionary-parser/ParseWiktionary

This file was deleted.

4 changes: 2 additions & 2 deletions inflection/tools/dictionary-parser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ These tools generate files that describes the grammatical properties of words fr
## Usage for Wikidata

1) Download a copy of Wikidata from https://dumps.wikimedia.org/wikidatawiki/entities/ (e.g. https://dumps.wikimedia.org/wikidatawiki/entities/20250115/wikidata-20250115-lexemes.json.bz2)
2) Optionally decompress the file. This tool runs faster if it's decompressed.
2) Optionally decompress the file. If this tool will be run more than once, decompressing the file will make the tool run faster.
3) Check what options were used for your language. They are listed at the end of the generated dictionary_XX.lst; look for "generated with options".
- Run `grep 'generated with options' ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
- If the above command prints nothing, no additional options were used to generate the file, or it was generated with a different tool.
- To see what options are available run `./ParseWikidata`
- At minimum use the `--locale` option to specify the ISO-639 code for the language to extract.
- At minimum use the `--language` option to specify the ISO-639 code for the language to extract.
4) Run
```
./ParseWikidata <THE_OPTIONS_FROM_STEP_3> \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
Expand All @@ -18,9 +19,30 @@
* Converts the claims in LexemeForm from Wikidata in the JSON structure to usable Java classes.
*/
public class ClaimsJsonDeserializer extends JsonDeserializer<Map<String, List<String>>> {

/**
 * Decides whether the value about to be deserialized can be skipped.
 * If there are no lemmas due to a language mismatch, there is no point in deserializing.
 *
 * <p>The parsing context is walked upwards until a context whose current value is a
 * {@link Lexeme} is found. The decision is then based on that lexeme's {@code lemmas} map.
 *
 * @param jsonParser the parser positioned at the value being deserialized.
 * @return {@code true} when an enclosing {@link Lexeme} was found and its lemmas map is present but empty;
 *         {@code false} when no enclosing {@link Lexeme} exists or its lemmas map is not populated.
 */
private static boolean isIgnorable(JsonParser jsonParser) {
    for (var context = jsonParser.getParsingContext().getParent(); context != null; context = context.getParent()) {
        if (context.getCurrentValue() instanceof Lexeme lexeme) {
            // A null map means the lemmas are not populated on this lexeme (presumably because they
            // have not been read yet -- verify against the input field order). Only an explicitly
            // empty map tells us that nothing matched, so a null map must not be treated as ignorable
            // and must not trigger a NullPointerException.
            return lexeme.lemmas != null && lexeme.lemmas.isEmpty();
        }
    }
    return false;
}

@Override
public Map<String, List<String>> deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException {
Map<String, List<String>> result = null;
if (isIgnorable(jsonParser)) {
// If there are no lemmas matching the current language, then don't bother extracting this data.
jsonParser.skipChildren();
return Collections.emptyMap();
}

JsonNode node = jsonParser.getCodec().readTree(jsonParser);
var nodeItr = node.fields();
while (nodeItr.hasNext()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ public String toString() {
enum Aspect {
HABITUAL,
IMPERFECTIVE,
IMPERFECT,
PERFECT, // Not to be confused with Perfective aspect. See https://en.wikipedia.org/wiki/Perfect_(grammar)
PERFECTIVE,
PLUPERFECT,
Expand Down Expand Up @@ -568,6 +569,7 @@ public String toString() {
TYPEMAP.put("Q12262560", EnumSet.of(PartOfSpeech.CONJUNCTION)); // adversative linker
TYPEMAP.put("Q12564489", EnumSet.of(PartOfSpeech.CONJUNCTION)); // disjunctive conjunction, which we don't need to differentiate.
TYPEMAP.put("Q55965516", EnumSet.of(PartOfSpeech.CONJUNCTION)); // alias of disjunctive conjunction, which we don't need to differentiate.
TYPEMAP.put("Q11655558", EnumSet.of(PartOfSpeech.CONJUNCTION)); // subordinating conjunction
TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER));
TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner
TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION));
Expand Down Expand Up @@ -712,6 +714,7 @@ public String toString() {
TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE)));
TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE)));
TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE)));
TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle
TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense
TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future
TYPEMAP.put("Q1994301", EnumSet.of(Tense.PAST)); // past tense
Expand Down Expand Up @@ -797,7 +800,7 @@ public String toString() {
TYPEMAP.put("Q371427", EnumSet.of(Aspect.IMPERFECTIVE));
TYPEMAP.put("Q54556033", EnumSet.of(Aspect.IMPERFECTIVE)); // imperfective verb
TYPEMAP.put("Q2898727", EnumSet.of(Aspect.IMPERFECTIVE)); // imperfective form for Japanese verb
// TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
TYPEMAP.put("Q108524486", EnumSet.of(Aspect.IMPERFECT));
TYPEMAP.put("Q7240943", new HashSet<>(Arrays.asList(Tense.PRESENT, Aspect.IMPERFECTIVE))); // present continuous/present imperfect
TYPEMAP.put("Q56650537", new HashSet<>(Arrays.asList(Tense.PAST, Aspect.IMPERFECTIVE))); // past continuous/present imperfect
TYPEMAP.put("Q623742", EnumSet.of(Aspect.PLUPERFECT));
Expand Down Expand Up @@ -831,6 +834,9 @@ public String toString() {
TYPEMAP.put("rieul-end", EnumSet.of(Sound.RIEUL_END));
TYPEMAP.put("vowel-end", EnumSet.of(Sound.VOWEL_END));
TYPEMAP.put("vowel-start", EnumSet.of(Sound.VOWEL_START));
// TYPEMAP.put("Q650250", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // elision, omission of one or more sounds in a word or phrase
// TYPEMAP.put("Q114092330", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
// TYPEMAP.put("Q112154", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // apocope, loss of word-final sounds
TYPEMAP.put("Q101252532", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // where consonant is unmutated
TYPEMAP.put("Q56648699", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // soft mutation, where consonant becomes more sonorous
TYPEMAP.put("Q117262361", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // pausal form, form of a word realised in hiatus between prosodic units
Expand Down Expand Up @@ -931,7 +937,7 @@ public String toString() {
TYPEMAP.put("Q1122269", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // collocation, frequent occurrence of words next to each other
TYPEMAP.put("Q18915698", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // established collocation
TYPEMAP.put("Q1428334", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // paradigm, an inflection table instead of actual words
TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // chemical symbol
TYPEMAP.put("Q102500", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // chemical symbol
TYPEMAP.put("Q80071", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // symbol
TYPEMAP.put("Q308229", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // currency sign
TYPEMAP.put("Q31963", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // emoticon
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
@JsonIgnoreProperties(ignoreUnknown = true)
public class Lexeme {
public String id;
@JsonDeserialize(using = LexemesJsonDeserializer.class)
public Map<String, LexemeRepresentation> lemmas;
public String lexicalCategory;
@JsonDeserialize(using = ClaimsJsonDeserializer.class)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
*/
package org.unicode.wikidata;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Converts the map of language identifiers to lexeme representations from Wikidata in the JSON structure to usable Java classes.
 *
 * <p>Only entries whose language key matches one of the languages registered through
 * {@link #setLanguage(List)} are decoded. All other entries are skipped without being materialized.
 */
public class LexemesJsonDeserializer extends JsonDeserializer<Map<String, LexemeRepresentation>> {
    /** The desired languages. This must be set with {@link #setLanguage(List)} before any deserialization. */
    private static String[] languages;

    /**
     * Registers the languages whose lexeme representations should be decoded.
     *
     * @param languagesArray the desired language identifiers, for example {@code en} or {@code en-us}.
     */
    public static void setLanguage(List<String> languagesArray) {
        languages = languagesArray.toArray(new String[0]);
    }

    /**
     * Is the base desired language contained in the variant language?
     * Only the part before the first dash of each language identifier is compared.
     * <pre>
     * desired, current
     * en, en       true
     * en, en-us    true
     * en-us, en-us true
     * ko, kok      false
     * kok, ko      false
     * </pre>
     *
     * @param currentLanguage the language key found in the data being parsed.
     * @return {@code true} when a registered language has the same base language as {@code currentLanguage}.
     */
    public static boolean isContained(String currentLanguage) {
        var dashIdx = currentLanguage.indexOf('-');
        var baseLength = dashIdx < 0 ? currentLanguage.length() : dashIdx;
        for (var language : languages) {
            if (!language.regionMatches(0, currentLanguage, 0, baseLength)) {
                continue;
            }
            // A plain prefix match is not enough. The desired language must also end its base
            // subtag at the same position, otherwise "kok" would accept the key "ko".
            // regionMatches succeeded, so language.length() >= baseLength and charAt is in range.
            if (language.length() == baseLength || language.charAt(baseLength) == '-') {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the JSON object of language keys to lexeme representations, and keeps only the desired languages.
     *
     * @param jsonParser the parser positioned on the object to read.
     * @param deserializationContext unused.
     * @return the decoded representations, sorted by language key. This may be empty when no language matched.
     * @throws IOException when a value is not a JSON object, or when a desired language key appears twice.
     */
    @Override
    public Map<String, LexemeRepresentation> deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException {
        Map<String, LexemeRepresentation> result = new TreeMap<>();
        var objectCodec = jsonParser.getCodec();

        while (jsonParser.nextToken() != JsonToken.END_OBJECT) {
            String languageKey = jsonParser.currentName();
            if (jsonParser.nextToken() != JsonToken.START_OBJECT) {
                throw new IOException("Start of LexemeRepresentation not found for " + languageKey);
            }
            if (!isContained(languageKey)) {
                // Unknown language. Step over the whole object without building anything.
                jsonParser.skipChildren();
                continue;
            }
            // Only decode for known languages.
            LexemeRepresentation lexemeRepresentation = objectCodec.readValue(jsonParser, LexemeRepresentation.class);
            if (result.put(languageKey, lexemeRepresentation) != null) {
                throw new IOException("Duplicate language " + languageKey);
            }
        }

        return result;
    }
}
Loading