Skip to content

Commit 6dd3d39

Browse files
authored
Inflection-94 Improve Wikidata coverage in dictionary-parser (#95)
1 parent 56bb2b9 commit 6dd3d39

File tree

4 files changed

+65
-5
lines changed

4 files changed

+65
-5
lines changed

inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/Grammar.java

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ public String toString() {
120120

121121
enum Tense {
122122
PAST,
123+
DISTANT_PAST,
123124
PRESENT,
124125
FUTURE;
125126

@@ -233,7 +234,8 @@ public String toString() {
233234
}
234235

235236
enum FormType {
236-
SHORT_FORM;
237+
SHORT_FORM,
238+
LONG_FORM;
237239

238240
private final String printableValue;
239241
FormType() {
@@ -591,10 +593,12 @@ public String toString() {
591593
TYPEMAP.put("Q576271", EnumSet.of(PartOfSpeech.DETERMINER));
592594
TYPEMAP.put("Q5051", new HashSet<>(Arrays.asList(Case.GENITIVE, PartOfSpeech.DETERMINER))); // possessive determiner
593595
TYPEMAP.put("Q2824480", EnumSet.of(PartOfSpeech.DETERMINER)); // demonstrative adjective, but it's really a determiner.
596+
TYPEMAP.put("Q2112896", EnumSet.of(PartOfSpeech.DETERMINER)); // pronominal adverb
594597
TYPEMAP.put("Q83034", EnumSet.of(PartOfSpeech.INTERJECTION));
595598
TYPEMAP.put("Q2304610", EnumSet.of(PartOfSpeech.INTERROGATIVE));
596599
TYPEMAP.put("Q12021746", EnumSet.of(PartOfSpeech.INTERROGATIVE));
597600
TYPEMAP.put("Q54310231", EnumSet.of(PartOfSpeech.INTERROGATIVE, PartOfSpeech.PRONOUN));
601+
TYPEMAP.put("Q60798917", EnumSet.of(PartOfSpeech.INTERROGATIVE)); // question tag
598602
TYPEMAP.put("Q9788", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // letter
599603
TYPEMAP.put("Q3241972", EnumSet.of(Ignorable.IGNORABLE_INFLECTION)); // character
600604
TYPEMAP.put("Q1084", EnumSet.of(PartOfSpeech.NOUN));
@@ -614,13 +618,19 @@ public String toString() {
614618
TYPEMAP.put("Q10535365", EnumSet.of(PartOfSpeech.PARTICLE)); // infinitive marker, infinitive participle, infinitive particle
615619
TYPEMAP.put("Q113198319", new HashSet<>(Arrays.asList(PartOfSpeech.ADVERB, PartOfSpeech.PARTICLE))); // adverbial particle
616620
TYPEMAP.put("Q115762248", EnumSet.of(PartOfSpeech.PARTICLE)); // vocative particle
621+
TYPEMAP.put("Q115475265", EnumSet.of(PartOfSpeech.PARTICLE)); // honorific particle
617622
TYPEMAP.put("Q113076880", EnumSet.of(PartOfSpeech.ADVERB)); // postpositive adverb
618623
TYPEMAP.put("Q65807752", EnumSet.of(PartOfSpeech.ADVERB)); // demonstrative adverb
624+
TYPEMAP.put("Q117321826", EnumSet.of(PartOfSpeech.ADVERB)); // localiser, similar to an adverb
619625
TYPEMAP.put("Q134316", EnumSet.of(PartOfSpeech.ADPOSITION)); // adposition
620626
TYPEMAP.put("Q161873", EnumSet.of(PartOfSpeech.ADPOSITION)); // postposition
621627
TYPEMAP.put("Q4833830", EnumSet.of(PartOfSpeech.ADPOSITION)); // preposition
622628
TYPEMAP.put("Q36224", EnumSet.of(PartOfSpeech.PRONOUN));
623629
TYPEMAP.put("Q2006180", EnumSet.of(PartOfSpeech.PRONOUN)); // pro-form, word that substitutes for another word, broader scope than pronoun
630+
TYPEMAP.put("Q115272253", EnumSet.of(PartOfSpeech.PRONOUN)); // possessive adjective, like "your"
631+
TYPEMAP.put("Q2824485", EnumSet.of(PartOfSpeech.PRONOUN)); // pronominal adjective
632+
TYPEMAP.put("Q115272205", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive adjective
633+
TYPEMAP.put("Q79377411", EnumSet.of(PartOfSpeech.PRONOUN)); // demonstrative pronoun
624634
TYPEMAP.put("Q147276", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // proper noun
625635
TYPEMAP.put("Q7884789", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // toponym
626636
TYPEMAP.put("Q43229", EnumSet.of(PartOfSpeech.PROPER_NOUN)); // organization
@@ -634,9 +644,11 @@ public String toString() {
634644
TYPEMAP.put("Q3254028", EnumSet.of(PartOfSpeech.VERB)); // separable verb, verb with a prefix which separates from the core verb in certain positions in a sentence
635645

636646
TYPEMAP.put("Q4239848", new HashSet<>(Arrays.asList(FormType.SHORT_FORM, PartOfSpeech.ADJECTIVE))); // short form of an adjective
647+
TYPEMAP.put("Q96406487", EnumSet.of(FormType.SHORT_FORM)); // short form
637648
TYPEMAP.put("Q112154", EnumSet.of(FormType.SHORT_FORM)); // apocope, loss of word-final sounds
638649
TYPEMAP.put("Q650250", EnumSet.of(FormType.SHORT_FORM)); // elision, omission of one or more sounds in a word
639650
TYPEMAP.put("Q114092330", EnumSet.of(FormType.SHORT_FORM)); // prevocalic form, linguistic feature marking a linguistic unit as appearing only before vowels
651+
TYPEMAP.put("Q96406455", EnumSet.of(FormType.LONG_FORM)); // long form
640652

641653
TYPEMAP.put("Q109267112", EnumSet.of(Polarity.AFFIRMATIVE));
642654
TYPEMAP.put("Q1478451", EnumSet.of(Polarity.NEGATIVE));
@@ -745,11 +757,16 @@ public String toString() {
745757
TYPEMAP.put("Q1230649", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.PARTICIPLE)));
746758
TYPEMAP.put("Q72249355", new HashSet<>(Arrays.asList(Voice.ACTIVE, VerbType.PARTICIPLE)));
747759
TYPEMAP.put("Q72249544", new HashSet<>(Arrays.asList(Voice.PASSIVE, VerbType.PARTICIPLE)));
760+
TYPEMAP.put("Q430255", new HashSet<>(Arrays.asList(Tense.PRESENT, Voice.ACTIVE, VerbType.PARTICIPLE))); // present active participle
761+
TYPEMAP.put("Q117824585", new HashSet<>(Arrays.asList(Tense.PRESENT, Voice.PASSIVE, VerbType.PARTICIPLE))); // present active participle
762+
TYPEMAP.put("Q16086106", new HashSet<>(Arrays.asList(Tense.PAST, Voice.PASSIVE, VerbType.PARTICIPLE))); // past active participle
748763
TYPEMAP.put("Q113133303", EnumSet.of(VerbType.PARTICIPLE)); // conjunctive participle
749764
TYPEMAP.put("Q192613", EnumSet.of(Tense.PRESENT)); // present tense
750765
TYPEMAP.put("Q3910936", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PRESENT))); // simple present and usually future
751766
TYPEMAP.put("Q1994301", EnumSet.of(Tense.PAST)); // past tense
752767
TYPEMAP.put("Q1392475", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.PAST))); // simple past
768+
TYPEMAP.put("Q113326559", EnumSet.of(Tense.PAST)); // non-remote tense
769+
TYPEMAP.put("Q113326099", EnumSet.of(Tense.DISTANT_PAST)); // remote tense
753770
TYPEMAP.put("Q501405", EnumSet.of(Tense.FUTURE)); // future tense
754771
TYPEMAP.put("Q344", EnumSet.of(Tense.FUTURE)); // future
755772
TYPEMAP.put("Q1475560", new HashSet<>(Arrays.asList(Aspect.SIMPLE, Tense.FUTURE))); // simple future
@@ -767,6 +784,8 @@ public String toString() {
767784
TYPEMAP.put("Q115223950", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // infinitive form in Norwegian Nynorsk that ends in 'a'
768785
TYPEMAP.put("Q115223951", EnumSet.of(Ignorable.IGNORABLE_PROPERTY)); // infinitive form in Norwegian Nynorsk that ends in 'e'
769786
TYPEMAP.put("Q1923028", EnumSet.of(VerbType.GERUND));
787+
TYPEMAP.put("Q380012", EnumSet.of(VerbType.GERUND)); // adverbial
788+
TYPEMAP.put("Q904896", EnumSet.of(VerbType.GERUND)); // transgressive, adverbial participle
770789
TYPEMAP.put("Q52434511", new HashSet<>(Arrays.asList(Tense.PRESENT, VerbType.GERUND)));
771790
TYPEMAP.put("Q52434598", new HashSet<>(Arrays.asList(Tense.PAST, VerbType.GERUND)));
772791
TYPEMAP.put("Q1050494", EnumSet.of(VerbType.NONFINITE));
@@ -791,7 +810,7 @@ public String toString() {
791810

792811
TYPEMAP.put("Q468801", EnumSet.of(PartOfSpeech.PRONOUN)); // personal pronoun
793812
TYPEMAP.put("Q1502460", EnumSet.of(PartOfSpeech.PRONOUN)); // possessive pronoun
794-
TYPEMAP.put("Q34793275", new HashSet<>(Arrays.asList(Definiteness.DEMONSTRATIVE, PartOfSpeech.PRONOUN))); // demonstrative pronoun
813+
TYPEMAP.put("Q34793275", EnumSet.of(PartOfSpeech.PRONOUN)); // demonstrative pronoun
795814
TYPEMAP.put("Q953129", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive pronoun
796815
TYPEMAP.put("Q130266209", EnumSet.of(PartOfSpeech.PRONOUN)); // reflexive personal pronoun
797816
TYPEMAP.put("Q1050744", EnumSet.of(PartOfSpeech.PRONOUN)); // relative pronoun
@@ -816,6 +835,7 @@ public String toString() {
816835
TYPEMAP.put("Q3502544", new HashSet<>(Arrays.asList(Tense.PAST, Mood.SUBJUNCTIVE))); // past subjunctive
817836
TYPEMAP.put("Q3502541", new HashSet<>(Arrays.asList(Aspect.IMPERFECT, Tense.PAST, Mood.SUBJUNCTIVE))); // imperfect subjunctive
818837
TYPEMAP.put("Q113289507", EnumSet.of(Mood.EMPHATIC));
838+
TYPEMAP.put("Q113959607", EnumSet.of(Mood.EMPHATIC)); // emphatic particle
819839
TYPEMAP.put("Q2532941", EnumSet.of(Mood.VOLITIVE));
820840

821841
TYPEMAP.put("Q5636904", EnumSet.of(Aspect.HABITUAL));
@@ -889,6 +909,9 @@ public String toString() {
889909
TYPEMAP.put("Q12237354", EnumSet.of(Usage.RARE)); // obsolete word
890910
TYPEMAP.put("Q54943392", EnumSet.of(Usage.RARE)); // obsolete form
891911
TYPEMAP.put("Q181970", EnumSet.of(Usage.RARE)); // archaism
912+
TYPEMAP.put("Q1098772", EnumSet.of(Usage.RARE)); // broken plural
913+
TYPEMAP.put("Q54944750", EnumSet.of(Usage.RARE)); // potential form
914+
TYPEMAP.put("Q55074511", EnumSet.of(Usage.RARE)); // reconstructed word
892915

893916
// Phrases and other things that don't inflect
894917
TYPEMAP.put("Q101352", EnumSet.of(Ignorable.IGNORABLE_LEMMA)); // family name. Lots of them conflict with common nouns, like "light"

inflection/tools/dictionary-parser/src/main/java/org/unicode/wikidata/ParseWikidata.java

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import java.util.TreeMap;
3535
import java.util.TreeSet;
3636

37+
import static org.unicode.wikidata.Grammar.Gender;
3738
import static org.unicode.wikidata.Grammar.Ignorable;
3839
import static org.unicode.wikidata.Grammar.PartOfSpeech;
3940
import static org.unicode.wikidata.Grammar.Sound;
@@ -48,7 +49,8 @@ public final class ParseWikidata {
4849
static final Set<String> PROPERTIES_WITH_GRAMMEMES = new TreeSet<>(List.of(
4950
"P31", // instance of. Sometimes phrase information is here.
5051
"P1552", // has characteristic for animacy
51-
"P5185" // grammatical gender
52+
"P5185", // grammatical gender
53+
"P11054" // grammatical number
5254
));
5355
static final Set<String> IMPORTANT_PROPERTIES = new TreeSet<>(PROPERTIES_WITH_GRAMMEMES);
5456

@@ -147,6 +149,7 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
147149
continue;
148150
}
149151
lemma.grammemes.remove(Ignorable.IGNORABLE_PROPERTY);
152+
removeConflicts(lemma.grammemes, Gender.class);
150153
for (var form : lexeme.forms) {
151154
Inflection currentInflection = null;
152155
var representation = form.representations.get(currentLemmaLanguage);
@@ -211,6 +214,33 @@ private void analyzeLexeme(int lineNumber, Lexeme lexeme) {
211214
}
212215
}
213216

217+
/**
218+
* When there are multiple genders at the lemma level, it's a ranking system instead of applying to all forms.
219+
* Such data is useless. So we should ignore it.
220+
* When there are multiple genders at the form level, the same form is valid for all specified genders.
221+
*/
222+
private void removeConflicts(TreeSet<Enum<?>> grammemes, Class<?> grammemeType) {
223+
if (grammemes.size() > 1) {
224+
var iter = grammemes.iterator();
225+
int count = 0;
226+
while (iter.hasNext()) {
227+
var grammeme = iter.next();
228+
if (grammemeType.isInstance(grammeme)) {
229+
count++;
230+
}
231+
}
232+
if (count > 1) {
233+
iter = grammemes.iterator();
234+
while (iter.hasNext()) {
235+
var grammeme = iter.next();
236+
if (grammemeType.isInstance(grammeme)) {
237+
iter.remove();
238+
}
239+
}
240+
}
241+
}
242+
}
243+
214244
private void convertGrammemes(LexemeForm form, Inflection currentInflection, String id, String lemma) {
215245
for (var feature : form.grammaticalFeatures) {
216246
Set<? extends Enum<?>> values = Grammar.getMappedGrammemes(feature);

inflection/tools/dictionary-parser/src/main/resources/org/unicode/wikidata/filter_en.properties

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
#
33
# These are lexemes that should either be ignored due to irrelevance that can't be easily tagged as irrelevant,
44
# or words that are just not that common that should be sorted last in the inflection patterns.
5-
L15388=rare
6-
L299075=omit
5+
# TODO remove this, since it is fixed upstream.
76
L342586=omit
7+
8+
L299075=omit
89
L468896=omit
910
L469033=omit
1011
L469036=omit
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Copyright 2025 Unicode Incorporated and others. All rights reserved.
2+
#
3+
# These are lexemes that should either be ignored due to irrelevance that can't be easily tagged as irrelevant,
4+
# or words that are just not that common that should be sorted last in the inflection patterns.
5+
L128740=omit
6+
L166820=omit

0 commit comments

Comments
 (0)