Skip to content

Commit 7d9c2c1

Browse files
authored
Inflection-64 Convert dictionary-parser to consume Wikidata (#65)
1 parent 71393a2 commit 7d9c2c1

File tree

99 files changed

+2118
-7275
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+2118
-7275
lines changed

inflection/gradle/versions.gradle

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
/*
2+
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
23
* Copyright 2020-2024 Apple Inc. All rights reserved.
34
*/
45
// This is a sorted map of every dependency we want to use throughout the whole Inflection repo
56
ext.dependencyVersions = [
6-
commons_text: '1.10.0',
7-
icu4j: '75.1',
7+
commons_compress: '1.27.1',
8+
commons_text: '1.13.0',
9+
icu4j: '76.1',
10+
jackson: '2.18.2',
811
jsr305: '3.0.2',
9-
junit_jupiter: '5.10.2',
10-
junit_platform: '1.10.2',
11-
log4j: '2.22.1',
12-
xerces: '2.12.2',
12+
junit_jupiter: '5.11.4',
13+
junit_platform: '1.11.4',
14+
log4j: '2.24.3',
1315
]

inflection/resources/org/unicode/inflection/features/grammar.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<?xml version='1.0' encoding='utf-8'?>
22
<!--
3+
Copyright 2025 Unicode Incorporated and others. All rights reserved.
34
Copyright 2016-2024 Apple Inc. All rights reserved.
45
-->
56
<!DOCTYPE languages SYSTEM "grammar.dtd">
@@ -23,7 +24,7 @@
2324
<grammeme name="interjection"/>
2425
<!-- grammeme name="letter"/ -->
2526
<grammeme name="noun"/>
26-
<!-- grammeme name="number"/ -->
27+
<grammeme name="numeral"/>
2728
<!-- grammeme name="participle" -->
2829
<grammeme name="particle"/>
2930
<!-- grammeme name="postposition"/ -->

inflection/test/src/inflection/lang/features/LanguageGrammarFeaturesTest.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
23
* Copyright 2016-2024 Apple Inc. All rights reserved.
34
*/
45
#include "catch2/catch_test_macros.hpp"
@@ -111,7 +112,7 @@ TEST_CASE("LanguageGrammarFeaturesTest#testLanguageGrammarFeatures") /* throws(E
111112
const auto& nbLangGramFeatures = ::inflection::lang::features::LanguageGrammarFeatures::getLanguageGrammarFeatures(::inflection::util::ULocale("nb"));
112113
auto nbCategories = nbLangGramFeatures.getCategories();
113114
REQUIRE(nbCategories.size() == 11);
114-
REQUIRE(categorySize(nbCategories, u"pos") == 10);
115+
REQUIRE(categorySize(nbCategories, u"pos") == 11);
115116
REQUIRE(categorySize(nbCategories, u"number") == 2);
116117

117118
auto nbFeatures = nbLangGramFeatures.getFeatures();
@@ -134,7 +135,7 @@ TEST_CASE("LanguageGrammarFeaturesTest#testLanguageGrammarFeatures") /* throws(E
134135
const auto& rootLangGramFeatures = ::inflection::lang::features::LanguageGrammarFeatures::getLanguageGrammarFeatures(::inflection::util::LocaleUtils::ROOT());
135136
auto rootCategories = rootLangGramFeatures.getCategories();
136137
REQUIRE(rootCategories.size() == 2);
137-
REQUIRE(categorySize(rootCategories, u"pos") == 10);
138+
REQUIRE(categorySize(rootCategories, u"pos") == 11);
138139
REQUIRE(categorySize(rootCategories, u"gender") == -1);
139140
}
140141

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
#
2+
# Copyright 2025 Unicode Incorporated and others. All rights reserved.
23
# Copyright 2020-2024 Apple Inc. All rights reserved.
34
#
45
JAR=build/libs/dictionary-parser-all.jar
56
if [ ! -e "$JAR" ]
67
then
78
../../gradlew clean fatJar >&2
89
fi
9-
java -Dfile.encoding=UTF-8 -cp $JAR com.apple.lexicon.ParseLexicon $*
10+
java -Dfile.encoding=UTF-8 -cp $JAR org.unicode.wikidata.ParseWikidata $*
Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,23 @@
11
<!--
2+
Copyright 2025 Unicode Incorporated and others. All rights reserved.
23
Copyright 2021-2024 Apple Inc. All rights reserved.
34
-->
45
# Dictionary Parsers
56

67
These tools generate files that describe the grammatical properties of words from various sources.
78

8-
## Usage for Wiktionary
9+
## Usage for Wikidata
910

10-
1) Download a copy of Wiktionary data from http://dumps.wikimedia.org/ (e.g. http://dumps.wikimedia.org/enwiktionary/20130825/enwiktionary-20130825-pages-articles.xml.bz2)
11-
2) Decompress dump
12-
3) Run `./ParseWiktionary ~/Downloads/enwiktionary-20130825-pages-articles.xml > ../../resources/com/apple/inflection/dictionary/dictionary_en.lst`
13-
14-
## Usage for Apple's Lexical Resources (ALR)
15-
16-
1) Get the latest lexicon for your language.
11+
1) Download a copy of Wikidata from https://dumps.wikimedia.org/wikidatawiki/entities/ (e.g. https://dumps.wikimedia.org/wikidatawiki/entities/20250115/wikidata-20250115-lexemes.json.bz2)
1712
2) Check what options were used for your language. They are at the end of the generated dictionary_XX.lst; look for "generated with options".
18-
- Run `grep 'generated with options' ../../resources/com/apple/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
13+
- Run `grep 'generated with options' ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
1914
- If the above command prints nothing, no additional options were used to generate the file, or it was generated with a different tool.
20-
- To see what options are available run `./ParseLexicon`
15+
- To see what options are available run `./ParseWikidata`
16+
- At a minimum, use the `--locale` option to specify the ISO-639 code for the language to extract.
2117
3) Run
2218
```
23-
./ParseLexicon <THE_OPTIONS_FROM_STEP_2> \
24-
--inflections ../../resources/com/apple/inflection/dictionary/inflectional_XX.xml \
25-
<THE_LEXICON_FILE> \
26-
> ../../resources/com/apple/inflection/dictionary/dictionary_XX.lst
19+
./ParseWikidata <THE_OPTIONS_FROM_STEP_2> \
20+
--inflections ../../resources/org/unicode/inflection/dictionary/inflectional_XX.xml \
21+
--dictionary ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst \
22+
<wikidata-NNNNNNNN-lexemes.json.bz2>
2723
```

inflection/tools/dictionary-parser/build.gradle

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/*
2+
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
23
* Copyright 2020-2024 Apple Inc. All rights reserved.
34
*/
45
group = 'com.apple.inflection.tools'
@@ -7,15 +8,16 @@ description = 'Converts lexical dictionaries into a form that can be consumed by
78
dependencies {
89
implementation group: 'com.ibm.icu', name: 'icu4j', version: dependencyVersions.icu4j
910
implementation(group: 'org.apache.commons', name: 'commons-text', version: dependencyVersions.commons_text)
10-
runtimeOnly(group: 'xerces', name: 'xercesImpl', version: dependencyVersions.xerces) {
11-
exclude(group: 'xml-apis')
12-
}
11+
implementation(group: 'org.apache.commons', name: 'commons-compress', version: dependencyVersions.commons_compress)
12+
implementation(group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: dependencyVersions.jackson)
13+
implementation(group: 'com.fasterxml.jackson.core', name: 'jackson-annotations', version: dependencyVersions.jackson)
14+
implementation(group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: dependencyVersions.jackson)
1315
}
1416

1517
tasks.register('fatJar', Jar) {
1618
manifest {
1719
attributes 'Implementation-Version': project.version,
18-
'Main-Class': 'com.apple.wiktionary.ParseWiktionary'
20+
'Main-Class': 'org.unicode.wikidata.ParseWikidata'
1921
}
2022
archiveFileName = project.name + '-all.jar'
2123
duplicatesStrategy = 'include'

0 commit comments

Comments
 (0)