Skip to content

Commit ad8e163

Browse files
authored
Inflection-37 Support word decompounding for inflecting words (#106)
1 parent 45e629d commit ad8e163

File tree

228 files changed

+807318
-20458
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

228 files changed

+807318
-20458
lines changed

inflection/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ inflection_debug_vars(NUM_PROCESSORS_VAL)
2626
# Unicode Inflection cache variables
2727
set(NUM_PROCESSORS ${NUM_PROCESSORS_VAL} CACHE STRING "Number of cores to be used in make")
2828

29-
# Morphun options
3029
option(PROFILING "Turn on code profiling" OFF)
3130

3231
add_compile_options(${CXX_STD_LIB_FLAG})

inflection/resources/CMakeLists.txt

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,26 @@ foreach (LOCALE IN LISTS BINARY_DICT_LOCALES)
147147
endforeach ()
148148
# -------- End Dictionary files section
149149

150+
# ======== Start tok dictionary files section
# Compiles every tokenizer.dictionary source found under the tokenizer resource
# directory into a matching .tokd file under the inflection data root, using the
# buildTokDictionary tool. The resulting paths are collected in BINARY_TOK_DICTS.
151+
# Directory in the source tree that is searched for tokenizer.dictionary files.
set(BINARY_TOK_DICT_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/org/unicode/inflection/tokenizer)
152+
# Root directory under INFLECTION_DATA_ROOT that receives the generated .tokd files.
set(BINARY_TOK_DICT_DIR_ROOT ${INFLECTION_DATA_ROOT}/inflection/tokenizer)
153+
# Created at configure time, so the output root exists before any build rule runs.
file(MAKE_DIRECTORY ${BINARY_TOK_DICT_DIR_ROOT})
154+
155+
# Collect <src dir>/<subdir>/tokenizer.dictionary files. CONFIGURE_DEPENDS asks
# CMake to re-check the glob at build time so added or removed files are noticed.
file(GLOB_RECURSE BINARY_TOK_DICT_SRCS CONFIGURE_DEPENDS ${BINARY_TOK_DICT_SRC_DIR}/*/tokenizer.dictionary)
156+
foreach (BINARY_TOK_DICT_SRC IN LISTS BINARY_TOK_DICT_SRCS)
157+
# Map the source path to its output path: swap the source root for the output
# root and the .dictionary suffix for .tokd, keeping the relative sub-path.
# NOTE(review): the source directory is interpolated into the regex unescaped;
# a path containing regex metacharacters (e.g. '+', '(') may not match, which
# would leave BINARY_TOK_DICT equal to the source path -- confirm.
string(REGEX REPLACE "^${BINARY_TOK_DICT_SRC_DIR}/(.*)\\.dictionary" "${BINARY_TOK_DICT_DIR_ROOT}/\\1.tokd" BINARY_TOK_DICT ${BINARY_TOK_DICT_SRC})
158+
# Directory part of the output file.
get_filename_component(BINARY_TOK_DICT_DIR ${BINARY_TOK_DICT} DIRECTORY)
159+
# Record the output directory in RESOURCE_DIRS.
# Presumably these directories are created elsewhere in this file -- verify.
list(APPEND RESOURCE_DIRS ${BINARY_TOK_DICT_DIR})
160+
# Accumulate the generated file path in BINARY_TOK_DICTS.
list(APPEND BINARY_TOK_DICTS ${BINARY_TOK_DICT})
161+
162+
# Build rule: run buildTokDictionary <source> <output> with the library path
# variable named by LIBRARY_PATH_NAME set to ICU_LIB_DIRECTORY for the command.
add_custom_command(
163+
OUTPUT ${BINARY_TOK_DICT}
164+
# NOTE(review): the leading VAR=value form relies on the command being run by
# a POSIX-style shell, and VERBATIM is not set, so argument escaping is
# platform-dependent -- confirm this is acceptable for supported platforms.
COMMAND ${LIBRARY_PATH_NAME}=${ICU_LIB_DIRECTORY} ${CMAKE_CURRENT_BINARY_DIR}/../tools/buildTokDictionary/buildTokDictionary ${BINARY_TOK_DICT_SRC} ${BINARY_TOK_DICT}
165+
# Re-run when either the buildTokDictionary dependency or the source
# dictionary changes.
DEPENDS buildTokDictionary ${BINARY_TOK_DICT_SRC}
166+
)
167+
endforeach()
168+
# -------- End tok dictionary section
169+
150170
file(GLOB_RECURSE RESOURCE_BINARIES ${CMAKE_CURRENT_SOURCE_DIR}/share/*)
151171
install_build_resources(
152172
RESOURCE_BINARIES
@@ -155,7 +175,7 @@ install_build_resources(
155175
RESOURCE_BINARIES_DIST
156176
)
157177

158-
add_custom_target(inflection-data ALL DEPENDS ${BINARY_DICTS} ${RESOURCE_BINARIES_DIST})
178+
add_custom_target(inflection-data ALL DEPENDS ${BINARY_DICTS} ${BINARY_TOK_DICTS} ${RESOURCE_BINARIES_DIST})
159179

160180
#Make directories for all generated resource files
161181
list(REMOVE_DUPLICATES RESOURCE_DIRS)

inflection/resources/org/unicode/inflection/dictionary/dictionary_ar.lst

Lines changed: 102 additions & 64 deletions
Large diffs are not rendered by default.

inflection/resources/org/unicode/inflection/dictionary/dictionary_fr.lst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Est: singular masculine vowel-start noun proper-noun inflection=40
1515
Eurockéennes de Belfort: plural feminine vowel-start noun inflection=b0
1616
FIAC: singular feminine abbreviation noun
1717
Fashion week: singular feminine noun inflection=28
18-
France: singular plural feminine noun proper-noun inflection=7
18+
France: singular feminine noun proper-noun inflection=7
1919
FrancoFolies: plural masculine noun proper-noun inflection=36
2020
Francofolies: plural feminine noun proper-noun inflection=67
2121
Félix: singular plural masculine noun proper-noun inflection=5

inflection/resources/org/unicode/inflection/dictionary/dictionary_he.lst

Lines changed: 195 additions & 34 deletions
Large diffs are not rendered by default.

inflection/resources/org/unicode/inflection/dictionary/dictionary_it.lst

Lines changed: 70 additions & 225 deletions
Large diffs are not rendered by default.

inflection/resources/org/unicode/inflection/dictionary/dictionary_ko.lst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ bluetooth: vowel-end noun
33
abby: vowel-end
44
c&c: vowel-end
55
cgv: vowel-end
6+
chatgpt: vowel-end
67
cj: vowel-end
78
cnh: vowel-end
89
cs: vowel-end
Lines changed: 31 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,63 +1,40 @@
1-
Ås: noun proper-noun inflection=6
21
absurd: singular indefinite adjective
3-
administrasjon: singular indefinite masculine noun inflection=2
4-
administrasjonen: singular definite masculine noun inflection=2
5-
administrasjonene: plural definite masculine noun inflection=2
6-
administrasjoner: plural indefinite masculine noun inflection=2
7-
ås: singular genitive indefinite masculine feminine noun inflection=314
8-
avtale: singular indefinite masculine noun verb inflection=7
9-
belta: plural definite neuter noun inflection=16
10-
beltas: plural genitive definite neuter noun inflection=16
11-
belte: singular indefinite neuter noun inflection=16
12-
belter: plural indefinite neuter noun inflection=1 inflection=16
13-
belters: plural genitive indefinite neuter noun inflection=16
14-
beltes: singular genitive indefinite neuter noun inflection=16
15-
beltet: singular definite neuter noun inflection=1 inflection=16
16-
beltets: singular genitive definite neuter noun inflection=16
17-
bokmerke: singular indefinite neuter noun inflection=f
18-
fast: singular indefinite adjective verb
2+
administrasjon: singular indefinite masculine noun inflection=1
3+
all: determiner
4+
Apple: singular masculine noun proper-noun inflection=7
5+
ås: singular genitive indefinite masculine feminine noun inflection=17e
6+
avtale: singular indefinite masculine noun verb inflection=4
7+
belta: plural definite neuter noun inflection=d
8+
belte: singular indefinite neuter noun inflection=d
9+
beltene: plural definite neuter noun inflection=d
10+
bildøra: singular definite feminine noun inflection=64
11+
bokmerke: singular indefinite neuter noun inflection=d
12+
budskap: singular plural indefinite neuter noun inflection=6
13+
ekte: singular plural definite indefinite adjective
14+
fast: singular indefinite adjective
1915
gammel: singular indefinite adjective
16+
god: singular indefinite adjective
2017
høyest: singular plural indefinite adjective
21-
høyeste: singular plural definite adjective
22-
jazz: singular indefinite masculine noun verb inflection=12
23-
lampa: singular definite feminine noun inflection=78
24-
lampe: singular indefinite masculine feminine noun inflection=78
25-
lunken: singular indefinite masculine adjective noun inflection=1
26-
møte: singular indefinite neuter noun verb inflection=16
27-
møter: plural indefinite neuter noun verb inflection=16
28-
øvinga: singular definite feminine noun inflection=3
29-
vogntoga: plural definite neuter noun inflection=4
30-
budskap: singular plural indefinite neuter noun inflection=9
31-
budskapa: plural definite neuter noun inflection=9
32-
budskapas: plural genitive definite neuter noun inflection=9
33-
budskapene: plural definite neuter noun inflection=9
34-
budskapenes: plural genitive definite neuter noun inflection=9
35-
budskaper: plural indefinite neuter noun inflection=9
36-
budskapers: plural genitive indefinite neuter noun inflection=9
37-
budskapet: singular definite neuter noun inflection=9
38-
budskapets: singular genitive definite neuter noun inflection=9
39-
budskaps: singular plural genitive indefinite neuter noun inflection=9
40-
mann: singular indefinite masculine noun verb inflection=87
41-
menn: plural indefinite masculine noun inflection=87
42-
bildør: singular indefinite masculine feminine noun inflection=d2
43-
bildøra: singular definite feminine noun inflection=d2
44-
bildøras: singular genitive definite feminine noun inflection=d2
45-
bildøren: singular definite masculine noun inflection=d2
46-
bildørene: plural definite masculine feminine noun inflection=d2
47-
bildørenes: plural genitive definite masculine feminine noun inflection=d2
48-
bildørens: singular genitive definite masculine noun inflection=d2
49-
bildører: plural indefinite masculine feminine noun inflection=d2
50-
bildørers: plural genitive indefinite masculine feminine noun inflection=d2
51-
bildørs: singular genitive indefinite masculine feminine noun inflection=d2
18+
jazz: singular indefinite masculine noun verb inflection=b
19+
kjør: verb
20+
lampa: singular definite feminine noun inflection=43
21+
lampe: singular indefinite masculine feminine noun inflection=43
22+
lunken: singular indefinite adjective
5223
makaber: singular indefinite adjective
53-
mor: singular indefinite masculine feminine noun verb inflection=30 inflection=3b
54-
moren: singular definite masculine noun inflection=30
24+
mann: singular indefinite masculine noun verb inflection=4c
25+
mor: singular indefinite masculine feminine noun verb inflection=31 inflection=3b
26+
moren: singular definite masculine noun inflection=31
27+
møte: singular indefinite neuter noun verb inflection=d
28+
møter: plural indefinite neuter noun verb inflection=d
29+
norsk: singular indefinite masculine adjective noun verb inflection=1f
30+
opp: adverb
31+
øvinga: singular definite feminine noun inflection=2
32+
praktisk: singular indefinite adjective
33+
rosa: singular plural definite indefinite adjective
34+
tet: singular indefinite masculine noun inflection=1
5535
tykk: singular indefinite adjective
5636
vennlig: singular indefinite adjective
57-
praktisk: singular indefinite adjective
58-
norsk: singular indefinite masculine adjective noun verb
59-
god: singular indefinite adjective
60-
skjørtet: noun inflection=1
37+
vogntoga: plural definite neuter noun inflection=3
6138
==============================================
6239
Manually curated for tests to pass
6340
Copyright 2024-2024 Apple Inc. All rights reserved.

0 commit comments

Comments
 (0)