diff --git a/gradle/generation/kuromoji.gradle b/gradle/generation/kuromoji.gradle
index ec1c14c0a455..5c11cd24b7a7 100644
--- a/gradle/generation/kuromoji.gradle
+++ b/gradle/generation/kuromoji.gradle
@@ -132,6 +132,42 @@ configure(project(":lucene:analysis:kuromoji")) {
}
}
+ task compileUnidic(type: Download) {
+ description "Recompile dictionaries from UniDic data from https://clrd.ninjal.ac.jp/unidic_archive"
+ group "generation"
+ // Download-type task: fetches the UniDic archive, then doLast unpacks it and recompiles the dictionary.
+ dependsOn deleteDictionaryData
+ dependsOn sourceSets.main.runtimeClasspath
+ // Archive name, its download URL, and the zip / unpack locations under ${buildDir}/generate.
+ def dictionaryName = "unidic-cwj-3.1.1-full"
+ def dictionarySource = "https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/${dictionaryName}.zip"
+ def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.zip")
+ def unpackedDir = file("${buildDir}/generate/${dictionaryName}")
+ // Download settings; onlyIfModified presumably skips the fetch when the remote file is unchanged (confirm).
+ src dictionarySource
+ dest dictionaryFile
+ onlyIfModified true
+
+ doLast {
+ // Unpack the downloaded archive into a freshly emptied directory; cutdirsmapper dirs=1 drops the leading path element.
+ delete unpackedDir
+ ant.unzip(src: dictionaryFile, dest: unpackedDir) {
+ ant.cutdirsmapper(dirs: "1")
+ }
+
+ // Compile the dictionary. The list is appended to args inside the closure given to recompileDictionary: "unidic", input dir, targetDir, encoding, and a boolean (presumably normalizeEntry - confirm).
+ recompileDictionary(project, dictionaryName, {
+ args += [
+ "unidic",
+ unpackedDir,
+ targetDir,
+ "UTF-8",
+ false
+ ]
+ })
+ }
+ }
+
regenerate.dependsOn compileMecab
}
}
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 025c75911547..d5c251e8b045 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -102,6 +102,8 @@ New Features
Improvements
---------------------
+* LUCENE-4056: Japanese Tokenizer (Kuromoji) can build a UniDic dictionary (Jun Ohtani, Alexander Zagniotov)
+
* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
(Uihyun Kim)
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java
index cf5183dfddfb..8b5b427e6334 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java
@@ -25,7 +25,7 @@
* Tool to build dictionaries. Usage:
*
*
- * java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \
+ * java -cp [lucene classpath] org.apache.lucene.analysis.ja.dict.DictionaryBuilder \
* ${inputDir} ${outputDir} ${encoding} ${normalizeEntry}
*
*
@@ -66,7 +66,7 @@ public static void build(
.build(inputDir)
.write(outputDir);
- new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir);
+ new UnknownDictionaryBuilder(format, encoding).build(inputDir).write(outputDir);
ConnectionCostsBuilder.build(inputDir.resolve("matrix.def"))
.write(outputDir, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION);
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java
index 28579dc7db60..041ca49ceab0 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java
@@ -62,7 +62,7 @@ public TokenInfoDictionaryWriter build(Path dir) throws IOException {
}
private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IOException {
- TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
+ TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(format, 10 * 1024 * 1024);
Charset cs = Charset.forName(encoding);
// all lines in the file
List lines = new ArrayList<>(400000);
@@ -72,10 +72,7 @@ private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IO
while ((line = reader.readLine()) != null) {
String[] entry = CSVUtil.parse(line);
- if (entry.length < 13) {
- throw new IllegalArgumentException(
- "Entry in CSV is not valid (13 field values expected): " + line);
- }
+ validateEntryLengthWithThrow(line, entry);
lines.add(formatEntry(entry));
@@ -130,6 +127,16 @@ private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IO
return dictionary;
}
+ private void validateEntryLengthWithThrow(final String line, String[] entry) {
+   final int expected =
+       this.format == DictionaryBuilder.DictionaryFormat.IPADIC ? 13
+       : this.format == DictionaryBuilder.DictionaryFormat.UNIDIC ? 21 : 0; // 0 = unchecked
+   if (entry.length < expected) {
+     throw new IllegalArgumentException(
+         "Entry in CSV is not valid (" + expected + " field values expected): " + line);
+   }
+ }
+
/*
* IPADIC features
*
@@ -150,9 +157,10 @@ private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IO
* 3 - word cost
* 4-9 - pos
* 10 - base form reading
- * 11 - base form
+ * 11 - lexeme - not used
* 12 - surface form
* 13 - surface reading
+ * 14 - orthographic form
*/
private String[] formatEntry(String[] features) {
@@ -170,7 +178,7 @@ private String[] formatEntry(String[] features) {
features2[7] = features[7];
features2[8] = features[8];
features2[9] = features[9];
- features2[10] = features[11];
+ features2[10] = features[14];
// If the surface reading is non-existent, use surface form for reading and pronunciation.
// This happens with punctuation in UniDic and there are possibly other cases as well
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java
index 4bdfe5095a6c..01a84825d759 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java
@@ -26,10 +26,17 @@
/** Writes system dictionary entries */
class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
- private static final int ID_LIMIT = 8192;
+ private static final int IPADIC_ID_LIMIT = 8192;
- TokenInfoDictionaryEntryWriter(int size) {
+ // E.g.: unidic-cwj-3.1.1-full: 15388
+ // E.g.: unidic-cwj-202302_full: 18859
+ private static final int UNIDIC_ID_LIMIT = 18859;
+
+ private final DictionaryBuilder.DictionaryFormat format;
+
+ TokenInfoDictionaryEntryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
super(size);
+ this.format = format;
}
/**
@@ -47,6 +54,21 @@ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter {
* 11 - reading
* 12 - pronounciation
*
+ *
+ * unidic features
+ *
+ *
+ * 0 - surface
+ * 1 - left cost
+ * 2 - right cost
+ * 3 - word cost
+ * 4-9 - pos
+ * 10 - base form reading
+ * 11 - lexeme - not used
+ * 12 - surface form
+ * 13 - surface reading
+ * 14 - orthographic form
+ *
*/
@Override
protected int putEntry(String[] entry) {
@@ -114,31 +136,29 @@ protected int putEntry(String[] entry) {
flags |= TokenInfoMorphData.HAS_PRONUNCIATION;
}
- if (leftId != rightId) {
- throw new IllegalArgumentException("rightId != leftId: " + rightId + " " + leftId);
- }
- if (leftId >= ID_LIMIT) {
- throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId);
- }
+ validateLeftRightIdsWithThrow(leftId, rightId);
// add pos mapping
int toFill = 1 + leftId - posDict.size();
for (int i = 0; i < toFill; i++) {
posDict.add(null);
}
- String existing = posDict.get(leftId);
- if (existing != null && existing.equals(fullPOSData) == false) {
- // TODO: test me
- throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId);
- }
posDict.set(leftId, fullPOSData);
buffer.putShort((short) (leftId << 3 | flags));
buffer.putShort(wordCost);
if ((flags & TokenInfoMorphData.HAS_BASEFORM) != 0) {
- if (baseForm.length() >= 16) {
- throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16");
+ if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && baseForm.length() >= 16) {
+ throw new IllegalArgumentException(
+ "IPADIC base form length " + baseForm.length() + " is >= 16");
+ }
+
+ // Added the following check because when trying to build unidic-cwj-3.1.1-full,
+ // the base form length was greater than 16, thus, the original check was failing.
+ if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && baseForm.length() >= 35) {
+ throw new IllegalArgumentException(
+ "UNIDIC base form length " + baseForm.length() + " is >= 35");
}
int shared = sharedPrefix(entry[0], baseForm);
int suffix = baseForm.length() - shared;
@@ -179,6 +199,20 @@ protected int putEntry(String[] entry) {
return buffer.position();
}
+ private void validateLeftRightIdsWithThrow(short leftId, short rightId) {
+   final boolean ipadic = this.format == DictionaryBuilder.DictionaryFormat.IPADIC;
+   if (ipadic && leftId != rightId) {
+     throw new IllegalArgumentException("IpaDic rightId != leftId: " + rightId + " " + leftId);
+   }
+   if (ipadic && leftId >= IPADIC_ID_LIMIT) {
+     throw new IllegalArgumentException("IpaDic leftId >= " + IPADIC_ID_LIMIT + ": " + leftId);
+   }
+   // NOTE(review): putEntry writes (short) (leftId << 3 | flags); confirm ids >= 8192 survive.
+   if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && leftId >= UNIDIC_ID_LIMIT) {
+     throw new IllegalArgumentException("UniDic leftId >= " + UNIDIC_ID_LIMIT + ": " + leftId);
+   }
+ }
+
private boolean isKatakana(String s) {
for (int i = 0; i < s.length(); i++) {
char ch = s.charAt(i);
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java
index dda849b67d4d..1aada93d731f 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java
@@ -26,8 +26,8 @@ class TokenInfoDictionaryWriter
extends org.apache.lucene.analysis.morph.BinaryDictionaryWriter {
private FST fst;
- TokenInfoDictionaryWriter(int size) {
- super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(size));
+ TokenInfoDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
+ super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(format, size));
}
public void setFST(FST fst) {
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java
index ba5bc0e6a058..f51284af6f36 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java
@@ -30,9 +30,11 @@
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
+ private final DictionaryBuilder.DictionaryFormat format;
private final String encoding;
- UnknownDictionaryBuilder(String encoding) {
+ UnknownDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, String encoding) {
+ this.format = format;
this.encoding = encoding;
}
@@ -49,7 +51,7 @@ private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException
private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
throws IOException {
- UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
+ UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(format, 5 * 1024 * 1024);
List lines = new ArrayList<>();
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
@@ -60,11 +62,8 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
String line;
while ((line = lineReader.readLine()) != null) {
// note: unk.def only has 10 fields, it simplifies the writer to just append empty reading
- // and pronunciation,
- // even though the unknown dictionary returns hardcoded null here.
- final String[] parsed =
- CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
- lines.add(parsed);
+ // and pronunciation, even though the unknown dictionary returns hardcoded null here.
+ lines.add(parseCSVLine(line));
}
}
@@ -78,6 +77,14 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
return dictionary;
}
+ private String[] parseCSVLine(final String line) {
+   // Pad the unk.def row with "*" placeholder columns before parsing it.
+   final boolean unidic = this.format == DictionaryBuilder.DictionaryFormat.UNIDIC;
+   // UniDic needs one more placeholder column than the default padding.
+   final String padding = unidic ? ",*,*,*" : ",*,*";
+   return CSVUtil.parse(line + padding);
+ }
+
private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary)
throws IOException {
try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java
index eb41296e12e3..e1d646065a77 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java
@@ -29,8 +29,8 @@ class UnknownDictionaryWriter extends BinaryDictionaryWriter
CharacterDefinition.CLASS_COUNT,
CharacterDefinition::lookupCharacterClass);
- public UnknownDictionaryWriter(int size) {
- super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(size));
+ public UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) {
+ super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(format, size));
}
@Override
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java
index 2d245c7a599c..802b06c6efac 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java
@@ -25,7 +25,8 @@ public class TestUnknownDictionary extends LuceneTestCase {
@Test
public void testPutCharacterCategory() {
- UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
+ UnknownDictionaryWriter unkDic =
+ new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024);
expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME"));
@@ -40,7 +41,8 @@ public void testPutCharacterCategory() {
@Test
public void testPut() {
- UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
+ UnknownDictionaryWriter unkDic =
+ new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024);
expectThrows(
NumberFormatException.class,
() -> unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));