diff --git a/gradle/generation/kuromoji.gradle b/gradle/generation/kuromoji.gradle index ec1c14c0a455..5c11cd24b7a7 100644 --- a/gradle/generation/kuromoji.gradle +++ b/gradle/generation/kuromoji.gradle @@ -132,6 +132,42 @@ configure(project(":lucene:analysis:kuromoji")) { } } + task compileUnidic(type: Download) { + description "Recompile dictionaries from UniDic data from https://clrd.ninjal.ac.jp/unidic_archive" + group "generation" + + dependsOn deleteDictionaryData + dependsOn sourceSets.main.runtimeClasspath + + def dictionaryName = "unidic-cwj-3.1.1-full" + def dictionarySource = "https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/${dictionaryName}.zip" + def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.zip") + def unpackedDir = file("${buildDir}/generate/${dictionaryName}") + + src dictionarySource + dest dictionaryFile + onlyIfModified true + + doLast { + // Unpack the downloaded archive. + delete unpackedDir + ant.unzip(src: dictionaryFile, dest: unpackedDir) { + ant.cutdirsmapper(dirs: "1") + } + + // Compile the dictionary + recompileDictionary(project, dictionaryName, { + args += [ + "unidic", + unpackedDir, + targetDir, + "UTF-8", + false + ] + }) + } + } + regenerate.dependsOn compileMecab } } diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 025c75911547..d5c251e8b045 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -102,6 +102,8 @@ New Features Improvements --------------------- +* LUCENE-4056: Japanese Tokenizer (Kuromoji) can build a UniDic dictionary (Jun Ohtani, Alexander Zagniotov) + * LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori. 
(Uihyun Kim) diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java index cf5183dfddfb..8b5b427e6334 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/DictionaryBuilder.java @@ -25,7 +25,7 @@ * Tool to build dictionaries. Usage: * *
- *    java -cp [lucene classpath] org.apache.lucene.analysis.ja.util.DictionaryBuilder \
+ *    java -cp [lucene classpath] org.apache.lucene.analysis.ja.dict.DictionaryBuilder \
  *          ${inputDir} ${outputDir} ${encoding} ${normalizeEntry}
  * 
* @@ -66,7 +66,7 @@ public static void build( .build(inputDir) .write(outputDir); - new UnknownDictionaryBuilder(encoding).build(inputDir).write(outputDir); + new UnknownDictionaryBuilder(format, encoding).build(inputDir).write(outputDir); ConnectionCostsBuilder.build(inputDir.resolve("matrix.def")) .write(outputDir, DictionaryConstants.CONN_COSTS_HEADER, DictionaryConstants.VERSION); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java index 28579dc7db60..041ca49ceab0 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java @@ -62,7 +62,7 @@ public TokenInfoDictionaryWriter build(Path dir) throws IOException { } private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IOException { - TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024); + TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(format, 10 * 1024 * 1024); Charset cs = Charset.forName(encoding); // all lines in the file List lines = new ArrayList<>(400000); @@ -72,10 +72,7 @@ private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IO while ((line = reader.readLine()) != null) { String[] entry = CSVUtil.parse(line); - if (entry.length < 13) { - throw new IllegalArgumentException( - "Entry in CSV is not valid (13 field values expected): " + line); - } + validateEntryLengthWithThrow(line, entry); lines.add(formatEntry(entry)); @@ -130,6 +127,16 @@ private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IO return dictionary; } + private void validateEntryLengthWithThrow(final String line, String[] entry) { + if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && entry.length < 13) 
{ + throw new IllegalArgumentException( + "Entry in CSV is not valid (13 field values expected): " + line); + } else if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && entry.length < 21) { + throw new IllegalArgumentException( + "Entry in CSV is not valid (21 field values expected): " + line); + } + } + /* * IPADIC features * @@ -150,9 +157,10 @@ private TokenInfoDictionaryWriter buildDictionary(List csvFiles) throws IO * 3 - word cost * 4-9 - pos * 10 - base form reading - * 11 - base form + * 11 - lexeme - not used * 12 - surface form * 13 - surface reading + * 14 - orthographic form */ private String[] formatEntry(String[] features) { @@ -170,7 +178,7 @@ private String[] formatEntry(String[] features) { features2[7] = features[7]; features2[8] = features[8]; features2[9] = features[9]; - features2[10] = features[11]; + features2[10] = features[14]; // If the surface reading is non-existent, use surface form for reading and pronunciation. // This happens with punctuation in UniDic and there are possibly other cases as well diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java index 4bdfe5095a6c..01a84825d759 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java @@ -26,10 +26,17 @@ /** Writes system dictionary entries */ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter { - private static final int ID_LIMIT = 8192; + private static final int IPADIC_ID_LIMIT = 8192; - TokenInfoDictionaryEntryWriter(int size) { + // E.g.: unidic-cwj-3.1.1-full: 15388 + // E.g.: unidic-cwj-202302_full: 18859 + private static final int UNIDIC_ID_LIMIT = 18859; + + private final DictionaryBuilder.DictionaryFormat 
format; + + TokenInfoDictionaryEntryWriter(DictionaryBuilder.DictionaryFormat format, int size) { super(size); + this.format = format; } /** @@ -47,6 +54,21 @@ class TokenInfoDictionaryEntryWriter extends DictionaryEntryWriter { * 11 - reading * 12 - pronounciation * + * + *

unidic features + * + *

+   * 0   - surface
+   * 1   - left context ID
+   * 2   - right context ID
+   * 3   - word cost
+   * 4-9 - pos
+   * 10  - base form reading
+   * 11  - lexeme - not used
+   * 12  - surface form
+   * 13  - surface reading
+   * 14  - orthographic form
+   * 
*/ @Override protected int putEntry(String[] entry) { @@ -114,31 +136,29 @@ protected int putEntry(String[] entry) { flags |= TokenInfoMorphData.HAS_PRONUNCIATION; } - if (leftId != rightId) { - throw new IllegalArgumentException("rightId != leftId: " + rightId + " " + leftId); - } - if (leftId >= ID_LIMIT) { - throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId); - } + validateLeftRightIdsWithThrow(leftId, rightId); // add pos mapping int toFill = 1 + leftId - posDict.size(); for (int i = 0; i < toFill; i++) { posDict.add(null); } - String existing = posDict.get(leftId); - if (existing != null && existing.equals(fullPOSData) == false) { - // TODO: test me - throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId); - } posDict.set(leftId, fullPOSData); buffer.putShort((short) (leftId << 3 | flags)); buffer.putShort(wordCost); if ((flags & TokenInfoMorphData.HAS_BASEFORM) != 0) { - if (baseForm.length() >= 16) { - throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16"); + if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && baseForm.length() >= 16) { + throw new IllegalArgumentException( + "IPADIC base form length " + baseForm.length() + " is >= 16"); + } + + // Added the following check because when trying to build unidic-cwj-3.1.1-full, + // the base form length was greater than 16, thus, the original check was failing. 
+ if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && baseForm.length() >= 35) { + throw new IllegalArgumentException( + "UNIDIC base form length " + baseForm.length() + " is >= 35"); } int shared = sharedPrefix(entry[0], baseForm); int suffix = baseForm.length() - shared; @@ -179,6 +199,20 @@ protected int putEntry(String[] entry) { return buffer.position(); } + private void validateLeftRightIdsWithThrow(short leftId, short rightId) { + if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && leftId != rightId) { + throw new IllegalArgumentException("IpaDic rightId != leftId: " + rightId + " " + leftId); + } + + if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC && leftId >= IPADIC_ID_LIMIT) { + throw new IllegalArgumentException("IpaDic leftId >= " + IPADIC_ID_LIMIT + ": " + leftId); + } + + if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC && leftId >= UNIDIC_ID_LIMIT) { + throw new IllegalArgumentException("UniDic leftId >= " + UNIDIC_ID_LIMIT + ": " + leftId); + } + } + private boolean isKatakana(String s) { for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java index dda849b67d4d..1aada93d731f 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryWriter.java @@ -26,8 +26,8 @@ class TokenInfoDictionaryWriter extends org.apache.lucene.analysis.morph.BinaryDictionaryWriter { private FST fst; - TokenInfoDictionaryWriter(int size) { - super(TokenInfoDictionary.class, new TokenInfoDictionaryEntryWriter(size)); + TokenInfoDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) { + super(TokenInfoDictionary.class, new 
TokenInfoDictionaryEntryWriter(format, size)); } public void setFST(FST fst) { diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java index ba5bc0e6a058..f51284af6f36 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java @@ -30,9 +30,11 @@ class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*"; + private final DictionaryBuilder.DictionaryFormat format; private final String encoding; - UnknownDictionaryBuilder(String encoding) { + UnknownDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, String encoding) { + this.format = format; this.encoding = encoding; } @@ -49,7 +51,7 @@ private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) throws IOException { - UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); + UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(format, 5 * 1024 * 1024); List lines = new ArrayList<>(); try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding)); @@ -60,11 +62,8 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) String line; while ((line = lineReader.readLine()) != null) { // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading - // and pronunciation, - // even though the unknown dictionary returns hardcoded null here. 
- final String[] parsed = - CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry - lines.add(parsed); + // and pronunciation, even though the unknown dictionary returns hardcoded null here. + lines.add(parseCSVLine(line)); } } @@ -78,6 +77,14 @@ private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding) return dictionary; } + private String[] parseCSVLine(final String line) { + if (this.format == DictionaryBuilder.DictionaryFormat.UNIDIC) { + return CSVUtil.parse(line + ",*,*,*"); // UniDic needs one more column + } else { + return CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry + } + } + private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary) throws IOException { try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding)); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java index eb41296e12e3..e1d646065a77 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryWriter.java @@ -29,8 +29,8 @@ class UnknownDictionaryWriter extends BinaryDictionaryWriter CharacterDefinition.CLASS_COUNT, CharacterDefinition::lookupCharacterClass); - public UnknownDictionaryWriter(int size) { - super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(size)); + public UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat format, int size) { + super(UnknownDictionary.class, new TokenInfoDictionaryEntryWriter(format, size)); } @Override diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java index 
2d245c7a599c..802b06c6efac 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java @@ -25,7 +25,8 @@ public class TestUnknownDictionary extends LuceneTestCase { @Test public void testPutCharacterCategory() { - UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024); + UnknownDictionaryWriter unkDic = + new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024); expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME")); @@ -40,7 +41,8 @@ public void testPutCharacterCategory() { @Test public void testPut() { - UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024); + UnknownDictionaryWriter unkDic = + new UnknownDictionaryWriter(DictionaryBuilder.DictionaryFormat.IPADIC, 10 * 1024 * 1024); expectThrows( NumberFormatException.class, () -> unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));