From a4574d425487c74e8dfc6c5c465418d384cbaa54 Mon Sep 17 00:00:00 2001 From: Joel Mackenzie Date: Tue, 8 Apr 2025 13:58:36 +1000 Subject: [PATCH 1/4] Shard Merge tool --- .../java/io/anserini/index/MergeShards.java | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 src/main/java/io/anserini/index/MergeShards.java diff --git a/src/main/java/io/anserini/index/MergeShards.java b/src/main/java/io/anserini/index/MergeShards.java new file mode 100644 index 0000000000..510edacb18 --- /dev/null +++ b/src/main/java/io/anserini/index/MergeShards.java @@ -0,0 +1,35 @@ +package io.anserini.index; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +import java.nio.file.Paths; + +public class MergeShards { + + public static void main(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: MergeShards [ ...]"); + System.exit(1); + } + + String outputDir = args[0]; + FSDirectory mergedDir = FSDirectory.open(Paths.get(outputDir)); + IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); + IndexWriter writer = new IndexWriter(mergedDir, config); + + for (int i = 1; i < args.length; i++) { + System.out.println("Adding index: " + args[i]); + FSDirectory shardDir = FSDirectory.open(Paths.get(args[i])); + writer.addIndexes(shardDir); + } + + System.out.println("Merging..."); + writer.forceMerge(1); + writer.close(); + System.out.println("Done. Merged index at: " + outputDir); + } +} + From 91f4fc4c2742dfe59833f2a174a98beeb6ea5a1b Mon Sep 17 00:00:00 2001 From: Brayden Zhong Date: Tue, 8 Apr 2025 21:24:35 -0400 Subject: [PATCH 2/4] test: add test for MergeShards --- .../io/anserini/index/MergeShardsTest.java | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 src/test/java/io/anserini/index/MergeShardsTest.java diff --git a/src/test/java/io/anserini/index/MergeShardsTest.java b/src/test/java/io/anserini/index/MergeShardsTest.java new file mode 100644 index 0000000000..e86d349f1a --- /dev/null +++ b/src/test/java/io/anserini/index/MergeShardsTest.java @@ -0,0 +1,161 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.index; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.en.EnglishAnalyzer; +import org.apache.lucene.document.BinaryDocValuesField; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.Path; + +import static org.junit.Assert.assertEquals; + +public class MergeShardsTest extends IndexerTestBase { + private final static PrintStream standardOut = System.out; + private final static ByteArrayOutputStream outputCaptor = new ByteArrayOutputStream(); + + private Path shardDir1; + private Path shardDir2; + private Path mergedDir; + + @Before + @Override + public void setUp() throws Exception { + super.setUp(); + shardDir1 = createTempDir(); + shardDir2 = createTempDir(); + mergedDir = createTempDir(); + + System.setOut(new PrintStream(outputCaptor)); + + buildShardIndex(shardDir1, "shard1-doc1", "shard1-doc2"); + buildShardIndex(shardDir2, "shard2-doc1", "shard2-doc2"); + } + + @After + @Override + public void tearDown() throws Exception { + System.setOut(standardOut); + System.gc(); + super.tearDown(); + } + + private void buildShardIndex(Path path, String id1, String id2) throws IOException { + Directory dir = FSDirectory.open(path); + + Analyzer analyzer = new EnglishAnalyzer(); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + + IndexWriter writer = new IndexWriter(dir, config); + + FieldType textOptions = new FieldType(); + textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + textOptions.setStored(true); + textOptions.setTokenized(true); + textOptions.setStoreTermVectors(true); + textOptions.setStoreTermVectorPositions(true); + + Document doc1 = new Document(); + String doc1Text = "here is some text for " + id1; + doc1.add(new StringField(Constants.ID, id1, Field.Store.YES)); + doc1.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id1.getBytes()))); + doc1.add(new Field(Constants.CONTENTS, doc1Text, textOptions)); + doc1.add(new StoredField(Constants.RAW, String.format("{\"contents\": \"%s\"}", doc1Text))); + writer.addDocument(doc1); + + Document doc2 = new Document(); + String doc2Text = "more texts for " + id2; + doc2.add(new StringField(Constants.ID, id2, Field.Store.YES)); + doc2.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id2.getBytes()))); + doc2.add(new Field(Constants.CONTENTS, doc2Text, textOptions)); + doc2.add(new StoredField(Constants.RAW, String.format("{\"contents\": \"%s\"}", doc2Text))); + writer.addDocument(doc2); + + writer.commit(); + writer.forceMerge(1); + writer.close(); + + dir.close(); + } + + @Test + public void testMergeShards() throws Exception { + String[] args = new String[] { + mergedDir.toString(), + shardDir1.toString(), + shardDir2.toString() + }; + + MergeShards.main(args); + + String output = outputCaptor.toString(); + assert(output.contains("Adding index: " + shardDir1.toString())); + assert(output.contains("Adding index: " + shardDir2.toString())); + assert(output.contains("Merging...")); + assert(output.contains("Done. Merged index at: " + mergedDir.toString())); + + Directory dir = FSDirectory.open(mergedDir); + IndexReader reader = DirectoryReader.open(dir); + + assertEquals(4, reader.numDocs()); + + assertEquals(1, reader.docFreq(new Term(Constants.ID, "shard1-doc1"))); + assertEquals(1, reader.docFreq(new Term(Constants.ID, "shard1-doc2"))); + assertEquals(1, reader.docFreq(new Term(Constants.ID, "shard2-doc1"))); + assertEquals(1, reader.docFreq(new Term(Constants.ID, "shard2-doc2"))); + + assertEquals(4, reader.docFreq(new Term(Constants.CONTENTS, "text"))); + assertEquals(4, reader.docFreq(new Term(Constants.CONTENTS, "for"))); + + reader.close(); + dir.close(); + } + + @Test + public void testMergeShardsInvalidArgs() throws Exception { + String[] args = new String[] { mergedDir.toString() }; + + outputCaptor.reset(); + + try { + MergeShards.main(args); + } catch (SecurityException e) { + } + + } +} \ No newline at end of file From 92f236215abfe92cc1b99d6123b0d50454d569bd Mon Sep 17 00:00:00 2001 From: Brayden Zhong Date: Tue, 8 Apr 2025 21:36:49 -0400 Subject: [PATCH 3/4] Update: modify MergeShards.java and MergeShardsTest.java --- .../java/io/anserini/index/MergeShards.java | 17 +++++++++++++- .../io/anserini/index/MergeShardsTest.java | 23 +++++++++++++------ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/main/java/io/anserini/index/MergeShards.java b/src/main/java/io/anserini/index/MergeShards.java index 510edacb18..478bbbd0ae 100644 --- a/src/main/java/io/anserini/index/MergeShards.java +++ b/src/main/java/io/anserini/index/MergeShards.java @@ -1,3 +1,19 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.anserini.index; import org.apache.lucene.index.IndexWriter; @@ -32,4 +48,3 @@ public static void main(String[] args) throws Exception { System.out.println("Done. Merged index at: " + outputDir); } } - diff --git a/src/test/java/io/anserini/index/MergeShardsTest.java b/src/test/java/io/anserini/index/MergeShardsTest.java index e86d349f1a..bddadcd4c4 100644 --- a/src/test/java/io/anserini/index/MergeShardsTest.java +++ b/src/test/java/io/anserini/index/MergeShardsTest.java @@ -43,10 +43,13 @@ import java.nio.file.Path; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class MergeShardsTest extends IndexerTestBase { private final static PrintStream standardOut = System.out; + private final static PrintStream standardErr = System.err; private final static ByteArrayOutputStream outputCaptor = new ByteArrayOutputStream(); + private final static ByteArrayOutputStream errorCaptor = new ByteArrayOutputStream(); private Path shardDir1; private Path shardDir2; @@ -61,6 +64,7 @@ public void setUp() throws Exception { mergedDir = createTempDir(); System.setOut(new PrintStream(outputCaptor)); + System.setErr(new PrintStream(errorCaptor)); buildShardIndex(shardDir1, "shard1-doc1", "shard1-doc2"); buildShardIndex(shardDir2, "shard2-doc1", "shard2-doc2"); @@ -70,6 +74,7 @@ public void setUp() throws Exception { @Override public void tearDown() throws Exception { System.setOut(standardOut); + System.setErr(standardErr); System.gc(); super.tearDown(); } @@ -124,10 +129,10 @@ public void testMergeShards() throws Exception { MergeShards.main(args); String output = outputCaptor.toString(); - assert(output.contains("Adding index: " + shardDir1.toString())); - assert(output.contains("Adding index: " + shardDir2.toString())); - assert(output.contains("Merging...")); - assert(output.contains("Done. Merged index at: " + mergedDir.toString())); + assertTrue(output.contains("Adding index: " + shardDir1.toString())); + assertTrue(output.contains("Adding index: " + shardDir2.toString())); + assertTrue(output.contains("Merging...")); + assertTrue(output.contains("Done. Merged index at: " + mergedDir.toString())); Directory dir = FSDirectory.open(mergedDir); IndexReader reader = DirectoryReader.open(dir); @@ -147,15 +152,19 @@ public void testMergeShards() throws Exception { } @Test - public void testMergeShardsInvalidArgs() throws Exception { + public void testValidateArguments() throws Exception { String[] args = new String[] { mergedDir.toString() }; + errorCaptor.reset(); outputCaptor.reset(); try { MergeShards.main(args); - } catch (SecurityException e) { + } catch (Exception e) { + // MergeShards might exit, we'll just catch any exception } + String errorOutput = errorCaptor.toString(); + assertTrue(errorOutput.contains("Usage: MergeShards [ ...]")); } -} \ No newline at end of file +} From eb53f0142d5d486d1fa36deb3c549bb4fd39d0dc Mon Sep 17 00:00:00 2001 From: J Mackenzie Date: Tue, 15 Apr 2025 13:50:19 +1000 Subject: [PATCH 4/4] Update MergeShards.java Haus stylin --- .../java/io/anserini/index/MergeShards.java | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/main/java/io/anserini/index/MergeShards.java b/src/main/java/io/anserini/index/MergeShards.java index 478bbbd0ae..2c7a765082 100644 --- a/src/main/java/io/anserini/index/MergeShards.java +++ b/src/main/java/io/anserini/index/MergeShards.java @@ -25,26 +25,27 @@ public class MergeShards { - public static void main(String[] args) throws Exception { - if (args.length < 2) { - System.err.println("Usage: MergeShards [ ...]"); - System.exit(1); - } - - String outputDir = args[0]; - FSDirectory mergedDir = FSDirectory.open(Paths.get(outputDir)); - IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); - IndexWriter writer = new IndexWriter(mergedDir, config); - - for (int i = 1; i < args.length; i++) { - System.out.println("Adding index: " + args[i]); - FSDirectory shardDir = FSDirectory.open(Paths.get(args[i])); - writer.addIndexes(shardDir); - } - - System.out.println("Merging..."); - writer.forceMerge(1); - writer.close(); - System.out.println("Done. Merged index at: " + outputDir); - } +public static void main(String[] args) throws Exception { + if (args.length < 2) { + System.err.println("Usage: MergeShards [ ...]"); + System.exit(1); + } + + String outputDir = args[0]; + FSDirectory mergedDir = FSDirectory.open(Paths.get(outputDir)); + IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer()); + IndexWriter writer = new IndexWriter(mergedDir, config); + + for (int i = 1; i < args.length; i++) { + System.out.println("Adding index: " + args[i]); + FSDirectory shardDir = FSDirectory.open(Paths.get(args[i])); + writer.addIndexes(shardDir); + } + + System.out.println("Merging..."); + writer.forceMerge(1); + writer.close(); + System.out.println("Done. Merged index at: " + outputDir); + } + }