diff --git a/src/main/java/io/anserini/collection/ClueWeb12Collection.java b/src/main/java/io/anserini/collection/ClueWeb12Collection.java index 65a51ef6c5..8d9b165bda 100644 --- a/src/main/java/io/anserini/collection/ClueWeb12Collection.java +++ b/src/main/java/io/anserini/collection/ClueWeb12Collection.java @@ -16,7 +16,6 @@ package io.anserini.collection; -import org.apache.commons.io.input.ReaderInputStream; import org.apache.logging.log4j.LogManager; import java.io.BufferedReader; diff --git a/src/main/java/io/anserini/collection/ClueWeb22Collection.java b/src/main/java/io/anserini/collection/ClueWeb22Collection.java new file mode 100644 index 0000000000..4092d813e2 --- /dev/null +++ b/src/main/java/io/anserini/collection/ClueWeb22Collection.java @@ -0,0 +1,165 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package io.anserini.collection;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.MappingIterator;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * An instance of the ClueWeb22-B collection.
+ *
+ * <p>Each segment is consumed as a sequence of JSON values; every JSON object becomes one
+ * {@link Document}. The {@code ClueWeb22-ID} field supplies the document id and the
+ * {@code Clean-Text} field supplies the contents.
+ *
+ * <p>NOTE(review): {@code DocumentCollection} and {@code FileSegment} are used as raw types
+ * here. If they are generic in this code base, they should be parameterized with
+ * {@link Document} - confirm against their declarations.
+ */
+public class ClueWeb22Collection extends DocumentCollection {
+  private static final Logger LOG = LogManager.getLogger(ClueWeb22Collection.class);
+
+  /**
+   * Creates a collection over the given path and sets the allowed file suffix to
+   * {@code .json.gz}.
+   *
+   * @param path location of the collection
+   */
+  public ClueWeb22Collection(Path path) {
+    this.path = path;
+    this.allowedFileSuffix = Set.of(".json.gz");
+  }
+
+  /**
+   * Creates a collection without a path and without touching the allowed file suffix.
+   */
+  public ClueWeb22Collection() {
+  }
+
+  /**
+   * Opens the segment stored in the given file.
+   *
+   * @param p path of the segment file
+   * @return a segment positioned on its first JSON value, if any
+   * @throws IOException if the file cannot be opened or read
+   */
+  @Override
+  public FileSegment createFileSegment(Path p) throws IOException {
+    return new Segment(p);
+  }
+
+  /**
+   * Wraps an already opened reader as a segment.
+   *
+   * @param bufferedReader reader supplying the JSON values
+   * @return a segment positioned on its first JSON value, if any
+   * @throws IOException if the reader cannot be read
+   */
+  @Override
+  public FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException {
+    return new Segment(bufferedReader);
+  }
+
+  /**
+   * A file segment in the ClueWeb22-B collection.
+   *
+   * <p>The segment keeps one JSON value of look-ahead in {@code node}: the constructors read
+   * the first value, and every {@link #readNext()} call emits the held value and fetches the
+   * following one.
+   */
+  public static class Segment extends FileSegment {
+    // Shared by all segments; per the Jackson documentation a configured ObjectMapper is
+    // thread-safe, so there is no need to build one per file.
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private JsonNode node = null;
+    private MappingIterator<JsonNode> iterator;
+
+    /**
+     * Opens the segment file, transparently decompressing it when its name ends in
+     * {@code .gz}. The content is always decoded as UTF-8, for compressed and uncompressed
+     * files alike, instead of depending on the platform default charset.
+     *
+     * @param path path of the segment file
+     * @throws IOException if the file cannot be opened or read
+     */
+    public Segment(Path path) throws IOException {
+      super(path);
+
+      InputStream stream = openStream(path);
+      bufferedReader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
+      primeIterator(bufferedReader);
+    }
+
+    /**
+     * Reads the segment from an already opened reader.
+     *
+     * @param bufferedReader reader supplying the JSON values
+     * @throws IOException if the reader cannot be read
+     */
+    public Segment(BufferedReader bufferedReader) throws IOException {
+      super(bufferedReader);
+      primeIterator(bufferedReader);
+    }
+
+    /**
+     * Opens the raw byte stream for a segment, adding gzip decompression for {@code .gz}
+     * files. The underlying file stream is closed again if the gzip header cannot be read,
+     * so a corrupt file does not leak a file handle.
+     */
+    private InputStream openStream(Path path) throws IOException {
+      InputStream fileStream = Files.newInputStream(path, StandardOpenOption.READ);
+      if (!path.toString().endsWith(".gz")) {
+        return fileStream;
+      }
+
+      try {
+        return new GZIPInputStream(fileStream, BUFFER_SIZE);
+      } catch (IOException e) {
+        try {
+          fileStream.close();
+        } catch (IOException closeFailure) {
+          e.addSuppressed(closeFailure);
+        }
+        throw e;
+      }
+    }
+
+    /**
+     * Creates the JSON value iterator over the reader and loads the first value, if any.
+     */
+    private void primeIterator(BufferedReader reader) throws IOException {
+      iterator = MAPPER.readerFor(JsonNode.class).readValues(reader);
+      if (iterator.hasNext()) {
+        node = iterator.next();
+      }
+    }
+
+    /**
+     * Emits the held JSON object as the buffered record and advances the look-ahead. When
+     * no further value exists, {@code atEOF} is set instead.
+     *
+     * @throws NoSuchElementException if the segment held no JSON value at all, or if the
+     *     held value is not a JSON object
+     */
+    @Override
+    public void readNext() throws NoSuchElementException {
+      if (node == null) {
+        throw new NoSuchElementException("JsonNode is empty");
+      } else if (node.isObject()) {
+        bufferedRecord = new Document(node);
+        if (iterator.hasNext()) {
+          node = iterator.next();
+        } else {
+          atEOF = true;
+        }
+      } else {
+        LOG.error("Error: invalid JsonNode type");
+        throw new NoSuchElementException("Invalid JsonNode type");
+      }
+    }
+  }
+
+  /**
+   * A document in the ClueWeb22-B collection.
+   *
+   * <p>Built from one JSON object: {@code ClueWeb22-ID} becomes the id, {@code Clean-Text}
+   * becomes the contents, and every other field is kept as text in {@code fields}.
+   */
+  public static class Document implements SourceDocument {
+    private String id;
+    private String contents;
+    private String raw;
+    private Map<String, String> fields;
+
+    /**
+     * Extracts id, contents and the remaining fields from the given JSON object.
+     *
+     * @param json JSON object describing one document
+     */
+    public Document(JsonNode json) {
+      this.raw = json.toString();
+      this.fields = new HashMap<>();
+
+      json.fields().forEachRemaining(e -> {
+        String key = e.getKey();
+        // The entry already carries the value, so no second lookup by key is needed.
+        String value = e.getValue().asText();
+        if ("ClueWeb22-ID".equals(key)) {
+          this.id = value;
+        } else if ("Clean-Text".equals(key)) {
+          this.contents = value;
+        } else {
+          this.fields.put(key, value);
+        }
+      });
+    }
+
+    /**
+     * Returns the document id.
+     *
+     * @throws RuntimeException if the JSON object had no {@code ClueWeb22-ID} field
+     */
+    @Override
+    public String id() {
+      if (id == null) {
+        throw new RuntimeException("Document does not have the required \"ClueWeb22-ID\" field!");
+      }
+      return id;
+    }
+
+    /**
+     * Returns the document text.
+     *
+     * @throws RuntimeException if the JSON object had no {@code Clean-Text} field
+     */
+    @Override
+    public String contents() {
+      if (contents == null) {
+        throw new RuntimeException("Document does not have the required \"Clean-Text\" field!");
+      }
+      return contents;
+    }
+
+    /**
+     * Returns the string form of the JSON object this document was built from.
+     */
+    @Override
+    public String raw() {
+      return raw;
+    }
+
+    /**
+     * Always reports the document as indexable.
+     */
+    @Override
+    public boolean indexable() {
+      return true;
+    }
+  }
+}
diff --git a/src/test/java/io/anserini/collection/ClueWeb22CollectionTest.java b/src/test/java/io/anserini/collection/ClueWeb22CollectionTest.java
new file mode 100644
index 0000000000..d4dbeb23ff
--- /dev/null
+++ b/src/test/java/io/anserini/collection/ClueWeb22CollectionTest.java
@@ -0,0 +1,53 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import org.junit.Before;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+/**
+ * Runs the shared document-collection checks against the bundled ClueWeb22 sample segment.
+ */
+public class ClueWeb22CollectionTest extends DocumentCollectionTest {
+
+  /**
+   * Registers the sample collection, its single segment, and the two documents that the
+   * segment is expected to yield.
+   */
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+
+    collectionPath = Paths.get("src/test/resources/sample_docs/clueweb22/");
+    collection = new ClueWeb22Collection(collectionPath);
+
+    // One segment holding two documents.
+    Path sampleSegment = Paths.get("src/test/resources/sample_docs/clueweb22/txt/en/en00/en0000/en0000-30.json.gz");
+    segmentPaths.add(sampleSegment);
+    segmentDocCounts.put(sampleSegment, 2);
+    totalSegments = 1;
+    totalDocs = 2;
+
+    // Each expected entry maps a docid to the properties checked in checkDocument.
+    String[] docids = {"clueweb22-en0000-30-00000", "clueweb22-en0000-30-00001"};
+    for (String docid : docids) {
+      expected.put(docid, Map.of("id", docid));
+    }
+  }
+
+  /**
+   * Verifies that the document is indexable, carries the expected id, and has non-empty
+   * contents.
+   *
+   * <p>NOTE(review): the {@code expected} parameter is a raw {@code Map}; it is kept that
+   * way so the signature stays identical - confirm against the base class declaration.
+   */
+  @Override
+  void checkDocument(SourceDocument doc, Map expected) {
+    assertTrue(doc.indexable());
+    assertEquals(expected.get("id"), doc.id());
+
+    String text = doc.contents();
+    assertNotNull(text);
+    assertTrue(!text.isEmpty());
+  }
+}
diff --git a/src/test/resources/sample_docs/clueweb22/txt/en/en00/en0000/en0000-30.json.gz b/src/test/resources/sample_docs/clueweb22/txt/en/en00/en0000/en0000-30.json.gz
new file mode 100644
index 0000000000..87824fa8e7
Binary files /dev/null and b/src/test/resources/sample_docs/clueweb22/txt/en/en00/en0000/en0000-30.json.gz differ