Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions pyterrier_colbert/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,10 @@ class Object(object):

class CollectionEncoder_Generator(CollectionEncoder):

def __init__(self, *args, prepend_title=False):
def __init__(self, *args, prepend_title=False, skip_empty_docs=False):
    """Set up the encoder and remember the two per-document options.

    Positional arguments are passed through unchanged to the parent
    ``CollectionEncoder`` constructor.

    Args:
        *args: Forwarded as-is to ``CollectionEncoder.__init__``.
        prepend_title: Stored as ``self.prepend_title``.
            NOTE(review): presumably makes batch preprocessing prefix
            each passage with its title -- confirm against
            ``_preprocess_batch``.
        skip_empty_docs: Stored as ``self.skip_empty_docs``. Batch
            preprocessing reads this flag when it meets an empty or
            whitespace-only passage: a truthy value skips the passage,
            a falsy value raises ``ValueError``.
    """
    super().__init__(*args)
    self.skip_empty_docs = skip_empty_docs
    self.prepend_title = prepend_title

def _initialize_iterator(self):
    """Return the document source stored on ``self.args.generator``."""
    generator = self.args.generator
    return generator
Expand All @@ -245,12 +246,17 @@ def _preprocess_batch(self, offset, lines):

for line_idx, line in zip(range(offset, endpos), lines):
pid = line["docid"]
line_keys = line.keys()
if "text" not in line_keys and "body" in line_keys:
line["text"] = line["body"]
passage = line["text"]
if prepend_title:
title = line["title"]
passage = title + ' | ' + passage

if len(passage) == 0 or passage.isspace():
if self.skip_empty_docs:
continue
raise ValueError("There is an empty passage at %d. Aborting... " % line_idx )

batch.append(passage)
Expand All @@ -259,7 +265,7 @@ def _preprocess_batch(self, offset, lines):


class ColBERTIndexer(pt.Indexer):
def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=False, num_docs=None, ids=True, gpu=True, mask_punctuation=False):
def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=False, num_docs=None, ids=True, gpu=True, mask_punctuation=False, skip_empty_docs=False):
args = Object()
args.similarity = 'cosine'
args.dim = 128
Expand Down Expand Up @@ -287,6 +293,7 @@ def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=
self.prepend_title = prepend_title
self.num_docs = num_docs
self.gpu = gpu
self.skip_empty_docs = skip_empty_docs
if not gpu:
warn("Gpu disabled, YMMV")
import colbert.parameters
Expand Down Expand Up @@ -327,7 +334,7 @@ def convert_gen(iterator):
docid+=1
yield l
self.args.generator = convert_gen(iterator)
ceg = CollectionEncoderIds(self.args,0,1) if self.ids else CollectionEncoder_Generator(self.args,0,1)
ceg = CollectionEncoderIds(self.args,0,1) if self.ids else CollectionEncoder_Generator(self.args, 0, 1, skip_empty_docs=self.skip_empty_docs)

create_directory(self.args.index_root)
create_directory(self.args.index_path)
Expand Down
21 changes: 21 additions & 0 deletions tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,27 @@ def _indexing_1doc(self, indexmgr, model, dim=None):
# def test_indexing_1doc_half(self):
# self._indexing_1doc('half')

def test_indexing_skip_empty_docs(self):
    """Index a corpus containing blank documents with ``skip_empty_docs=True``.

    Takes the first 200 documents of the Vaswani corpus, inserts one empty
    and one whitespace-only document, indexes the result, and asserts that
    the returned factory reports exactly 200 documents, i.e. both blank
    documents were left out of the index.
    """
    import itertools
    import os

    import pyterrier as pt
    from pyterrier_colbert.indexing import ColBERTIndexer

    indexer = ColBERTIndexer(
        CHECKPOINT,
        os.path.dirname(self.test_dir),
        os.path.basename(self.test_dir),
        chunksize=3,
        gpu=False,
        skip_empty_docs=True,
    )

    corpus_iter = pt.get_dataset("vaswani").get_corpus_iter()
    docs = list(itertools.islice(corpus_iter, 200))
    # Guard the fixture: the final count is only meaningful if 200 real
    # documents were actually read from the corpus.
    self.assertEqual(200, len(docs))

    docs.insert(100, {'docno': 'empty', 'text': ''})  # truly empty
    docs.insert(105, {'docno': 'empty', 'text': ' '})  # whitespace only
    factory = indexer.index(docs)

    # NOTE(review): only the document count is verified here; whether the
    # remaining docnos stay aligned with their embeddings is not checked.
    self.assertEqual(200, len(factory))  # check that empty docs are indeed ignored



def indexing_docnos_correctly_empty(self):
#A test case to see whether empty passages are handled correctly.
import pyterrier as pt
Expand Down