diff --git a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 362751dc87..cbd3fb23e3 100644 --- a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-arguana.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-arguana.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m 
ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 644b13c46f..65cff68fda 100644 --- a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes 
regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-arguana.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-arguana.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-arguana.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-arguana.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-arguana.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-arguana.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-arguana.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-arguana.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-arguana.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-arguana.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-arguana.test.txt runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-arguana.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.md index 1fe094fde3..2ad8e6e910 100644 --- a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-arguana.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-arguana.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.md index c426fce647..680a395407 100644 --- a/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-arguana.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-arguana.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-arguana.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-arguana.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-arguana.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.md 
b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 20067f2f78..135571e765 100644 --- a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-bioasq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ + -M 32 -efC 1000 -quantize.sqv \ >& logs/log.beir-v1.0.0-bioasq.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 2000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 16000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 2e6e461ae5..466a1824ae 100644 --- a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into 
Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-bioasq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ - -M 16 -efC 500 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ + -M 32 -efC 1000 -quantize.sqv \ >& logs/log.beir-v1.0.0-bioasq.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-bioasq.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-bioasq.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-bioasq.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 16000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-bioasq.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-bioasq.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-bioasq.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-bioasq.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-bioasq.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-bioasq.test.txt runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-bioasq.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.md index f598ade05d..4e88b2d4f1 100644 --- a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-bioasq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 32 -efC 1000 \ >& logs/log.beir-v1.0.0-bioasq.bge-base-en-v1.5 & ``` @@ -56,7 
+56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 2000 -removeQuery -threads 16 & + -hits 1000 -efSearch 16000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.md index b7b531b0c6..eadb4165d3 100644 --- a/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-bioasq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-bioasq.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 32 -efC 1000 \ >& logs/log.beir-v1.0.0-bioasq.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-bioasq.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-bioasq.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-bioasq.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 16000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.md 
b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 7cb795c0b5..abfb77fd72 100644 --- a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-climate-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-climate-fever.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & 
``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 99765e41b8..00d2caa64e 100644 --- 
a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-climate-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-climate-fever.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-climate-fever.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-climate-fever.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-climate-fever.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using 
`trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-climate-fever.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-climate-fever.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-climate-fever.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-climate-fever.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-climate-fever.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-climate-fever.test.txt runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-climate-fever.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.md index 6571b90f03..dd9a22f111 100644 --- a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-climate-fever.bge-base-en-v1.5 \ 
-generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-climate-fever.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-climate-fever.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md index 6e8bfad18a..1b9e30e541 100644 --- a/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-climate-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-climate-fever.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-climate-fever.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-climate-fever.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-climate-fever.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & 
+ -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 39a6753723..444a052368 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.gz \ -topicReader 
JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt 
runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index b5cad67846..c2d0a33498 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-android.test.tsv.gz \ -topicReader TsvString \ - -output 
runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-android.test.txt runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.md 
b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.md index 320ef96ba6..cc0479d376 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-android.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.md index 970e923468..80277beae7 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection 
ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-android.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-android.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 02a0942e25..d9e1786486 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ + -M 
16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt 
runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 258f7517ca..210c8e0f18 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ + -M 16 -efC 
500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-english.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt +bin/trec_eval -c -m recall.100 
tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-english.test.txt runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.md index cb08efb1cc..7c8f878273 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-english.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.md index 0d9ffa2de8..f0980b9793 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-english.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-english.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 19369227f0..cb299b7d38 100644 --- 
a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output 
runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md 
similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index f54a5ab473..789737e5c3 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gaming.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 
-removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gaming.test.txt runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.md index 6e5b8840d8..b73f941bd0 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh 
io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-gaming.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.md index f65c70b632..807343206c 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics 
tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gaming.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-gaming.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 4818114674..629a36cc78 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ + -index 
indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt 
runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 00f5594049..127d060e86 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gis.test.tsv.gz \ -topicReader TsvString \ - -output 
runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-gis.test.txt runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.md index 
61df552b09..e3d7c2bd0d 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-gis.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.md index 360f53390e..915326409f 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index 
indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-gis.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-gis.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 81% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 72747bbb3b..e83159ed6d 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index 
indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 4c12032cb1..f472b35e42 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index 
indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-mathematica.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt 
runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.md index 298ec42a39..2795c6c963 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-mathematica.test.bge-base-en-v1.5.jsonl.txt \ - -hits 
1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.md index 6891001d15..c5fdb70d57 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-mathematica.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-mathematica.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from 
docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 911cd23ca5..50f9584ba1 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output 
runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md 
b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index f250cb8011..918d2e1026 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-physics.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt \ + -encoder BgeBaseEn15 -hits 1000 
-efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-physics.test.txt runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.md index 5ec35237ff..440e4701cc 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, 
building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-physics.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.md index 3cc08d8689..ec5c0ec4b6 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ 
-topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-physics.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-physics.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 81% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 1fd6cd0165..198b4aaa1e 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index 
indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt 
runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 919eec3b0b..10df581d0c 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index 
indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-programmers.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt +bin/trec_eval -c -m 
recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-programmers.test.txt runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.md index dc22689655..8fe8087300 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-programmers.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.md index 
adcc64c870..c6e332ff34 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-programmers.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-programmers.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index fb4eaeabd0..e4dfbcf784 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ 
b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt \ + 
-hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to 
docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index a03bfd5fb8..2240d0889b 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-stats.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery 
-threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-stats.test.txt runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.md index aa66e10b2d..c0086a6369 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors 
\ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-stats.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.md index 473418e14f..5fcd8bb898 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-stats.test.tsv.gz \ -topicReader TsvString \ 
-output runs/run.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-stats.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 8d1b0d9943..51b460de4d 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output 
runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git 
a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 7bf633a1d9..3fc16028f2 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-tex.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` 
Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-tex.test.txt runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.md index 6f55949665..8be1fcd321 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection 
ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-tex.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.md index f7cb2e48bd..1a7340f832 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-tex.test.tsv.gz \ -topicReader TsvString \ -output 
runs/run.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-tex.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 14c346fbf0..9e6c05e87a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output 
runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git 
a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index e6b4a5f586..a3dfd72349 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-unix.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 
& ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-unix.test.txt runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.md index 7b499f6b85..12032b42c7 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ 
-collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-unix.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.md index 4ca724df32..83e4d9e993 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-unix.test.tsv.gz \ -topicReader TsvString \ -output 
runs/run.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-unix.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 53a9b49879..e29b8dbf70 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index 
indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt 
runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index c00f06d577..444896362f 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ 
+ -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-webmasters.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt 
runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.md index 26ca42dfc4..e0c47c6646 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-webmasters.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.md index 590e57a51e..0ea64663f8 100644 --- 
a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-webmasters.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-webmasters.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index a136fd35aa..4636a057da 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 
+8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output 
runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md 
b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index b56805ab8a..0ae1b6c60b 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-wordpress.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt \ + 
-encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.md index cbe7560203..0ba4f8b57f 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.md +++ 
b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-cqadupstack-wordpress.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.md index 6e53101624..f8ee04c2f1 100644 --- a/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ \ - -M 16 
-efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-cqadupstack-wordpress.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-cqadupstack-wordpress.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 37352bd3ad..33ea1c4b40 100644 --- a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& 
logs/log.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 
tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index ef08264f67..91ab29fbf7 100644 --- a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ + -index 
indexes/lucene-hnsw-sqv.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-dbpedia-entity.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-dbpedia-entity.test.txt runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt ``` ## Effectiveness diff --git 
a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.md index 885e7ffae9..534b4f29a4 100644 --- a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-dbpedia-entity.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.md index b0e91184d3..2d5bcfa915 100644 --- a/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 
\ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-dbpedia-entity.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-dbpedia-entity.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 660bd4a9d7..61fe5c5dc2 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fever.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-fever.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After 
indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fever.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fever.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt 
runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 3eadb9de1f..6bdf1af101 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fever.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-fever.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fever.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fever.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fever.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fever.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 
-removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fever.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fever.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fever.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fever.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fever.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fever.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fever.test.txt runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fever.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.md index d271ec837d..8b58216a78 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection 
ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-fever.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-fever.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md index 427dfbde41..4fdab20db2 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fever.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-fever.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-fever.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fever.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-fever.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-fever.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 
-efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index abd8acea8e..b94a8f3afd 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fiqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-fiqa.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 6245ec79f7..ea7f4cb4d7 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are 
using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fiqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-fiqa.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fiqa.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fiqa.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fiqa.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt 
runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fiqa.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fiqa.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-fiqa.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fiqa.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fiqa.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-fiqa.test.txt runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-fiqa.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.md index 96bbb7a179..98608430e3 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fiqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-fiqa.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics 
tools/topics-and-qrels/topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-fiqa.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.md index b0e269181e..c68eeb2d61 100644 --- a/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-fiqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-fiqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-fiqa.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-fiqa.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-fiqa.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-fiqa.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from 
docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 8f77692422..5c8ec20c23 100644 --- a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-hotpotqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-hotpotqa.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` 
-bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index f1b3f5eae0..ffa6c4114a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ 
-8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-hotpotqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-hotpotqa.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-hotpotqa.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-hotpotqa.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-hotpotqa.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-hotpotqa.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-hotpotqa.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-hotpotqa.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-hotpotqa.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-hotpotqa.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-hotpotqa.test.txt runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-hotpotqa.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.md index 48a6103ee2..eeac0ab179 100644 --- a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-hotpotqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-hotpotqa.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-hotpotqa.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.md index b87cae5a23..c83a577122 100644 --- a/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-hotpotqa.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-hotpotqa.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-hotpotqa.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-hotpotqa.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-hotpotqa.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 1ecdc7ee81..1abcb5fd79 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nfcorpus.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-nfcorpus.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` 
-bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 4016e00137..ed49eef320 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ 
-8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nfcorpus.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-nfcorpus.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nfcorpus.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nfcorpus.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nfcorpus.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nfcorpus.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nfcorpus.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nfcorpus.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nfcorpus.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nfcorpus.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nfcorpus.test.txt runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nfcorpus.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.md index 0443b424cf..42d9321d9a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nfcorpus.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-nfcorpus.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-nfcorpus.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.md index 5f3c038cc5..74a7c2dc46 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nfcorpus.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-nfcorpus.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nfcorpus.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-nfcorpus.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-nfcorpus.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index aa3c021654..9368a1f1d5 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nq.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nq.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-nq.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nq.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nq.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt 
runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index f980f69909..309fb1987a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. 
-The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nq.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nq.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-nq.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-nq.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-nq.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nq.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nq.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nq.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt 
runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nq.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nq.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-nq.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nq.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nq.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-nq.test.txt runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-nq.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.md index d22b273464..dfc3708a83 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-nq.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-nq.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.gz \ 
-topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-nq.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.md index 6a12b31b85..89faff088a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-nq.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-nq.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-nq.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-nq.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-nq.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-nq.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to 
docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 8d5c9c1308..7228e93360 100644 --- a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-quora.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-quora.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-quora.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-quora.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-quora.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-quora.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index a36da970ea..20d5751a34 100644 --- a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these 
experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-quora.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-quora.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-quora.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-quora.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-quora.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-quora.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-quora.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-quora.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-quora.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-quora.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-quora.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-quora.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-quora.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-quora.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-quora.test.txt runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-quora.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.md index f5b2439a94..428cbbae67 100644 --- a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-quora.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-quora.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-quora.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh 
io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-quora.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.md index cb20e9ba02..dd4a1f5296 100644 --- a/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-quora.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-quora.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-quora.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-quora.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-quora.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-quora.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 
84% rename from docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index ca85cc2a72..118cc8800f 100644 --- a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-robust04.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-robust04.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` 
-bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index f79295a6d2..b5574d4a35 100644 --- a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ 
-8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-robust04.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-robust04.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-robust04.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-robust04.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-robust04.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-robust04.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-robust04.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-robust04.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-robust04.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-robust04.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-robust04.test.txt runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-robust04.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.md index a5dbff5517..32ec24efef 100644 --- a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-robust04.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-robust04.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-robust04.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.md index d26f24c7d9..8d6389813f 100644 --- a/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-robust04.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-robust04.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-robust04.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-robust04.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-robust04.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-robust04.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index bb170e4d6d..bed219f060 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scidocs.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-scidocs.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m 
ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index f1d88e8521..8a2214ab53 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes 
regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scidocs.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-scidocs.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scidocs.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scidocs.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scidocs.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scidocs.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scidocs.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scidocs.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scidocs.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scidocs.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scidocs.test.txt runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scidocs.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.md index 9e92011e16..5bb0d944f6 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scidocs.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-scidocs.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-scidocs.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.md index 49875f9a5e..4f77d4490d 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scidocs.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-scidocs.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-scidocs.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scidocs.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-scidocs.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-scidocs.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.md 
b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 01b06abf61..ddcea4c2f0 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scifact.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-scifact.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m 
ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index cce354a71e..0103ef4328 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes 
regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scifact.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-scifact.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scifact.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scifact.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scifact.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scifact.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scifact.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-scifact.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scifact.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scifact.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-scifact.test.txt runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-scifact.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.md index 5c4e9abb50..43b474ed2e 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scifact.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-scifact.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-scifact.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.md index bc6dc43d19..6f734bec7a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-scifact.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-scifact.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-scifact.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-scifact.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-scifact.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-scifact.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.md 
b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 9b0f6174b7..07a43b7686 100644 --- a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-signal1m.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-signal1m.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` 
-bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index b96df3db77..ce25630bae 100644 --- a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ 
-8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-signal1m.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-signal1m.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-signal1m.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-signal1m.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-signal1m.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-signal1m.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-signal1m.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-signal1m.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-signal1m.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-signal1m.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-signal1m.test.txt runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-signal1m.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.md index d5f17061a4..6be4225ef0 100644 --- a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-signal1m.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& 
logs/log.beir-v1.0.0-signal1m.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-signal1m.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.md index 16a2a7c424..a687cd1c34 100644 --- a/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-signal1m.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-signal1m.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-signal1m.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-signal1m.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-signal1m.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-signal1m.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index c95e8505fb..d3f6f69042 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-covid.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-trec-covid.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using 
`trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 8a8207610a..3df78da59a 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ 
b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-covid.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-trec-covid.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-covid.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-covid.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-covid.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m 
ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-covid.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-covid.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-covid.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-covid.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-covid.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-covid.test.txt runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-covid.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.md index 25fbac5969..073c88ff83 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-covid.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ 
\ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-trec-covid.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-trec-covid.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.md index bb8d8b2e0a..92807a6cef 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-covid.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-trec-covid.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-covid.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-trec-covid.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-trec-covid.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 84% rename from docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 156befafe2..ea20ff07a4 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-news.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-trec-news.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: 
``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 85% rename from docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index a217a2b683..9242eb58af 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ 
b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-news.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-trec-news.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-news.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-news.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-news.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 
tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-news.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-news.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-trec-news.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-news.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-news.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-trec-news.test.txt runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-trec-news.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.md index 916488177f..e162fc5644 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-news.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 
1000 \ >& logs/log.beir-v1.0.0-trec-news.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-trec-news.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.md index 16bae5101e..09fe72939c 100644 --- a/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-trec-news.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-trec-news.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 1000 \ >& logs/log.beir-v1.0.0-trec-news.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-trec-news.test.tsv.gz \ -topicReader TsvString \ -output runs/run.beir-v1.0.0-trec-news.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-trec-news.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index ce5e68a1c7..06f7e7c63d 100644 --- a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ - -output 
runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness diff --git 
a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 83% rename from docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index e78d3baee7..ef61f0617c 100644 --- a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using ONNX to perform query encoding on the fly. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ``` -python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` All the BEIR corpora, encoded by the BGE-base-en-v1.5 model and stored in Parquet format, are available for download: @@ -33,12 +33,12 @@ Sample indexing command, building quantized HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 & ``` @@ -52,19 +52,19 @@ After indexing has completed, you should be able to perform retrieval as follows ``` bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-webis-touche2020.test.tsv.gz \ -topicReader TsvString \ - -output runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -output runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 
& ``` Evaluation can be performed using `trec_eval`: ``` -bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-int8-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt +bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.beir-v1.0.0-webis-touche2020.test.txt runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-sqv-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt ``` ## Effectiveness diff --git a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.md index ea83e7da13..efcce957d9 100644 --- a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ 
-collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonStringVector \ -output runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-cached.topics.beir-v1.0.0-webis-touche2020.test.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.md index 349a68bf62..9438cb5725 100644 --- a/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -33,12 +33,12 @@ Sample indexing command, building HNSW indexes: ``` bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 & ``` @@ -56,7 +56,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.beir-v1.0.0-webis-touche2020.test.tsv.gz \ -topicReader TsvString \ -output 
runs/run.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.bge-hnsw-onnx.topics.beir-v1.0.0-webis-touche2020.test.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 86% rename from docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index f13000be6c..9dd361c51d 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -11,13 +11,13 @@ In these experiments, we are using cached queries (i.e., cached results of query Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 26 GB and has MD5 With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 
tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 87% rename from docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index c1de53fffe..2c4158e2f5 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -11,13 +11,13 @@ In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 26 GB and has MD5 With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx \ --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.dl19-passage.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl19-passage.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt 
runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl19-passage.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md index 946ad9ed39..93e805b81a 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md index 49d1e8d2a7..bbe34d91d6 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ -index 
indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl19-passage.txt \ -topicReader TsvInt \ -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl19-passage.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md similarity index 85% rename from docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md index 0a6379ab89..325b45e0bf 100644 --- a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md @@ -6,13 +6,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. @@ -20,7 +20,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -38,7 +38,7 @@ To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet ``` @@ -48,12 +48,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & ``` @@ -70,20 +70,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ \ -topics tools/topics-and-qrels/topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output 
runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 
tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt ``` ## Effectiveness @@ -110,4 +110,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md index ecf721ddbf..d839bf6318 100644 --- a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md @@ -48,12 +48,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & ``` @@ -74,7 +74,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz \ -topicReader JsonIntVector \ -output 
runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md similarity index 86% rename from docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md index 28d7295501..22272167bf 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md @@ -11,13 +11,13 @@ In these experiments, we are using cached queries (i.e., cached results of query Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 26 GB and has MD5 ch With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 
tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md similarity index 87% rename from docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md index ab46eeed1e..1f629e5c8f 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md @@ -11,13 +11,13 @@ In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 26 GB and has MD5 ch With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx \ --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -75,11 +75,11 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl19-passage.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt \ - -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl19-passage.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 & ``` Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
@@ -87,10 +87,10 @@ Note that we are performing query inference "on-the-fly" with ONNX in these expe Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl19-passage.txt ``` ## Effectiveness @@ -117,4 +117,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md index 9e7751037d..afa65f1ad5 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md index 4e2d9e2644..095652d2bc 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh 
io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl19-passage.txt \ -topicReader TsvInt \ -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl19-passage.txt \ - -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & + -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 & ``` Note that we are performing query inference "on-the-fly" with ONNX in these experiments. diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.md similarity index 86% rename from docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.md index cb84ee1604..88be4000b8 100644 --- a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.md @@ -11,13 +11,13 @@ In these experiments, we are using cached queries (i.e., cached results of query Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). 
-The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
@@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. @@ -43,7 +43,7 @@ To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 51 GB and has MD5 check With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-openai-ada2.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-openai-ada2.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-openai-ada2.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index 
indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ \ -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl19-passage.openai-ada2.jsonl.txt 
+bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl19-passage.openai-ada2.jsonl.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md index e7437763e9..3d792bd5fe 100644 --- a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-openai-ada2.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-openai-ada2.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` 
Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 86% rename from docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index c8b4274820..35610a5338 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -11,13 +11,13 @@ In these experiments, we are using cached queries (i.e., cached results of query Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 26 GB and has MD5 With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.dl20.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt 
runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 88% rename from docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 809d39df2e..7e009b9449 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -11,13 +11,13 @@ In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 26 GB and has MD5 With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx \ --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.dl20.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl20.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt 
runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.dl20.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md index 8cd2740d93..08c2ea1c57 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl20.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md index dabc9a3b7e..8967ab52af 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ -index 
indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl20.txt \ -topicReader TsvInt \ -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl20.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md similarity index 85% rename from docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md index f6fd40a708..1762f6a63f 100644 --- a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md @@ -6,13 +6,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. @@ -20,7 +20,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -38,7 +38,7 @@ To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet ``` @@ -48,12 +48,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & ``` @@ -70,20 +70,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ \ -topics tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \ + -hits 
1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt ``` ## Effectiveness @@ -110,4 +110,4 @@ The experimental results 
reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md index bd5f295645..1e5e808f20 100644 --- a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md @@ -48,12 +48,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & ``` @@ -74,7 +74,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md similarity index 86% rename from docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md index 42f2e2e08a..8612b918fa 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md @@ -11,13 +11,13 @@ In these experiments, we are using cached queries (i.e., cached results of query Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 26 GB and has MD5 ch With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt 
runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.dl20.cos-dpr-distil.jsonl.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md similarity index 87% rename from docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md index b2611f8d77..eb304a9746 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md @@ -11,13 +11,13 @@ In these experiments, we are performing query inference "on-the-fly" with ONNX. Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 26 GB and has MD5 ch With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx \ --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -75,11 +75,11 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl20.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt \ - -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl20.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 & ``` Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
@@ -87,10 +87,10 @@ Note that we are performing query inference "on-the-fly" with ONNX in these expe Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.dl20.txt ``` ## Effectiveness @@ -117,4 +117,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md index 45ffdf23f9..04b1818b66 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md index e3d392c752..053575c3da 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 
16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl20.txt \ -topicReader TsvInt \ -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl20.txt \ - -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & + -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 & ``` Note that we are performing query inference "on-the-fly" with ONNX in these experiments. diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.md similarity index 87% rename from docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.md index f87a327b92..7e3392d340 100644 --- a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.md @@ -11,13 +11,13 @@ In these experiments, we are using cached queries (i.e., cached results of query Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
@@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. @@ -43,7 +43,7 @@ To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 51 GB and has MD5 check With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-openai-ada2.parquet ``` @@ -53,12 +53,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-openai-ada2.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-openai-ada2.parquet & ``` @@ -75,20 +75,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index 
indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ \ -topics tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl20.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt -bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt -bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt -bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt 
runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.dl20.openai-ada2.jsonl.txt ``` ## Effectiveness @@ -115,4 +115,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md index 6135d95375..9c1aecdec4 100644 --- a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md @@ -53,12 +53,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-openai-ada2.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-openai-ada2.parquet & ``` @@ -79,7 +79,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl20.openai-ada2.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git 
a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md similarity index 85% rename from docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md index 9e471c17e4..c50dc95694 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. @@ -22,7 +22,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -40,7 +40,7 @@ To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 26 GB and has MD5 With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet ``` @@ -50,12 +50,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -71,20 +71,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash 
-bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt ``` ## Effectiveness @@ -107,4 +107,4 @@ Note that both HNSW indexing and quantization are non-deterministic (i.e., resul ## Reproduction Log[*](../../docs/reproducibility.md) -To add to this reproduction log, 
modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md similarity index 86% rename from docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md index 48486fae3d..a34ae38d10 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are performing query inference "on-the-fly" with ONNX. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. @@ -22,7 +22,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -40,7 +40,7 @@ To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 26 GB and has MD5 With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx \ --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet ``` @@ -50,12 +50,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -71,20 +71,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -c -m map 
tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt -bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt ``` ## Effectiveness @@ -107,4 +107,4 @@ Note that both HNSW indexing and quantization are non-deterministic (i.e., resul ## Reproduction Log[*](../../docs/reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md index e8ad4fa4ff..30033360d9 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -50,12 +50,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -75,7 +75,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md index fcfbd64137..5539b7d575 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -50,12 +50,12 @@ 
Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & ``` @@ -75,7 +75,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicReader TsvInt \ -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.msmarco-passage.dev-subset.txt \ - -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & + -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md similarity index 82% rename from docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md index 1cbdf50a76..c27dd63269 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.md @@ -6,13 +6,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. 
@@ -20,7 +20,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. @@ -38,7 +38,7 @@ To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet ``` @@ -48,12 +48,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & ``` @@ -69,20 +69,20 @@ After indexing has completed, 
you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m map 
tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt ``` ## Effectiveness @@ -105,4 +105,4 @@ Note that both HNSW indexing and quantization are non-deterministic (i.e., resul ## Reproduction Log[*](../../docs/reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md index e66525a038..544194f275 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md @@ -48,12 +48,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & ``` @@ -73,7 +73,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md similarity index 83% rename from docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md rename to docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md index 27639e8b5a..2b54d97bf0 100644 --- 
a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using cached queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
@@ -22,7 +22,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. @@ -40,7 +40,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 26 GB and has MD5 ch With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached \ --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet ``` @@ -50,12 +50,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -71,20 +71,20 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh 
io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ -topicReader JsonIntVector \ - -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt -bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt 
runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt ``` ## Effectiveness @@ -107,6 +107,6 @@ Note that both HNSW indexing and quantization are non-deterministic (i.e., resul ## Reproduction Log[*](../../docs/reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md similarity index 84% rename from docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md rename to docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md index b81bcf4ec1..83c79b9f8a 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are performing query inference "on-the-fly" with ONNX. -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -22,7 +22,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -40,7 +40,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 26 GB and has MD5 ch With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx \ --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet ``` @@ -50,12 +50,12 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 -quantize.int8 \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 500 -quantize.sqv \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -71,11 +71,11 @@ After indexing has completed, you should be able to perform retrieval as follows ```bash bin/run.sh io.anserini.search.SearchHnswDenseVectors \ - -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicReader TsvInt \ - -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt \ - -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 & ``` Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
@@ -83,10 +83,10 @@ Note that we are performing query inference "on-the-fly" with ONNX in these expe Evaluation can be performed using `trec_eval`: ```bash -bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt -bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt -bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt -bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-sqv-onnx.topics.msmarco-passage.dev-subset.txt ``` ## Effectiveness @@ -109,4 +109,4 @@ Note that both HNSW indexing and quantization are non-deterministic (i.e., resul ## Reproduction Log[*](../../docs/reproducibility.md) -To add 
to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md index 5319c01fe5..519e18fc56 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md @@ -50,12 +50,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -75,7 +75,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ -topicReader JsonIntVector \ -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md index 2b9a89c89d..73e8a4b2aa 100644 --- 
a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md @@ -50,12 +50,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-cos-dpr-distil.parquet & ``` @@ -75,7 +75,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicReader TsvInt \ -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.msmarco-passage.dev-subset.txt \ - -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & + -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 & ``` Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md index e7d1c00598..9f8fb454e7 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md @@ -50,7 +50,7 @@ Sample indexing command, building quantized HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-openai-ada2.parquet \ -generator DenseVectorDocumentGenerator \ diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.md new file mode 100644 index 0000000000..71f49ace33 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.md @@ -0,0 +1,111 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 51 GB and has MD5 checksum `a8fddf594c9b8e771637968033b12f6d`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 4 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator DenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 500 -quantize.sqv \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 2000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-sqv-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|----------------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | +| **RR@10** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| **R@100** | 
**OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.900 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.986 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md index 6cea06d071..0e94bacb55 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md @@ -50,12 +50,12 @@ Sample indexing command, building HNSW indexes: ```bash bin/run.sh io.anserini.index.IndexHnswDenseVectors \ - -threads 16 \ + -threads 4 \ -collection ParquetDenseVectorCollection \ -input /path/to/msmarco-passage-openai-ada2.parquet \ -generator DenseVectorDocumentGenerator \ -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ - -M 16 -efC 100 \ + -M 16 -efC 500 \ >& logs/log.msmarco-passage-openai-ada2.parquet & ``` @@ -75,7 +75,7 @@ bin/run.sh io.anserini.search.SearchHnswDenseVectors \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ -topicReader JsonIntVector \ -output 
runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ - -hits 1000 -efSearch 1000 -threads 16 & + -hits 1000 -efSearch 2000 -threads 16 & ``` Evaluation can be performed using `trec_eval`: diff --git a/src/main/java/io/anserini/index/AbstractIndexer.java b/src/main/java/io/anserini/index/AbstractIndexer.java index 1612f73a09..e87fddbfd5 100644 --- a/src/main/java/io/anserini/index/AbstractIndexer.java +++ b/src/main/java/io/anserini/index/AbstractIndexer.java @@ -301,6 +301,7 @@ public void run() { LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); } + // Default method to process the segments; subclasses can override this method if desired. protected void processSegments(ThreadPoolExecutor executor, List segmentPaths) { segmentPaths.forEach((segmentPath) -> { diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index d96de8b52c..f4a0a02232 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -26,16 +26,14 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.lucene102.Lucene102HnswBinaryQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; -import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; 
-import org.apache.lucene.index.TieredMergePolicy; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.kohsuke.args4j.CmdLineException; @@ -54,34 +52,17 @@ public static final class Args extends AbstractIndexer.Args { @Option(name = "-generator", metaVar = "[class]", usage = "Document generator class in io.anserini.index.generator.") public String generatorClass = DenseVectorDocumentGenerator.class.getSimpleName(); - @Option(name = "-M", metaVar = "[num]", usage = "HNSW parameters M") + @Option(name = "-M", metaVar = "[num]", usage = "HNSW parameters M.") public int M = 16; - @Option(name = "-efC", metaVar = "[num]", usage = "HNSW parameters ef Construction") - public int efC = 100; + @Option(name = "-efC", metaVar = "[num]", usage = "HNSW parameters ef Construction.") + public int efC = 500; - @Option(name = "-quantize.int8", usage = "Quantize vectors into int8.") - public boolean quantizeInt8 = false; + @Option(name = "-quantize.sqv", usage = "Quantize vectors using ScalarQuantizedVectors (mutually exclusive with -quantize.bqv).", forbids = "-quantize.bqv") + public boolean quantizeSQV = false; - @Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.") - public boolean storeVectors = false; - - @Option(name = "-noMerge", usage = "Do not merge segments (fast indexing, slow retrieval).") - public boolean noMerge = false; - - @Option(name = "-maxThreadMemoryBeforeFlush", metaVar = "[num]", usage = "Maximum memory consumption per thread before triggering a forced flush (in MB); must be smaller than 2048.") - public int maxThreadMemoryBeforeFlush = 2047; - // This is the most aggressive possible setting; default is 1945. - // If the setting is too aggressive, may result in GCLocker issues. 
- - @Option(name = "-maxMergedSegmentSize", metaVar = "[num]", usage = "Maximum sized segment to produce during normal merging (in MB).") - public int maxMergedSegmentSize = 1024 * 16; - - @Option(name = "-segmentsPerTier", metaVar = "[num]", usage = "Allowed number of segments per tier.") - public int segmentsPerTier = 10; - - @Option(name = "-maxMergeAtOnce", metaVar = "[num]", usage = "Maximum number of segments to be merged at a time during \"normal\" merging.") - public int maxMergeAtOnce = 10; + @Option(name = "-quantize.bqv", usage = "Quantize vectors using BinaryQuantizedVectors (mutually exclusive with -quantize.sqv).", forbids = "-quantize.sqv") + public boolean quantizeBQV = false; } @SuppressWarnings("unchecked") @@ -99,7 +80,7 @@ public IndexHnswDenseVectors(Args args) throws Exception { final Directory dir = FSDirectory.open(Paths.get(args.index)); final IndexWriterConfig config; - if (args.quantizeInt8) { + if (args.quantizeSQV) { config = new IndexWriterConfig().setCodec( new Lucene103Codec() { @Override @@ -108,6 +89,15 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { new Lucene99HnswScalarQuantizedVectorsFormat(args.M, args.efC), 4096); } }); + } else if (args.quantizeBQV) { + config = new IndexWriterConfig().setCodec( + new Lucene103Codec() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return new DelegatingKnnVectorsFormat( + new Lucene102HnswBinaryQuantizedVectorsFormat(args.M, args.efC), 4096); + } + }); } else { config = new IndexWriterConfig().setCodec( new Lucene103Codec() { @@ -119,30 +109,6 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { }); } - config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); - config.setRAMBufferSizeMB(args.memoryBuffer); - config.setRAMPerThreadHardLimitMB(args.maxThreadMemoryBeforeFlush); - config.setUseCompoundFile(false); - config.setMergeScheduler(new ConcurrentMergeScheduler()); - - if (args.noMerge) { - 
config.setMergePolicy(NoMergePolicy.INSTANCE); - } else { - TieredMergePolicy mergePolicy = new TieredMergePolicy(); - if (args.optimize) { - // If we're going to merge down into a single segment at the end, skip intermediate merges, - // since they are a waste of time. - mergePolicy.setMaxMergeAtOnce(256); - mergePolicy.setSegmentsPerTier(256); - } else { - mergePolicy.setFloorSegmentMB(1024); - mergePolicy.setMaxMergedSegmentMB(args.maxMergedSegmentSize); - mergePolicy.setSegmentsPerTier(args.segmentsPerTier); - mergePolicy.setMaxMergeAtOnce(args.maxMergeAtOnce); - } - config.setMergePolicy(mergePolicy); - } - this.writer = new IndexWriter(dir, config); } catch (Exception e) { throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage())); @@ -152,27 +118,12 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { LOG.info(" + Generator: " + args.generatorClass); LOG.info(" + M: " + args.M); LOG.info(" + efC: " + args.efC); - LOG.info(" + Store document vectors? " + args.storeVectors); - LOG.info(" + Int8 quantization? " + args.quantizeInt8); - LOG.info(" + Codec: " + this.writer.getConfig().getCodec()); - LOG.info(" + MemoryBuffer: " + args.memoryBuffer); - LOG.info(" + MaxThreadMemoryBeforeFlush: " + args.maxThreadMemoryBeforeFlush); - - if (args.noMerge) { - LOG.info(" + MergePolicy: NoMerge"); - } else if (args.optimize) { - LOG.info(" + MergePolicy: TieredMergePolicy (force merge into a single index segment)"); - } else { - LOG.info(" + MergePolicy: TieredMergePolicy"); - LOG.info(" + MaxMergedSegmentSize: " + args.maxMergedSegmentSize); - LOG.info(" + SegmentsPerTier: " + args.segmentsPerTier); - LOG.info(" + MaxMergeAtOnce: " + args.maxMergeAtOnce); - } + LOG.info(" + ScalarQuantizedVectors? " + args.quantizeSQV); + LOG.info(" + BinaryQuantizedVectors? " + args.quantizeBQV); } + // This class exists because Lucene99HnswVectorsFormat is final, and so we can't override getMaxDimensions.
// Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html - // This class exists because Lucene95HnswVectorsFormat's getMaxDimensions method is final and we - // need to workaround that constraint to allow more than the default number of dimensions. private static final class DelegatingKnnVectorsFormat extends KnnVectorsFormat { private final KnnVectorsFormat delegate; private final int maxDimensions; diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git 
a/src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to 
src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from 
src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.template 
b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to 
src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% 
rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template 
b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to 
src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from 
src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.template 
rename to src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to 
src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff 
--git a/src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git 
a/src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git 
a/src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git 
a/src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template diff --git 
a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 
100% rename from src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template rename to 
src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template rename to 
src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template rename to src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.template diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template rename to src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.template diff --git a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 80% rename from src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index fdfe1dad14..7170ad98df 100644 --- a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-arguana.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-arguana.bge-base-en-v1.5/ 
index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-arguana.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.6361 @@ -53,8 +53,8 @@ models: - 0.9964 tolerance: nDCG@10: - - 0.001 + - 0.002 R@100: - - 0.001 + - 0.002 R@1000: - - 0.001 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 80% rename from src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 0355df0e9b..41f1d4323d 100644 --- a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-arguana.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-arguana.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-arguana.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: 
qrels.beir-v1.0.0-arguana.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.6361 @@ -53,8 +53,8 @@ models: - 0.9964 tolerance: nDCG@10: - - 0.02 + - 0.002 R@100: - - 0.025 + - 0.002 R@1000: - - 0.003 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.yaml index b4b4c82b54..35d153467b 100644 --- a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.6361 diff --git a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 0f49b55123..46b975bd89 100644 --- a/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-arguana.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-arguana.bge-base-en-v1.5/ index_type: hnsw collection_class: 
ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.6361 @@ -53,8 +53,8 @@ models: - 0.9964 tolerance: nDCG@10: - - 0.02 + - 0.001 R@100: - - 0.02 + - 0.001 R@1000: - - 0.004 + - 0.001 diff --git a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 8bd3a54a4f..093579a937 100644 --- a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-bioasq.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/bioasq.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-bioasq.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-bioasq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 32 -efC 1000 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-bioasq.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: 
hnsw - params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 16000 -removeQuery -threads 16 results: nDCG@10: - 0.4149 @@ -53,8 +53,8 @@ models: - 0.8059 tolerance: nDCG@10: - - 0.03 + - 0.02 R@100: - - 0.04 + - 0.03 R@1000: - - 0.06 + - 0.05 diff --git a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 80% rename from src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 8a5f76f61f..3daf86a240 100644 --- a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-bioasq.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/bioasq.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-bioasq.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-bioasq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 500 -quantize.int8 +index_threads: 4 +index_options: -M 32 -efC 1000 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-bioasq.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 16000 -removeQuery -threads 16 results: nDCG@10: - 0.4149 @@ -53,8 +53,8 @@ models: - 0.8059 tolerance: nDCG@10: - - 0.03 + - 0.02 R@100: - - 0.035 + - 0.03 R@1000: - - 0.06 + - 0.05 diff --git 
a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 6ed9e22793..80e0b4a69b 100644 --- a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-bioasq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 32 -efC 1000 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 16000 -removeQuery -threads 16 results: nDCG@10: - 0.4149 diff --git a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index eabf532d2c..f957905695 100644 --- a/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-bioasq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-bioasq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 32 -efC 1000 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 16000 -removeQuery -threads 16 results: nDCG@10: - 0.4149 diff 
--git a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 8014c502eb..7d5692d2e8 100644 --- a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-climate-fever.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/climate-fever.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-climate-fever.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.3119 diff --git a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to 
src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 39081b1764..f87982499c 100644 --- a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-climate-fever.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/climate-fever.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-climate-fever.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.3119 @@ -53,8 +53,8 @@ models: - 0.8307 tolerance: nDCG@10: - - 0.006 + - 0.005 R@100: - - 0.002 + - 0.003 R@1000: - 0.003 diff --git a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 305c2a92ee..149e2db94a 100644 --- a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: 
ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.3119 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.001 R@100: - - 0.001 + - 0.002 R@1000: - 0.003 diff --git a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 6f29bddd12..e48f45b152 100644 --- a/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-climate-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-climate-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.3119 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to 
src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 7925c4533f..f1d22fe02c 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-android.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-android.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5075 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.002 R@100: - - 0.002 + - 0.003 R@1000: - - 0.001 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index e4dce71d9f..958891f63b 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-android.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-android.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5075 @@ -57,4 +57,4 @@ models: R@100: - 0.003 R@1000: - - 0.001 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 3d7614decc..a6b3dba719 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 
+index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5075 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index b9906b4aad..455af0a027 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-android.bge-base-en-v1.5 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5075 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index e3614c15d7..9fdb6976f2 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-english.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-english.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4857 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 0973628bd8..3787952ae9 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 corpus_path: 
collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-english.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-english.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4857 @@ -53,7 +53,7 @@ models: - 0.8839 tolerance: nDCG@10: - - 0.002 + - 0.003 R@100: - 0.003 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 74a54fccb8..27ed46eead 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 
16 results: nDCG@10: - 0.4857 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 399077dc86..d19cd20586 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-english.bge-base-en-v1.5 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4857 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index b4a4a90605..9c51efbdaf 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 corpus_path: 
collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-gaming.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-gaming.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5965 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index d7b02a24a2..565a60ac2c 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-gaming.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: 
DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-gaming.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5965 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 0a85aa823f..ee76847768 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5965 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index dc8863a2a1..fe682a6340 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ 
b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gaming.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.5965 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index ec8df72f40..f89482e72f 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-gis.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv 
metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-gis.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4127 @@ -53,7 +53,7 @@ models: - 0.9117 tolerance: nDCG@10: - - 0.003 + - 0.004 R@100: - 0.002 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index e6c2de025f..28339e3027 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-gis.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-gis.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 
-removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4127 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.yaml index b6a1aba267..ecadf4aed3 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4127 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 86990d4f66..b013ac6804 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-gis.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx 
display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4127 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index a3ecaa8085..dd3928b7c0 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-mathematica.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3163 @@ -57,4 +57,4 @@ models: R@100: - 
0.001 R@1000: - - 0.001 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 97cc48e232..be49e744c9 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-mathematica.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-mathematica.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3163 @@ -57,4 +57,4 @@ models: R@100: - 0.001 R@1000: - - 0.001 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.yaml 
b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 4b3640bbd4..f79e755c1c 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-mathematica.bge-base-en- index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3163 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index fa8c51f16d..7ad087ecab 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-mathematica.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-mathematica.bge-base-en- index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery 
-threads 16 results: nDCG@10: - 0.3163 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 5066f8a0fb..d171544307 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-physics.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-physics.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4722 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from 
src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 3a62552f51..5918096b65 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-physics.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-physics.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4722 @@ -53,7 +53,7 @@ models: - 0.9406 tolerance: nDCG@10: - - 0.001 + - 0.002 R@100: - 0.003 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 661666e4df..19d167ca02 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ 
b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4722 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index d73893b7bd..cf491aeff5 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-physics.bge-base-en-v1.5 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4722 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml 
similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 38d6f37895..779feb4f20 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-programmers.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-programmers.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4242 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.001 R@100: - - 0.001 + - 0.002 R@1000: - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to 
src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index b46d49e7ab..b9223f5997 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-programmers.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-programmers.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4242 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 9c138178bc..f474e347c7 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-programmers.bge-base-en- index_type: hnsw collection_class: 
ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4242 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 71684137aa..04a15416c6 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-programmers.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-programmers.bge-base-en- index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4242 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 80% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to 
src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 74bf636876..11a5648eca 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-stats.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-stats.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3732 @@ -53,8 +53,8 @@ models: - 0.8445 tolerance: nDCG@10: - - 0.002 + - 0.004 R@100: - - 0.001 + - 0.003 R@1000: - - 0.008 + - 0.003 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 80% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index afa1b5452c..4371d2557b 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-stats.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-stats.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3732 @@ -53,8 +53,8 @@ models: - 0.8445 tolerance: nDCG@10: - - 0.005 + - 0.004 R@100: - - 0.002 + - 0.003 R@1000: - - 0.01 + - 0.003 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 5831b23515..ad35096763 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 
16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3732 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 64b2968526..a79e5d6bf8 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-stats.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3732 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index e6ca9e3201..6fe6c6f945 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-tex.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-tex.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3115 @@ -53,8 +53,8 @@ models: - 0.8537 tolerance: nDCG@10: - - 0.001 + - 0.002 R@100: - - 0.001 + - 0.002 R@1000: - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 8ff9a43437..f7230a3a8e 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: 
beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-tex.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-tex.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3115 @@ -53,8 +53,8 @@ models: - 0.8537 tolerance: nDCG@10: - - 0.001 + - 0.002 R@100: - - 0.001 + - 0.002 R@1000: - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 34aa1bd663..ae89249df8 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 
1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3115 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index c0d77c9c2e..603da2cd5f 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-tex.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3115 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index d21f757222..ff3dd05668 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 corpus_path: 
collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-unix.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-unix.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4219 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.002 R@100: - - 0.003 + - 0.004 R@1000: - 0.001 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index a787092676..54f98ec97e 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-unix.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ index_type: hnsw 
collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-unix.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4219 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 3223c267e2..d147434101 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4219 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 1bde9aab4f..04bc9e1e05 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-unix.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4219 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 14e1f7479f..be1a0300be 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-webmasters.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection 
generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 @@ -53,8 +53,8 @@ models: - 0.9380 tolerance: nDCG@10: - - 0.005 + - 0.004 R@100: - - 0.003 + - 0.004 R@1000: - - 0.001 + - 0.003 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 7c7bd2618e..e0dd50489c 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-webmasters.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 
500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-webmasters.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 @@ -57,4 +57,4 @@ models: R@100: - 0.004 R@1000: - - 0.002 + - 0.003 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 5d30d9d42e..0d2737afce 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 7e9f47b5a4..47c63b86e1 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ 
b/src/main/resources/regression/beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-webmasters.bge-base-en-v index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 80% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index be27cfb9bd..d213db631d 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-wordpress.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 
+index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3547 @@ -53,8 +53,8 @@ models: - 0.8861 tolerance: nDCG@10: - - 0.001 + - 0.004 R@100: - - 0.001 + - 0.004 R@1000: - - 0.001 + - 0.004 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 77e5017517..edfc45d0bc 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/cqadupstack-wordpress.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: 
qrels.beir-v1.0.0-cqadupstack-wordpress.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3547 @@ -53,7 +53,7 @@ models: - 0.8861 tolerance: nDCG@10: - - 0.006 + - 0.004 R@100: - 0.004 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 0d12ebcd90..5907b13bba 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3547 diff --git a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index bc75cba7b8..9bd5c473b3 100644 --- a/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ 
index_path: indexes/lucene-hnsw.beir-v1.0.0-cqadupstack-wordpress.bge-base-en-v1 index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3547 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.001 R@100: - - 0.004 + - 0.002 R@1000: - 0.001 diff --git a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 2223c4d46d..746b95f9b2 100644 --- a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/dbpedia-entity.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 
@@ topics: qrel: qrels.beir-v1.0.0-dbpedia-entity.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.4074 diff --git a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 24a83f91b9..fe16951917 100644 --- a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/dbpedia-entity.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-dbpedia-entity.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.4074 @@ -53,7 +53,7 @@ 
models: - 0.7833 tolerance: nDCG@10: - - 0.004 + - 0.003 R@100: - 0.01 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.yaml index e0611c7a73..29b99f703b 100644 --- a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.4074 diff --git a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 5c2ce24693..e5e3619b4a 100644 --- a/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-dbpedia-entity.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 
-hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.4074 diff --git a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 879551ddc2..a68649c781 100644 --- a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-fever.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/fever.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-fever.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-fever.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.8630 diff --git a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from 
src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index a51933e340..0c5ec10b18 100644 --- a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-fever.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/fever.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-fever.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-fever.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.8630 @@ -53,7 +53,7 @@ models: - 0.9855 tolerance: nDCG@10: - - 0.02 + - 0.015 R@100: - 0.02 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 4e1e863c14..59198c3735 100644 --- a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection 
generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.8630 diff --git a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index ab61b5bbff..b8140e54c6 100644 --- a/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fever.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-fever.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.8630 diff --git a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 78e1cf7bad..246d860375 100644 --- a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ 
b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-fiqa.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/fiqa.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-fiqa.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-fiqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-fiqa.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 diff --git a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 6f42ec9ebb..36e16789da 100644 --- a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-fiqa.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/fiqa.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-fiqa.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-fiqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator 
-index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-fiqa.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 diff --git a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml index a0f3aa0a83..32f20b7618 100644 --- a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-fiqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 diff --git a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index fadc555cbc..a70767a8da 100644 --- a/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-fiqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-fiqa.bge-base-en-v1.5/ index_type: hnsw 
collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4065 diff --git a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index ec0dcaff3c..b59db394fb 100644 --- a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-hotpotqa.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/hotpotqa.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-hotpotqa.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 
-efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.7259 diff --git a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 0777d77963..3de81e8973 100644 --- a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-hotpotqa.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/hotpotqa.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-hotpotqa.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.7259 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.02 R@100: - - 0.025 + - 0.03 R@1000: - 0.03 diff --git a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 0fed87e159..ef3dcba857 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.7259 diff --git a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 4146206766..3c540b33f1 100644 --- a/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-hotpotqa.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-hotpotqa.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.7259 diff --git a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity 
index 83% rename from src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index ea5f4c8f85..c16c9dfc92 100644 --- a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-nfcorpus.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/nfcorpus.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-nfcorpus.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3735 diff --git a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index ed59c6ba39..f397769d39 100644 --- a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ 
b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-nfcorpus.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/nfcorpus.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-nfcorpus.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3735 @@ -57,4 +57,4 @@ models: R@100: - 0.002 R@1000: - - 0.006 + - 0.005 diff --git a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 827a434c60..2b02f54572 100644 --- a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: 
-hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3735 diff --git a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 9df1c08caa..f1a9121eab 100644 --- a/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nfcorpus.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-nfcorpus.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.3735 diff --git a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 86b904bd49..1c0951602f 100644 --- a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-nq.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/nq.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-nq.bge-base-en-v1.5/ +index_path: 
indexes/lucene-hnsw-sqv.beir-v1.0.0-nq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-nq.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.5413 diff --git a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index e4487a0014..dd8a084a39 100644 --- a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-nq.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/nq.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-nq.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-nq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-nq.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder 
BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.5413 diff --git a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 070ca3ad43..18d75491c3 100644 --- a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-nq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.5413 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.004 R@100: - - 0.008 + - 0.007 R@1000: - 0.009 diff --git a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index b87f598ef8..1df6df3801 100644 --- a/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-nq.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-nq.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: 
BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.5413 diff --git a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index c4c76c6bd7..4effdb7b3f 100644 --- a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-quora.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/quora.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-quora.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-quora.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-quora.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.8890 diff --git a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from 
src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 3b8d6b8532..fd6f62caa2 100644 --- a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-quora.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/quora.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-quora.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-quora.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-quora.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.8890 diff --git a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 1870d4d21b..ec42dabcb2 100644 --- a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-quora.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 
+index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.8890 @@ -53,7 +53,7 @@ models: - 0.9998 tolerance: nDCG@10: - - 0.001 + - 0.002 R@100: - 0.001 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index d88adb1f4e..ff7ae9f706 100644 --- a/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-quora.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-quora.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.8890 diff --git a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index ec136f9993..cad1bd1d8e 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-robust04.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/robust04.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-robust04.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-robust04.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-robust04.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4465 @@ -53,8 +53,8 @@ models: - 0.5981 tolerance: nDCG@10: - - 0.002 - R@100: - 0.005 + R@100: + - 0.006 R@1000: - - 0.005 + - 0.007 diff --git a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index ac6a3c6f69..d460428c14 100644 --- a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-robust04.bge-base-en-v1.5 corpus_path: 
collections/beir-v1.0.0/bge-base-en-v1.5/robust04.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-robust04.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-robust04.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-robust04.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4465 diff --git a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 5f3b9a0917..477621684b 100644 --- a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-robust04.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4465 @@ -53,8 +53,8 @@ models: - 0.5981 tolerance: nDCG@10: - - 0.001 + - 0.004 R@100: - - 0.003 - R@1000: - 0.004 + R@1000: + - 0.007 diff --git 
a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 90e784a972..7bcaf3bbca 100644 --- a/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-robust04.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-robust04.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4465 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.004 R@100: - - 0.002 + - 0.004 R@1000: - - 0.006 + - 0.007 diff --git a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 223604a906..a104a2830d 100644 --- a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-scidocs.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/scidocs.parquet -index_path: 
indexes/lucene-hnsw-int8.beir-v1.0.0-scidocs.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-scidocs.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-scidocs.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2170 diff --git a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 06a7052545..753b6d57ed 100644 --- a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-scidocs.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/scidocs.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-scidocs.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-scidocs.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: 
qrels.beir-v1.0.0-scidocs.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2170 diff --git a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 137265e6a0..f1b20dfa23 100644 --- a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-scidocs.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2170 diff --git a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index c46164fa46..8967645d9e 100644 --- a/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scidocs.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-scidocs.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 
+index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2170 diff --git a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index dcc5c35a36..140737c88d 100644 --- a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-scifact.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/scifact.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-scifact.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-scifact.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-scifact.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7408 @@ -53,8 +53,8 @@ models: - 0.9967 tolerance: nDCG@10: - - 0.001 + - 0.002 R@100: - 0.003 
R@1000: - - 0.001 + - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 4ae8f01f46..2e2f1ed6a0 100644 --- a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-scifact.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/scifact.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-scifact.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-scifact.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-scifact.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7408 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.002 R@100: - - 0.003 + - 0.005 R@1000: - - 0.001 + - 0.005 diff --git a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 414a0836a7..b54390b6bb 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-scifact.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7408 diff --git a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index e7c812fe5e..96b1fefd15 100644 --- a/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-scifact.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-scifact.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7408 diff --git a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 
82% rename from src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 1aa1ea241c..086b730f8c 100644 --- a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-signal1m.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/signal1m.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-signal1m.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-signal1m.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-signal1m.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.2886 @@ -53,8 +53,8 @@ models: - 0.5331 tolerance: nDCG@10: - - 0.025 + - 0.02 R@100: - - 0.03 + - 0.025 R@1000: - 0.05 diff --git a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 118c7a7fab..3be9145f83 100644 --- 
a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-signal1m.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/signal1m.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-signal1m.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-signal1m.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-signal1m.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.2886 diff --git a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.yaml index ac6961c8d2..c12e39df2d 100644 --- a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-signal1m.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 
-removeQuery -threads 16 + params: -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.2886 @@ -57,4 +57,4 @@ models: R@100: - 0.03 R@1000: - - 0.05 + - 0.045 diff --git a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 7c1b734cf2..5bab5d0bc1 100644 --- a/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-signal1m.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-signal1m.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 4000 -removeQuery -threads 16 results: nDCG@10: - 0.2886 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 89bdc8c5f0..3172124cde 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-trec-covid.bge-base-en-v1.5 corpus_path: 
collections/beir-v1.0.0/bge-base-en-v1.5/trec-covid.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-trec-covid.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7814 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 82% rename from src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 3526c79f35..464eabf5e1 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-trec-covid.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/trec-covid.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 
+index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-trec-covid.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7814 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.006 R@100: - - 0.001 + - 0.002 R@1000: - 0.002 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 46d518a000..1df9fd6afc 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7814 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 4c4f32d7e6..cd39d18c82 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-covid.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: 
indexes/lucene-hnsw.beir-v1.0.0-trec-covid.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.7814 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 9c9304026a..d8be063a7b 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-trec-news.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/trec-news.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-trec-news.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-news.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-trec-news.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: 
hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4425 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 83% rename from src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 7d56ed8676..4329c793a2 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-trec-news.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/trec-news.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-trec-news.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-trec-news.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-trec-news.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4425 diff --git a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 3f19726347..2c71caecfe 100644 
--- a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-trec-news.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 1000 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4425 @@ -53,7 +53,7 @@ models: - 0.7875 tolerance: nDCG@10: - - 0.003 + - 0.004 R@100: - 0.01 R@1000: diff --git a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 292d14f647..b4cbb844e5 100644 --- a/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-trec-news.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-trec-news.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 1000 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.4425 @@ -53,8 +53,8 @@ models: - 0.7875 tolerance: nDCG@10: - - 0.003 + - 0.004 R@100: - - 0.009 + - 0.01 R@1000: 
- 0.02 diff --git a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 81% rename from src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index 80dc7a078c..9d08bac41c 100644 --- a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/webis-touche2020.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-webis-touche2020.test.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2570 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.008 R@100: - - 0.002 + - 0.003 R@1000: - - 0.005 + - 0.006 diff --git a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity 
index 83% rename from src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 92f1536d63..cc6d5a8f10 100644 --- a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -2,12 +2,12 @@ corpus: beir-v1.0.0-webis-touche2020.bge-base-en-v1.5 corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/webis-touche2020.parquet -index_path: indexes/lucene-hnsw-int8.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: nDCG@10 @@ -40,10 +40,10 @@ topics: qrel: qrels.beir-v1.0.0-webis-touche2020.test.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2570 diff --git a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 69fd52dbd9..6cb3c30c96 100644 --- a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -6,8 +6,8 @@ index_path: 
indexes/lucene-hnsw.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2570 @@ -55,6 +55,6 @@ models: nDCG@10: - 0.001 R@100: - - 0.001 + - 0.002 R@1000: - 0.001 diff --git a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 18c57beb8b..f3447a9657 100644 --- a/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/beir-v1.0.0-webis-touche2020.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -6,8 +6,8 @@ index_path: indexes/lucene-hnsw.beir-v1.0.0-webis-touche2020.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: nDCG@10 @@ -43,7 +43,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -removeQuery -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -removeQuery -threads 16 results: nDCG@10: - 0.2570 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from 
src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index c5776c3b23..5f90eba023 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar download_checksum: a55b3cb338ec4a1b1c36825bf0854648 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4435 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 87% rename from src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 281e51cc87..217ab72a67 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ 
-5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar download_checksum: a55b3cb338ec4a1b1c36825bf0854648 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4435 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 179b9588e0..ad9c68e682 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4435 diff --git 
a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index a6e5a8d8b3..999c6e1c1a 100644 --- a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4435 diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml similarity index 86% rename from src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml index 02b8df030f..fdef9d18aa 100644 --- a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parqu download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar download_checksum: 760dfb5ba9e2b0cc6f7e527e518fef03 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ 
+index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: cohere-embed-english-v3.0-hnsw-int8-cached + - name: cohere-embed-english-v3.0-hnsw-sqv-cached display: cohere-embed-english-v3.0 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4884 diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml index 7c67a29eb7..72550bf226 100644 --- a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cohere-embed-english-v3.0-hnsw-cached display: cohere-embed-english-v3.0 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4884 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from 
src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml index 4d3e873b24..ca065f3618 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar download_checksum: b9183de205fbd5c799211c21187179e7 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: cos-dpr-distil-hnsw-int8-cached + - name: cos-dpr-distil-hnsw-sqv-cached display: cosDPR-distil type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4656 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml similarity index 86% rename from src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml index fdb7747047..11abb1b03e 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml @@ -5,12 +5,12 
@@ corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar download_checksum: b9183de205fbd5c799211c21187179e7 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: cos-dpr-distil-hnsw-int8-onnx + - name: cos-dpr-distil-hnsw-sqv-onnx display: cosDPR-distil type: hnsw - params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + params: -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4656 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml index 940145ae9b..f12b307d35 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cos-dpr-distil-hnsw-cached display: cosDPR-distil type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4656 diff --git 
a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml index 9991cf71b4..d57cc4d778 100644 --- a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cos-dpr-distil-hnsw-onnx display: cosDPR-distil type: hnsw - params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + params: -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4656 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml index a512e02888..1ab8c91730 100644 --- a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar download_checksum: a8fddf594c9b8e771637968033b12f6d -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ index_type: hnsw collection_class: 
ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: openai-ada2-hnsw-int8-cached + - name: openai-ada2-hnsw-sqv-cached display: OpenAI-ada2 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4788 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml index d32285468f..d8394400fd 100644 --- a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: openai-ada2-hnsw-cached display: OpenAI-ada2 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4788 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index c6d797b3b7..bae904dd89 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ 
b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar download_checksum: a55b3cb338ec4a1b1c36825bf0854648 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4650 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 87% rename from src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 7bfbc318fe..bd1842388b 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar download_checksum: a55b3cb338ec4a1b1c36825bf0854648 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ 
+index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4650 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 1437c03ae6..1f4d926fd8 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4650 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index 6d78987b60..981ef23bfc 100644 --- a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -9,8 
+9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4650 diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml similarity index 86% rename from src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml index d8849a877e..d0d4865c09 100644 --- a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parqu download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar download_checksum: 760dfb5ba9e2b0cc6f7e527e518fef03 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: 
qrels.dl20-passage.txt models: - - name: cohere-embed-english-v3.0-hnsw-int8-cached + - name: cohere-embed-english-v3.0-hnsw-sqv-cached display: cohere-embed-english-v3.0 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.5067 diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml index d76ae22319..3b1bb22433 100644 --- a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cohere-embed-english-v3.0-hnsw-cached display: cohere-embed-english-v3.0 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.5067 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml index c1576346cd..c209e786ae 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: 
collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar download_checksum: b9183de205fbd5c799211c21187179e7 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: cos-dpr-distil-hnsw-int8-cached + - name: cos-dpr-distil-hnsw-sqv-cached display: cosDPR-distil type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4876 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml similarity index 86% rename from src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml index 9a4f8545b6..9e927cc54c 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar download_checksum: b9183de205fbd5c799211c21187179e7 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection 
generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: cos-dpr-distil-hnsw-int8-onnx + - name: cos-dpr-distil-hnsw-sqv-onnx display: cosDPR-distil type: hnsw - params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + params: -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4876 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml index 6451f92941..eef61d5a29 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cos-dpr-distil-hnsw-cached display: cosDPR-distil type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4876 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml index ded4621a35..9d5e17095d 100644 --- a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: 
ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cos-dpr-distil-hnsw-onnx display: cosDPR-distil type: hnsw - params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + params: -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4876 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml index 2bb5a13f82..c3fe96c913 100644 --- a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar download_checksum: a8fddf594c9b8e771637968033b12f6d -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: openai-ada2-hnsw-int8-cached + - name: openai-ada2-hnsw-sqv-cached display: OpenAI-ada2 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: 
AP@1000: - 0.4771 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml index 15f7e8f4a3..899666296c 100644 --- a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: openai-ada2-hnsw-cached display: OpenAI-ada2 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.4771 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml similarity index 88% rename from src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml index b78b4a33ef..a85f4e6c77 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar download_checksum: a55b3cb338ec4a1b1c36825bf0854648 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_path: 
indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: bge-hnsw-int8-cached + - name: bge-hnsw-sqv-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3641 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml similarity index 87% rename from src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml index 09405a10ff..3ffb3e4c14 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-sqv.onnx.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar download_checksum: a55b3cb338ec4a1b1c36825bf0854648 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: 
qrels.msmarco-passage.dev-subset.txt models: - - name: bge-hnsw-int8-onnx + - name: bge-hnsw-sqv-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3641 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml index 7e6e8db9cd..32572ebf2d 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: bge-hnsw-cached display: BGE-base-en-v1.5 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3641 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml index a9615e5e39..4f62271c1e 100644 --- a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: 
AP@1000 @@ -53,7 +53,7 @@ models: - name: bge-hnsw-onnx display: BGE-base-en-v1.5 type: hnsw - params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3641 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml index a422e84f99..907d563de8 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parqu download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar download_checksum: 760dfb5ba9e2b0cc6f7e527e518fef03 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cohere-embed-english-v3.0/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: cohere-embed-english-v3.0-hnsw-int8-cached + - name: cohere-embed-english-v3.0-hnsw-sqv-cached display: cohere-embed-english-v3.0 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 
-threads 16 results: AP@1000: - 0.3716 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml index de62768225..2b6b1cb614 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cohere-embed-english-v3.0-hnsw-cached display: cohere-embed-english-v3.0 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3716 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml index e99f88dbfc..3aaf0b9a63 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar download_checksum: b9183de205fbd5c799211c21187179e7 -index_path: 
indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: cos-dpr-distil-hnsw-int8-cached + - name: cos-dpr-distil-hnsw-sqv-cached display: cosDPR-distil type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3942 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml similarity index 86% rename from src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml rename to src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml index 05dae8fba3..04b3588033 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-sqv.onnx.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar download_checksum: b9183de205fbd5c799211c21187179e7 -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv 
metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: cos-dpr-distil-hnsw-int8-onnx + - name: cos-dpr-distil-hnsw-sqv-onnx display: cosDPR-distil type: hnsw - params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + params: -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3942 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml index 2dfefb195c..4d2bfb0bd4 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cos-dpr-distil-hnsw-cached display: cosDPR-distil type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3942 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml index 85a7c2e821..bc9e7a17ed 100644 --- a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 
+index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: cos-dpr-distil-hnsw-onnx display: cosDPR-distil type: hnsw - params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + params: -encoder CosDprDistil -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3942 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml similarity index 87% rename from src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml rename to src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml index cfd7bf79e2..7804962b04 100644 --- a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-sqv.cached.yaml @@ -5,12 +5,12 @@ corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar download_checksum: a8fddf594c9b8e771637968033b12f6d -index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ +index_path: indexes/lucene-hnsw-sqv.msmarco-v1-passage.openai-ada2/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 -quantize.int8 +index_threads: 4 +index_options: -M 16 -efC 500 -quantize.sqv metrics: - metric: AP@1000 @@ -50,10 +50,10 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: openai-ada2-hnsw-int8-cached + - name: openai-ada2-hnsw-sqv-cached display: OpenAI-ada2 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3505 diff --git 
a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml index a7645ad5d1..405d979beb 100644 --- a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml @@ -9,8 +9,8 @@ index_path: indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ index_type: hnsw collection_class: ParquetDenseVectorCollection generator_class: DenseVectorDocumentGenerator -index_threads: 16 -index_options: -M 16 -efC 100 +index_threads: 4 +index_options: -M 16 -efC 500 metrics: - metric: AP@1000 @@ -53,7 +53,7 @@ models: - name: openai-ada2-hnsw-cached display: OpenAI-ada2 type: hnsw - params: -hits 1000 -efSearch 1000 -threads 16 + params: -hits 1000 -efSearch 2000 -threads 16 results: AP@1000: - 0.3505 diff --git a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java index 7c1687fede..5097f17b65 100644 --- a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java @@ -241,7 +241,7 @@ public void testNullVectorInverted() throws Exception { } @Test - public void testQuantizedInt8() throws Exception { + public void testQuantizedSQV() throws Exception { String indexPath = "target/lucene-test-index.hnsw." 
+ System.currentTimeMillis(); String[] indexArgs = new String[] { "-collection", "JsonDenseVectorCollection", @@ -249,7 +249,7 @@ public void testQuantizedInt8() throws Exception { "-index", indexPath, "-generator", "DenseVectorDocumentGenerator", "-threads", "1", - "-M", "16", "-efC", "100", "-quantize.int8" + "-M", "16", "-efC", "100", "-quantize.sqv" }; IndexHnswDenseVectors.main(indexArgs); diff --git a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java index 666a55e92a..3d404d83aa 100644 --- a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java +++ b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java @@ -30,8 +30,6 @@ import io.anserini.index.AbstractIndexer; import io.anserini.index.IndexHnswDenseVectors; -import static org.junit.Assert.assertTrue; - /** * Tests for {@link SearchHnswDenseVectors} */ @@ -354,7 +352,7 @@ public void testBasicCosDprQuantized() throws Exception { "-index", indexPath, "-generator", "DenseVectorDocumentGenerator", "-threads", "1", - "-M", "16", "-efC", "100", "-quantize.int8" + "-M", "16", "-efC", "100", "-quantize.sqv" }; IndexHnswDenseVectors.main(indexArgs);