From 4bf99c317113bb09c7f451474521462b10bea2df Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 6 Mar 2025 16:42:20 -0800 Subject: [PATCH 1/2] Support cosine distance for training vectors (#4227) Summary: same as title Differential Revision: D70724590 --- benchs/bench_fw/descriptors.py | 2 ++ benchs/bench_fw/index.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index 8b1d65a505..d40243bb5b 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -106,6 +106,8 @@ class DatasetDescriptor: # desc_name desc_name: Optional[str] = None + normalize_L2: bool = False + def __hash__(self): return hash(self.get_filename()) diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py index fe2fe103ef..b1252ad1b0 100644 --- a/benchs/bench_fw/index.py +++ b/benchs/bench_fw/index.py @@ -1138,6 +1138,8 @@ def assemble(self, dry_run): return None, None, "" logger.info(f"assemble, train {self.factory}") xt = self.io.get_dataset(self.training_vectors) + if self.training_vectors.normalize_L2: + faiss.normalize_L2(xt) _, t, _ = timer("train", lambda: codec.train(xt), once=True) t_aggregate += t From dc12e9cd2950880631b83892c64b24090b319651 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 6 Mar 2025 16:42:20 -0800 Subject: [PATCH 2/2] Support non-partition col and map in the embedding reader Summary: same as title Differential Revision: D70728870 --- benchs/bench_fw/descriptors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index d40243bb5b..212e643426 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -83,6 +83,9 @@ class DatasetDescriptor: embedding_column: Optional[str] = None + # only when the embedding column is a map + embedding_column_key: Optional[Any] = None + embedding_id_column: Optional[str] = None # unused in open-source