Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/nrt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# /root/jdk1.6.0_31/bin/java -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC

/opt/zing/zingLX-jdk1.6.0_31-5.2.0.0-18-x86_64/bin/java -verbose:gc -Xms40G -Xmx40G -cp .:$LUCENE_HOME/build/core/classes/java:$LUCENE_HOME/build/highlighter/classes/java:$LUCENE_HOME/build/test-framework/classes/java:$LUCENE_HOME/build/queryparser/classes/java:$LUCENE_HOME/build/suggest/classes/java:$LUCENE_HOME/build/analysis/common/classes/java:$LUCENE_HOME/build/grouping/classes/java perf.SearchPerfTest \
-indexPath /large/indices/wikimediumall.lucene4x.Lucene40.nd33.3326M/index \
-indexPath /large/indices/wikimediumall.lucene4x.nd33.3326M/index \
-dirImpl RAMDirectory \
-analyzer StandardAnalyzer \
-taskSource server:localhost:7777 \
Expand Down
2 changes: 1 addition & 1 deletion src/main/IndexToFST.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

// javac -cp lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar IndexToFST.java; java -cp .:lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar IndexToFST /l/indices/wikimediumall.trunk.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.taxonomy:RandomLabel.taxonomy.sortedset:Date.sortedset:Month.sortedset:DayOfYear.sortedset:RandomLabel.sortedset.Lucene90.Lucene90.dvfields.nd33.3326M/index
// javac -cp lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar IndexToFST.java; java -cp .:lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar IndexToFST /l/indices/wikimediumall.trunk.nd33.3326M/index

public class IndexToFST {

Expand Down
75 changes: 73 additions & 2 deletions src/python/benchUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# limitations under the License.

import datetime
import json
import math
import os
import pickle
Expand Down Expand Up @@ -933,9 +934,20 @@ def makeIndex(self, id, index, printCharts=False, profilerCount=30, profilerStac
profilerStackSize = (profilerStackSize,)

fullIndexPath = nameToIndexPath(index.getName())

# Get current config for this index
current_config = get_index_config(index)

if os.path.exists(fullIndexPath) and not index.doUpdate:
print(" %s: already exists" % fullIndexPath)
return fullIndexPath
# Check if we can reuse the existing index
can_reuse, reason = can_reuse_index(fullIndexPath, current_config)
if can_reuse:
print(" %s: already exists with matching config" % fullIndexPath)
return fullIndexPath
else:
print(" %s: exists but cannot reuse: %s" % (fullIndexPath, reason))
print(" removing old index and reindexing...")
shutil.rmtree(fullIndexPath)
if index.doUpdate:
if not os.path.exists(fullIndexPath):
raise RuntimeError("index path does not exists: %s" % fullIndexPath)
Expand Down Expand Up @@ -1091,6 +1103,9 @@ def makeIndex(self, id, index, printCharts=False, profilerCount=30, profilerStac
shutil.rmtree(fullIndexPath)
raise

# After successful indexing, write the config file
write_index_config(fullIndexPath, current_config)

profilerResults = profilerOutput(index.javaCommand, jfrOutput, checkoutToPath(index.checkout), profilerCount, profilerStackSize)

return fullIndexPath, fullLogFile, profilerResults, jfrOutput
Expand Down Expand Up @@ -1936,3 +1951,59 @@ def profilerOutput(javaCommand, jfrOutput, checkoutPath, profilerCount, profiler
print(output)
profilerResults.append((mode, stackSize, output))
return profilerResults


def get_index_config(index):
    """Collect the index-creation settings of ``index`` into a plain dict.

    The data source is recorded by its ``name``; the other core settings are
    copied from the same-named attributes of ``index``. Facet entries are
    added only when ``index.facets`` is not None, and vector entries only
    when ``index.vectorFile`` is truthy.
    """
    copied_attrs = (
        "numDocs",
        "optimize",
        "useCFS",
        "postingsFormat",
        "idFieldPostingsFormat",
        "bodyTermVectors",
        "bodyStoredFields",
        "bodyPostingsOffsets",
        "addDVFields",
        "indexSort",
    )
    settings = {"dataSource": index.dataSource.name}
    for attr in copied_attrs:
        settings[attr] = getattr(index, attr)

    facet_specs = index.facets
    if facet_specs is not None:
        # Only the first element of each facet spec is recorded.
        settings["facets"] = [spec[0] for spec in facet_specs]
        settings["facetDVFormat"] = index.facetDVFormat

    if index.vectorFile:
        settings.update(
            vectorFile=index.vectorFile,
            vectorDimension=index.vectorDimension,
            quantizeKNNGraph=index.quantizeKNNGraph,
        )
    return settings


def write_index_config(index_path, config):
    """Write ``config`` as JSON to ``index-config.json`` inside ``index_path``.

    The file is replaced atomically: the config is serialized in memory,
    written to a sibling temp file, and then renamed over the final name.
    A serialization error or a crash mid-write therefore never leaves a
    truncated config file behind, and never destroys a previous valid one.

    Raises:
        TypeError: if ``config`` contains a value json cannot serialize.
        OSError: if the file cannot be written or renamed.
    """
    config_path = os.path.join(index_path, "index-config.json")
    # Serialize before touching the filesystem so bad input fails cleanly.
    payload = json.dumps(config, indent=2, sort_keys=True)
    tmp_path = config_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_path, config_path)


def read_index_config(index_path):
    """Read the config stored in ``index-config.json`` inside ``index_path``.

    Returns the parsed JSON value, or None when there is no usable config:
    the file (or ``index_path`` itself) is missing, or the file cannot be
    parsed as JSON (for example, it was truncated by an interrupted write).
    """
    config_path = os.path.join(index_path, "index-config.json")
    try:
        # Open directly instead of checking os.path.exists first, so the file
        # vanishing between the check and the open cannot raise.
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return None
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A corrupt config cannot vouch for the index; report it as unusable
        # rather than crashing the caller.
        print("  %s: ignoring unreadable index config: %s" % (config_path, e))
        return None


def can_reuse_index(index_path, current_config):
    """Decide whether the index at ``index_path`` matches ``current_config``.

    Returns a ``(can_reuse, reason)`` tuple: ``(True, None)`` when the config
    stored with the index equals ``current_config``, otherwise
    ``(False, reason)`` where ``reason`` is a human-readable explanation.

    Raises:
        TypeError: if a stored config exists and ``current_config`` holds a
            value json cannot serialize.
    """
    if not os.path.exists(index_path):
        return False, "index does not exist"

    prev_config = read_index_config(index_path)
    if prev_config is None:
        return False, "index config file does not exist"

    # The stored config has been through JSON, which turns tuples into lists
    # and non-string keys into strings. Put the in-memory config through the
    # same round trip so equal settings always compare equal.
    comparable_config = json.loads(json.dumps(current_config))
    if prev_config != comparable_config:
        return False, f"config changed:\n old: {prev_config}\n new: {current_config}"

    return True, None
46 changes: 20 additions & 26 deletions src/python/competition.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#

import glob
import hashlib
import os
import random
import subprocess
Expand Down Expand Up @@ -230,43 +231,36 @@ def getName(self):
if self.extraNamePart is not None:
name.append(self.extraNamePart)

# Generate hash of configuration to avoid index path collisions
config_parts = []
if self.optimize:
name.append("opt")

config_parts.append("opt")
if self.useCFS:
name.append("cfs")

# TODO: adding facets to filename makes it too long and runs into limits on some machines
# Can we remove this from file name and record it in a different logfile.
config_parts.append("cfs")
if self.facets is not None:
name.append("facets")
for arg in self.facets:
name.append(arg[0])
name.append(self.facetDVFormat)

config_parts.extend([arg[0] for arg in self.facets])
config_parts.append(self.facetDVFormat)
if self.bodyTermVectors:
name.append("tv")

config_parts.append("tv")
if self.bodyStoredFields:
name.append("stored")

config_parts.append("stored")
if self.bodyPostingsOffsets:
name.append("offsets")

name.append(self.postingsFormat)
config_parts.append("offsets")
config_parts.append(self.postingsFormat)
if self.postingsFormat != self.idFieldPostingsFormat:
name.append(self.idFieldPostingsFormat)

config_parts.append(self.idFieldPostingsFormat)
if self.addDVFields:
name.append("dvfields")

config_parts.append("dvfields")
if self.indexSort:
name.append("sort=%s" % self.indexSort)

config_parts.append(f"sort={self.indexSort}")
if self.vectorFile:
name.append("vectors=%d" % self.vectorDimension)
config_parts.append(f"vectors={self.vectorDimension}")
if self.quantizeKNNGraph:
name.append("int8-quantized")
config_parts.append("int8-quantized")

config_str = "|".join([benchUtil.checkoutToName(self.checkout)] + config_parts)
config_hash = hashlib.md5(config_str.encode()).hexdigest()[:8]
name.append(config_hash)

name.append("nd%gM" % (self.numDocs / 1000000.0))
return ".".join(name)
Expand Down
7 changes: 6 additions & 1 deletion src/python/test_all_fst_sizes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# nocommit -- temporary tool

import glob
import pickle
import re
import subprocess
Expand All @@ -22,8 +23,12 @@

while True:
print(f"\nTest ram_mb={ram_mb}")
index_dirs = glob.glob("/l/indices/wikimediumall.trunk.*.nd33.3326M/index")
if not index_dirs:
raise RuntimeError("No matching index directory found")
index_path = index_dirs[0]
stdout = subprocess.check_output(
f"java -cp .:lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar IndexToFST /l/indices/wikimediumall.trunk.facets.taxonomy:Date.taxonomy:Month.taxonomy:DayOfYear.taxonomy:RandomLabel.taxonomy.sortedset:Date.sortedset:Month.sortedset:DayOfYear.sortedset:RandomLabel.sortedset.Lucene90.Lucene90.dvfields.nd33.3326M/index {ram_mb}",
f"java -cp .:lucene/core/build/libs/lucene-core-10.0.0-SNAPSHOT.jar IndexToFST {index_path} {ram_mb}",
shell=True,
)

Expand Down
Loading