diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BruteForceIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BruteForceIndexImpl.java index 4dfef90261..83c08fedc6 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BruteForceIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BruteForceIndexImpl.java @@ -15,6 +15,7 @@ */ package com.nvidia.cuvs.internal; +import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT_BYTE_SIZE; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT_BYTE_SIZE; @@ -22,7 +23,6 @@ import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG_BYTE_SIZE; import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.HOST_TO_DEVICE; import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.INFER_DIRECTION; -import static com.nvidia.cuvs.internal.common.Util.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.Util.buildMemorySegment; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static com.nvidia.cuvs.internal.common.Util.concatenate; @@ -35,7 +35,6 @@ import static com.nvidia.cuvs.internal.panama.headers_h.cuvsBruteForceIndex_t; import static com.nvidia.cuvs.internal.panama.headers_h.cuvsBruteForceSearch; import static com.nvidia.cuvs.internal.panama.headers_h.cuvsBruteForceSerialize; -import static com.nvidia.cuvs.internal.panama.headers_h.cuvsRMMFree; import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSync; import static com.nvidia.cuvs.internal.panama.headers_h.omp_set_num_threads; @@ -45,6 +44,7 @@ import com.nvidia.cuvs.CuVSMatrix; import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.SearchResults; +import com.nvidia.cuvs.internal.common.CloseableRMMAllocation; import 
com.nvidia.cuvs.internal.panama.cuvsFilter; import java.io.InputStream; import java.io.OutputStream; @@ -118,20 +118,7 @@ public void destroyIndex() { try { int returnValue = cuvsBruteForceIndexDestroy(bruteForceIndexReference.indexPtr); checkCuVSError(returnValue, "cuvsBruteForceIndexDestroy"); - - if (bruteForceIndexReference.datasetBytes > 0) { - try (var resourcesAccessor = resources.access()) { - checkCuVSError( - cuvsRMMFree( - resourcesAccessor.handle(), - bruteForceIndexReference.datasetPtr, - bruteForceIndexReference.datasetBytes), - "cuvsRMMFree"); - } - } - if (bruteForceIndexReference.tensorDataArena != null) { - bruteForceIndexReference.tensorDataArena.close(); - } + bruteForceIndexReference.close(resources); } finally { destroyed = true; } @@ -158,25 +145,31 @@ private IndexReference build( try (var resourcesAccessor = resources.access()) { long cuvsResources = resourcesAccessor.handle(); - MemorySegment datasetMemorySegmentP = allocateRMMSegment(cuvsResources, datasetBytes); + try (var closeableDataMemorySegmentP = allocateRMMSegment(cuvsResources, datasetBytes)) { + MemorySegment datasetMemorySegmentP = closeableDataMemorySegmentP.handle(); - cudaMemcpy(datasetMemorySegmentP, datasetMemSegment, datasetBytes, INFER_DIRECTION); + cudaMemcpy(datasetMemorySegmentP, datasetMemSegment, datasetBytes, INFER_DIRECTION); - long[] datasetShape = {rows, cols}; - var tensorDataArena = Arena.ofShared(); - MemorySegment datasetTensor = - prepareTensor(tensorDataArena, datasetMemorySegmentP, datasetShape, 2, 32, 2, 1); + long[] datasetShape = {rows, cols}; + var tensorDataArena = Arena.ofShared(); + MemorySegment datasetTensor = + prepareTensor(tensorDataArena, datasetMemorySegmentP, datasetShape, 2, 32, 2, 1); - var returnValue = cuvsStreamSync(cuvsResources); - checkCuVSError(returnValue, "cuvsStreamSync"); + var returnValue = cuvsStreamSync(cuvsResources); + checkCuVSError(returnValue, "cuvsStreamSync"); - returnValue = cuvsBruteForceBuild(cuvsResources, 
datasetTensor, 0, 0.0f, index); - checkCuVSError(returnValue, "cuvsBruteForceBuild"); + returnValue = cuvsBruteForceBuild(cuvsResources, datasetTensor, 0, 0.0f, index); + checkCuVSError(returnValue, "cuvsBruteForceBuild"); - returnValue = cuvsStreamSync(cuvsResources); - checkCuVSError(returnValue, "cuvsStreamSync"); + returnValue = cuvsStreamSync(cuvsResources); + checkCuVSError(returnValue, "cuvsStreamSync"); - return new IndexReference(datasetMemorySegmentP, datasetBytes, tensorDataArena, index); + return new IndexReference( + new CloseableRMMAllocation(closeableDataMemorySegmentP), + datasetBytes, + tensorDataArena, + index); + } } finally { omp_set_num_threads(1); } @@ -205,6 +198,7 @@ public SearchResults search(BruteForceQuery cuvsQuery) throws Throwable { // prepare the prefiltering data final long prefilterDataLength; + final long prefilterBytes; final MemorySegment prefilterDataMemorySegment; BitSet[] prefilters = cuvsQuery.getPrefilters(); if (prefilters != null && prefilters.length > 0) { @@ -212,8 +206,11 @@ public SearchResults search(BruteForceQuery cuvsQuery) throws Throwable { long[] filters = concatenatedFilters.toLongArray(); prefilterDataMemorySegment = buildMemorySegment(localArena, filters); prefilterDataLength = (long) cuvsQuery.getNumDocs() * prefilters.length; + long[] prefilterShape = {(prefilterDataLength + 31) / 32}; + prefilterBytes = C_INT_BYTE_SIZE * prefilterShape[0]; } else { prefilterDataLength = 0; + prefilterBytes = 0; prefilterDataMemorySegment = MemorySegment.NULL; } @@ -223,77 +220,66 @@ public SearchResults search(BruteForceQuery cuvsQuery) throws Throwable { try (var resourcesAccessor = cuvsQuery.getResources().access()) { long cuvsResources = resourcesAccessor.handle(); - long queriesBytes = C_FLOAT_BYTE_SIZE * numQueries * vectorDimension; - long neighborsBytes = C_LONG_BYTE_SIZE * numQueries * topk; - long distanceBytes = C_FLOAT_BYTE_SIZE * numQueries * topk; - long prefilterBytes = 0; // size assigned later - - 
MemorySegment queriesDP = allocateRMMSegment(cuvsResources, queriesBytes); - MemorySegment neighborsDP = allocateRMMSegment(cuvsResources, neighborsBytes); - MemorySegment distancesDP = allocateRMMSegment(cuvsResources, distanceBytes); - MemorySegment prefilterDP = MemorySegment.NULL; - - cudaMemcpy(queriesDP, querySeg, queriesBytes, INFER_DIRECTION); - - long[] queriesShape = {numQueries, vectorDimension}; - MemorySegment queriesTensor = - prepareTensor(localArena, queriesDP, queriesShape, 2, 32, 2, 1); - long[] neighborsShape = {numQueries, topk}; - MemorySegment neighborsTensor = - prepareTensor(localArena, neighborsDP, neighborsShape, 0, 64, 2, 1); - long[] distancesShape = {numQueries, topk}; - MemorySegment distancesTensor = - prepareTensor(localArena, distancesDP, distancesShape, 2, 32, 2, 1); - - MemorySegment prefilter = cuvsFilter.allocate(localArena); - MemorySegment prefilterTensor; - - if (prefilterDataMemorySegment == MemorySegment.NULL) { - cuvsFilter.type(prefilter, 0); // NO_FILTER - cuvsFilter.addr(prefilter, 0); - } else { - long[] prefilterShape = {(prefilterDataLength + 31) / 32}; - long prefilterLen = prefilterShape[0]; - prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; - - prefilterDP = allocateRMMSegment(cuvsResources, prefilterBytes); - - cudaMemcpy(prefilterDP, prefilterDataMemorySegment, prefilterBytes, HOST_TO_DEVICE); - - prefilterTensor = prepareTensor(localArena, prefilterDP, prefilterShape, 1, 32, 2, 1); - - cuvsFilter.type(prefilter, 2); - cuvsFilter.addr(prefilter, prefilterTensor.address()); - } - - var returnValue = cuvsStreamSync(cuvsResources); - checkCuVSError(returnValue, "cuvsStreamSync"); - - returnValue = - cuvsBruteForceSearch( - cuvsResources, - bruteForceIndexReference.indexPtr, - queriesTensor, - neighborsTensor, - distancesTensor, - prefilter); - checkCuVSError(returnValue, "cuvsBruteForceSearch"); - - returnValue = cuvsStreamSync(cuvsResources); - checkCuVSError(returnValue, "cuvsStreamSync"); - - 
cudaMemcpy(neighborsMemorySegment, neighborsDP, neighborsBytes, INFER_DIRECTION); - cudaMemcpy(distancesMemorySegment, distancesDP, distanceBytes, INFER_DIRECTION); - - returnValue = cuvsRMMFree(cuvsResources, neighborsDP, neighborsBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - returnValue = cuvsRMMFree(cuvsResources, distancesDP, distanceBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - returnValue = cuvsRMMFree(cuvsResources, queriesDP, queriesBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - if (prefilterBytes > 0) { - returnValue = cuvsRMMFree(cuvsResources, prefilterDP, prefilterBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); + final long queriesBytes = C_FLOAT_BYTE_SIZE * numQueries * vectorDimension; + final long neighborsBytes = C_LONG_BYTE_SIZE * numQueries * topk; + final long distanceBytes = C_FLOAT_BYTE_SIZE * numQueries * topk; + + try (var queriesDP = allocateRMMSegment(cuvsResources, queriesBytes); + var neighborsDP = allocateRMMSegment(cuvsResources, neighborsBytes); + var distancesDP = allocateRMMSegment(cuvsResources, distanceBytes); + var prefilterDP = + prefilterBytes > 0 + ? 
allocateRMMSegment(cuvsResources, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + + cudaMemcpy(queriesDP.handle(), querySeg, queriesBytes, INFER_DIRECTION); + + long[] queriesShape = {numQueries, vectorDimension}; + MemorySegment queriesTensor = + prepareTensor(localArena, queriesDP.handle(), queriesShape, 2, 32, 2, 1); + long[] neighborsShape = {numQueries, topk}; + MemorySegment neighborsTensor = + prepareTensor(localArena, neighborsDP.handle(), neighborsShape, 0, 64, 2, 1); + long[] distancesShape = {numQueries, topk}; + MemorySegment distancesTensor = + prepareTensor(localArena, distancesDP.handle(), distancesShape, 2, 32, 2, 1); + + MemorySegment prefilter = cuvsFilter.allocate(localArena); + MemorySegment prefilterTensor; + + if (prefilterDataMemorySegment == MemorySegment.NULL) { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } else { + long[] prefilterShape = {(prefilterDataLength + 31) / 32}; + cudaMemcpy( + prefilterDP.handle(), prefilterDataMemorySegment, prefilterBytes, HOST_TO_DEVICE); + + prefilterTensor = + prepareTensor(localArena, prefilterDP.handle(), prefilterShape, 1, 32, 2, 1); + + cuvsFilter.type(prefilter, 2); + cuvsFilter.addr(prefilter, prefilterTensor.address()); + } + + var returnValue = cuvsStreamSync(cuvsResources); + checkCuVSError(returnValue, "cuvsStreamSync"); + + returnValue = + cuvsBruteForceSearch( + cuvsResources, + bruteForceIndexReference.indexPtr, + queriesTensor, + neighborsTensor, + distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsBruteForceSearch"); + + returnValue = cuvsStreamSync(cuvsResources); + checkCuVSError(returnValue, "cuvsStreamSync"); + + cudaMemcpy(neighborsMemorySegment, neighborsDP.handle(), neighborsBytes, INFER_DIRECTION); + cudaMemcpy(distancesMemorySegment, distancesDP.handle(), distanceBytes, INFER_DIRECTION); } } return BruteForceSearchResults.create( @@ -479,27 +465,39 @@ public BruteForceIndexImpl build() throws Throwable { */ private static 
class IndexReference { - private final MemorySegment datasetPtr; + private final CloseableRMMAllocation datasetAllocationHandle; private final long datasetBytes; private final Arena tensorDataArena; private final MemorySegment indexPtr; private IndexReference( - MemorySegment datasetPtr, + CloseableRMMAllocation datasetAllocationHandle, long datasetBytes, Arena tensorDataArena, MemorySegment indexPtr) { - this.datasetPtr = datasetPtr; + this.datasetAllocationHandle = datasetAllocationHandle; this.datasetBytes = datasetBytes; this.tensorDataArena = tensorDataArena; this.indexPtr = indexPtr; } private IndexReference(MemorySegment indexPtr) { - this.datasetPtr = MemorySegment.NULL; + this.datasetAllocationHandle = CloseableRMMAllocation.EMPTY; this.datasetBytes = 0; this.tensorDataArena = null; this.indexPtr = indexPtr; } + + /** + * Free up the memory used for dataset, tensor-data. + */ + private void close(CuVSResources resources) { + try (var resourcesAccessor = resources.access()) { + datasetAllocationHandle.close(); + } + if (tensorDataArena != null) { + tensorDataArena.close(); + } + } } } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java index 704bc72b34..ac3e9ba43a 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java @@ -16,13 +16,13 @@ package com.nvidia.cuvs.internal; import static com.nvidia.cuvs.internal.CuVSParamsHelper.*; +import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT_BYTE_SIZE; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT_BYTE_SIZE; import static 
com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.HOST_TO_DEVICE; import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.INFER_DIRECTION; -import static com.nvidia.cuvs.internal.common.Util.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.Util.buildMemorySegment; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static com.nvidia.cuvs.internal.common.Util.concatenate; @@ -43,6 +43,7 @@ import com.nvidia.cuvs.CuVSResources; import com.nvidia.cuvs.SearchResults; import com.nvidia.cuvs.internal.common.CloseableHandle; +import com.nvidia.cuvs.internal.common.CloseableRMMAllocation; import com.nvidia.cuvs.internal.common.CompositeCloseableHandle; import com.nvidia.cuvs.internal.panama.cuvsCagraCompressionParams; import com.nvidia.cuvs.internal.panama.cuvsCagraIndexParams; @@ -212,6 +213,8 @@ private static MemorySegment createCagraIndex() { } } + private static final BitSet[] EMPTY_PREFILTER_BITSET = new BitSet[0]; + /** * Invokes the native search_cagra_index via the Panama API for searching a * CAGRA index. @@ -235,100 +238,92 @@ public SearchResults search(CagraQuery query) throws Throwable { MemorySegment distancesMemorySegment = localArena.allocate(distancesSequenceLayout); MemorySegment floatsSeg = buildMemorySegment(localArena, query.getQueryVectors()); - long queriesBytes = C_FLOAT_BYTE_SIZE * numQueries * vectorDimension; - long neighborsBytes = C_INT_BYTE_SIZE * numQueries * topK; - long distancesBytes = C_FLOAT_BYTE_SIZE * numQueries * topK; + final long queriesBytes = C_FLOAT_BYTE_SIZE * numQueries * vectorDimension; + final long neighborsBytes = C_INT_BYTE_SIZE * numQueries * topK; + final long distancesBytes = C_FLOAT_BYTE_SIZE * numQueries * topK; + final boolean hasPreFilter = query.getPrefilter() != null; + final BitSet[] prefilters = + hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; + final long prefilterDataLength = hasPreFilter ? 
query.getNumDocs() * prefilters.length : 0; + final long prefilterLen = hasPreFilter ? (prefilterDataLength + 31) / 32 : 0; + final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; try (var resourcesAccessor = query.getResources().access()) { var cuvsRes = resourcesAccessor.handle(); - MemorySegment queriesDP = allocateRMMSegment(cuvsRes, queriesBytes); - MemorySegment neighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); - MemorySegment distancesDP = allocateRMMSegment(cuvsRes, distancesBytes); - MemorySegment prefilterDP = MemorySegment.NULL; - long prefilterLen = 0; - - cudaMemcpy(queriesDP, floatsSeg, queriesBytes, INFER_DIRECTION); - - long[] queriesShape = {numQueries, vectorDimension}; - MemorySegment queriesTensor = - prepareTensor(localArena, queriesDP, queriesShape, 2, 32, 2, 1); - long[] neighborsShape = {numQueries, topK}; - MemorySegment neighborsTensor = - prepareTensor(localArena, neighborsDP, neighborsShape, 1, 32, 2, 1); - long[] distancesShape = {numQueries, topK}; - MemorySegment distancesTensor = - prepareTensor(localArena, distancesDP, distancesShape, 2, 32, 2, 1); - - var returnValue = cuvsStreamSync(cuvsRes); - checkCuVSError(returnValue, "cuvsStreamSync"); - - // prepare the prefiltering data - long prefilterDataLength = 0; - MemorySegment prefilterDataMemorySegment = MemorySegment.NULL; - BitSet[] prefilters; - if (query.getPrefilter() != null) { - prefilters = new BitSet[] {query.getPrefilter()}; - BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); - long[] filters = concatenatedFilters.toLongArray(); - prefilterDataMemorySegment = buildMemorySegment(localArena, filters); - prefilterDataLength = query.getNumDocs() * prefilters.length; - } - - MemorySegment prefilter = cuvsFilter.allocate(localArena); - MemorySegment prefilterTensor; - - final long prefilterBytes; - - if (prefilterDataMemorySegment == MemorySegment.NULL) { - cuvsFilter.type(prefilter, 0); // NO_FILTER - cuvsFilter.addr(prefilter, 0); - 
prefilterBytes = 0; - } else { - long[] prefilterShape = {(prefilterDataLength + 31) / 32}; - prefilterLen = prefilterShape[0]; - prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; - - prefilterDP = allocateRMMSegment(cuvsRes, prefilterBytes); - - cudaMemcpy(prefilterDP, prefilterDataMemorySegment, prefilterBytes, HOST_TO_DEVICE); - - prefilterTensor = prepareTensor(localArena, prefilterDP, prefilterShape, 1, 32, 2, 1); - - cuvsFilter.type(prefilter, 1); - cuvsFilter.addr(prefilter, prefilterTensor.address()); - } - - returnValue = cuvsStreamSync(cuvsRes); - checkCuVSError(returnValue, "cuvsStreamSync"); - - returnValue = - cuvsCagraSearch( - cuvsRes, - segmentFromSearchParams(localArena, query.getCagraSearchParameters()), - cagraIndexReference.getMemorySegment(), - queriesTensor, - neighborsTensor, - distancesTensor, - prefilter); - checkCuVSError(returnValue, "cuvsCagraSearch"); - - returnValue = cuvsStreamSync(cuvsRes); - checkCuVSError(returnValue, "cuvsStreamSync"); - - cudaMemcpy(neighborsMemorySegment, neighborsDP, neighborsBytes, INFER_DIRECTION); - cudaMemcpy(distancesMemorySegment, distancesDP, distancesBytes, INFER_DIRECTION); - - returnValue = cuvsRMMFree(cuvsRes, distancesDP, distancesBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - returnValue = cuvsRMMFree(cuvsRes, neighborsDP, neighborsBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - returnValue = cuvsRMMFree(cuvsRes, queriesDP, queriesBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - - if (prefilterLen > 0) { - returnValue = cuvsRMMFree(cuvsRes, prefilterDP, C_INT_BYTE_SIZE * prefilterBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); + try (var queriesDP = allocateRMMSegment(cuvsRes, queriesBytes); + var neighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); + var distancesDP = allocateRMMSegment(cuvsRes, distancesBytes); + var prefilterDP = + hasPreFilter + ? 
allocateRMMSegment(cuvsRes, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + + cudaMemcpy(queriesDP.handle(), floatsSeg, queriesBytes, INFER_DIRECTION); + + long[] queriesShape = {numQueries, vectorDimension}; + MemorySegment queriesTensor = + prepareTensor( + localArena, queriesDP.handle(), queriesShape, kDLFloat(), 32, kDLCUDA(), 1); + long[] neighborsShape = {numQueries, topK}; + MemorySegment neighborsTensor = + prepareTensor( + localArena, neighborsDP.handle(), neighborsShape, kDLUInt(), 32, kDLCUDA(), 1); + long[] distancesShape = {numQueries, topK}; + MemorySegment distancesTensor = + prepareTensor( + localArena, distancesDP.handle(), distancesShape, kDLFloat(), 32, kDLCUDA(), 1); + + var returnValue = cuvsStreamSync(cuvsRes); + checkCuVSError(returnValue, "cuvsStreamSync"); + + // prepare the prefiltering data + MemorySegment prefilterDataMemorySegment = MemorySegment.NULL; + if (hasPreFilter) { + BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); + long[] filters = concatenatedFilters.toLongArray(); + prefilterDataMemorySegment = buildMemorySegment(localArena, filters); + } + + MemorySegment prefilter = cuvsFilter.allocate(localArena); + MemorySegment prefilterTensor; + + if (!hasPreFilter) { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } else { + long[] prefilterShape = {prefilterLen}; + + cudaMemcpy( + prefilterDP.handle(), prefilterDataMemorySegment, prefilterBytes, HOST_TO_DEVICE); + + prefilterTensor = + prepareTensor( + localArena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA(), 1); + + cuvsFilter.type(prefilter, 1); + cuvsFilter.addr(prefilter, prefilterTensor.address()); + } + + returnValue = cuvsStreamSync(cuvsRes); + checkCuVSError(returnValue, "cuvsStreamSync"); + + returnValue = + cuvsCagraSearch( + cuvsRes, + segmentFromSearchParams(localArena, query.getCagraSearchParameters()), + cagraIndexReference.getMemorySegment(), + queriesTensor, + neighborsTensor, + 
distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsCagraSearch"); + + returnValue = cuvsStreamSync(cuvsRes); + checkCuVSError(returnValue, "cuvsStreamSync"); + + cudaMemcpy(neighborsMemorySegment, neighborsDP.handle(), neighborsBytes, INFER_DIRECTION); + cudaMemcpy(distancesMemorySegment, distancesDP.handle(), distancesBytes, INFER_DIRECTION); } } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/TieredIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/TieredIndexImpl.java index ef9d33cd43..31d8cd4bec 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/TieredIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/TieredIndexImpl.java @@ -16,13 +16,13 @@ package com.nvidia.cuvs.internal; import static com.nvidia.cuvs.internal.CuVSParamsHelper.createTieredIndexParams; +import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT_BYTE_SIZE; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT_BYTE_SIZE; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG_BYTE_SIZE; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER; -import static com.nvidia.cuvs.internal.common.Util.*; import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.*; import static com.nvidia.cuvs.internal.common.Util.buildMemorySegment; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; @@ -34,6 +34,7 @@ import com.nvidia.cuvs.*; import com.nvidia.cuvs.CuVSMatrix; import com.nvidia.cuvs.internal.common.CloseableHandle; +import com.nvidia.cuvs.internal.common.CloseableRMMAllocation; import com.nvidia.cuvs.internal.common.Util; import com.nvidia.cuvs.internal.panama.cuvsCagraIndexParams; import 
com.nvidia.cuvs.internal.panama.cuvsCagraSearchParams; @@ -48,7 +49,7 @@ import java.util.Objects; /** - * {@link TieredIndex} encapscaps a Tiered index, along with methods to interact + * {@link TieredIndex} encapsulates a Tiered index, along with methods to interact * with it. *

* TieredIndex is a hybrid index that combines brute force search for small datasets @@ -134,39 +135,39 @@ private IndexReference build() { // TieredIndex REQUIRES device memory - allocate it long datasetSize = C_FLOAT_BYTE_SIZE * rows * cols; - MemorySegment datasetDP = allocateRMMSegment(cuvsRes, datasetSize); + try (var datasetDP = allocateRMMSegment(cuvsRes, datasetSize)) { + // Copy host to device + Util.cudaMemcpy(datasetDP.handle(), hostDataSeg, datasetSize, HOST_TO_DEVICE); - // Copy host to device - Util.cudaMemcpy(datasetDP, hostDataSeg, datasetSize, HOST_TO_DEVICE); + // Create tensor from device memory + long[] datasetShape = {rows, cols}; + MemorySegment datasetTensor = + prepareTensor( + localArena, datasetDP.handle(), datasetShape, kDLFloat(), 32, kDLCUDA(), 1); - // Create tensor from device memory - long[] datasetShape = {rows, cols}; - MemorySegment datasetTensor = - prepareTensor(localArena, datasetDP, datasetShape, kDLFloat(), 32, kDLCUDA(), 1); + MemorySegment index = localArena.allocate(cuvsTieredIndex_t); + var returnValue = cuvsTieredIndexCreate(index); + checkCuVSError(returnValue, "cuvsTieredIndexCreate"); - MemorySegment index = localArena.allocate(cuvsTieredIndex_t); - var returnValue = cuvsTieredIndexCreate(index); - checkCuVSError(returnValue, "cuvsTieredIndexCreate"); + returnValue = cuvsStreamSync(cuvsRes); + checkCuVSError(returnValue, "cuvsStreamSync"); - returnValue = cuvsStreamSync(cuvsRes); - checkCuVSError(returnValue, "cuvsStreamSync"); + // Extract the actual index pointer that was written by Create + MemorySegment actualIndexPtr = index.get(C_POINTER, 0); - // Extract the actual index pointer that was written by Create - MemorySegment actualIndexPtr = index.get(C_POINTER, 0); + returnValue = + cuvsTieredIndexBuild( + cuvsRes, indexParamsMemorySegment, datasetTensor, actualIndexPtr); + checkCuVSError(returnValue, "cuvsTieredIndexBuild"); - returnValue = - cuvsTieredIndexBuild(cuvsRes, indexParamsMemorySegment, datasetTensor, 
actualIndexPtr); - checkCuVSError(returnValue, "cuvsTieredIndexBuild"); - - // Clean up device memory after build - returnValue = cuvsRMMFree(cuvsRes, datasetDP, datasetSize); - checkCuVSError(returnValue, "cuvsRMMFree"); - - return new IndexReference(actualIndexPtr); + return new IndexReference(actualIndexPtr); + } } } } + private static final BitSet[] EMPTY_PREFILTER_BITSET = new BitSet[0]; + /** * Translates C search_tiered_index function to Java * Invokes the native search_tiered_index via the Panama API for searching a @@ -196,116 +197,119 @@ public SearchResults search(TieredIndexQuery query) throws Throwable { // Get host query data MemorySegment hostQueriesSeg = Util.buildMemorySegment(localArena, query.getQueryVectors()); + final long queriesBytes = C_FLOAT_BYTE_SIZE * numQueries * vectorDimension; + final long neighborsBytes = C_LONG_BYTE_SIZE * numQueries * topK; // 64-bit for tiered index + final long distancesBytes = C_FLOAT_BYTE_SIZE * numQueries * topK; + final boolean hasPreFilter = query.getPrefilter() != null; + final BitSet[] prefilters = + hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; + final long prefilterDataLength = hasPreFilter ? query.getNumDocs() * prefilters.length : 0; + final long prefilterLen = hasPreFilter ? 
(prefilterDataLength + 31) / 32 : 0; + final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; + try (var resourceAccess = query.getResources().access()) { long cuvsRes = resourceAccess.handle(); - long queriesBytes = C_FLOAT_BYTE_SIZE * numQueries * vectorDimension; - long neighborsBytes = C_LONG_BYTE_SIZE * numQueries * topK; // 64-bit for tiered index - long distancesBytes = C_FLOAT_BYTE_SIZE * numQueries * topK; - // Allocate DEVICE memory for all data - MemorySegment queriesDP = allocateRMMSegment(cuvsRes, queriesBytes); - MemorySegment neighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); - MemorySegment distancesDP = allocateRMMSegment(cuvsRes, distancesBytes); - - // Copy queries from host to device - var returnValue = - cudaMemcpy(queriesDP, hostQueriesSeg, queriesBytes, cudaMemcpyHostToDevice()); - checkCudaError(returnValue, "cudaMemcpy"); - - // Create tensors from device memory - long[] queriesShape = {numQueries, vectorDimension}; - MemorySegment queriesTensor = - prepareTensor(localArena, queriesDP, queriesShape, kDLFloat(), 32, kDLCUDA(), 1); - long[] neighborsShape = {numQueries, topK}; - MemorySegment neighborsTensor = - prepareTensor( - localArena, neighborsDP, neighborsShape, kDLInt(), 64, kDLCUDA(), 1); // 64-bit int - long[] distancesShape = {numQueries, topK}; - MemorySegment distancesTensor = - prepareTensor(localArena, distancesDP, distancesShape, kDLFloat(), 32, kDLCUDA(), 1); - - // Sync before prefilter setup - returnValue = cuvsStreamSync(cuvsRes); - checkCuVSError(returnValue, "cuvsStreamSync"); - - // Handle prefilter - MemorySegment prefilter = cuvsFilter.allocate(localArena); - final MemorySegment prefilterDP; - final long prefilterBytes; - - if (query.getPrefilter() != null) { - BitSet[] prefilters = new BitSet[] {query.getPrefilter()}; - BitSet concatenatedFilters = concatenate(prefilters, (int) query.getNumDocs()); - long[] filters = concatenatedFilters.toLongArray(); - MemorySegment hostPrefilterSeg = 
buildMemorySegment(localArena, filters); - - long prefilterDataLength = query.getNumDocs() * prefilters.length; - long[] prefilterShape = {(prefilterDataLength + 31) / 32}; - long prefilterLen = prefilterShape[0]; - prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; - - // Allocate device memory for prefilter - prefilterDP = allocateRMMSegment(cuvsRes, prefilterBytes); - - // Copy prefilter to device - checkCudaError( - cudaMemcpy(prefilterDP, hostPrefilterSeg, prefilterBytes, cudaMemcpyHostToDevice()), - "cudaMemcpy"); - - MemorySegment prefilterTensor = - prepareTensor(localArena, prefilterDP, prefilterShape, kDLUInt(), 32, kDLCUDA(), 1); - - cuvsFilter.type(prefilter, 1); // BITSET - cuvsFilter.addr(prefilter, prefilterTensor.address()); - } else { - prefilterDP = MemorySegment.NULL; - prefilterBytes = 0; - cuvsFilter.type(prefilter, 0); // NO_FILTER - cuvsFilter.addr(prefilter, 0); - } - - // Perform search - returnValue = - cuvsTieredIndexSearch( - cuvsRes, - segmentFromSearchParams(query.getCagraSearchParameters(), localArena), - tieredIndexReference.getMemorySegment(), - queriesTensor, - neighborsTensor, - distancesTensor, - prefilter); - checkCuVSError(returnValue, "cuvsTieredIndexSearch"); - - // Copy results from device to host - returnValue = - cudaMemcpy(neighborsSeg, neighborsDP, neighborsBytes, cudaMemcpyDeviceToHost()); - checkCudaError(returnValue, "cudaMemcpy"); - returnValue = - cudaMemcpy(distancesSeg, distancesDP, distancesBytes, cudaMemcpyDeviceToHost()); - checkCudaError(returnValue, "cudaMemcpy"); - - // Clean up device memory - returnValue = cuvsRMMFree(cuvsRes, queriesDP, queriesBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - returnValue = cuvsRMMFree(cuvsRes, neighborsDP, neighborsBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - returnValue = cuvsRMMFree(cuvsRes, distancesDP, distancesBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); - - if (prefilterDP != MemorySegment.NULL) { - returnValue = cuvsRMMFree(cuvsRes, 
prefilterDP, prefilterBytes); - checkCuVSError(returnValue, "cuvsRMMFree"); + try (var queriesDP = allocateRMMSegment(cuvsRes, queriesBytes); + var neighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); + var distancesDP = allocateRMMSegment(cuvsRes, distancesBytes); + var prefilterDP = + hasPreFilter + ? allocateRMMSegment(cuvsRes, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + + // Copy queries from host to device + var returnValue = + cudaMemcpy( + queriesDP.handle(), hostQueriesSeg, queriesBytes, cudaMemcpyHostToDevice()); + checkCudaError(returnValue, "cudaMemcpy"); + + // Create tensors from device memory + long[] queriesShape = {numQueries, vectorDimension}; + MemorySegment queriesTensor = + prepareTensor( + localArena, queriesDP.handle(), queriesShape, kDLFloat(), 32, kDLCUDA(), 1); + long[] neighborsShape = {numQueries, topK}; + MemorySegment neighborsTensor = + prepareTensor( + localArena, + neighborsDP.handle(), + neighborsShape, + kDLInt(), + 64, + kDLCUDA(), + 1); // 64-bit int + long[] distancesShape = {numQueries, topK}; + MemorySegment distancesTensor = + prepareTensor( + localArena, distancesDP.handle(), distancesShape, kDLFloat(), 32, kDLCUDA(), 1); + + // Sync before prefilter setup + returnValue = cuvsStreamSync(cuvsRes); + checkCuVSError(returnValue, "cuvsStreamSync"); + + // Handle prefilter + MemorySegment prefilter = cuvsFilter.allocate(localArena); + + if (hasPreFilter) { + BitSet concatenatedFilters = concatenate(prefilters, (int) query.getNumDocs()); + long[] filters = concatenatedFilters.toLongArray(); + MemorySegment hostPrefilterSeg = buildMemorySegment(localArena, filters); + + long[] prefilterShape = {prefilterLen}; + + // Copy prefilter to device + checkCudaError( + cudaMemcpy( + prefilterDP.handle(), + hostPrefilterSeg, + prefilterBytes, + cudaMemcpyHostToDevice()), + "cudaMemcpy"); + + MemorySegment prefilterTensor = + prepareTensor( + localArena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA(), 1); + + 
cuvsFilter.type(prefilter, 1); // BITSET + cuvsFilter.addr(prefilter, prefilterTensor.address()); + } else { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } + + // Perform search + returnValue = + cuvsTieredIndexSearch( + cuvsRes, + segmentFromSearchParams(query.getCagraSearchParameters(), localArena), + tieredIndexReference.getMemorySegment(), + queriesTensor, + neighborsTensor, + distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsTieredIndexSearch"); + + // Copy results from device to host + returnValue = + cudaMemcpy( + neighborsSeg, neighborsDP.handle(), neighborsBytes, cudaMemcpyDeviceToHost()); + checkCudaError(returnValue, "cudaMemcpy"); + returnValue = + cudaMemcpy( + distancesSeg, distancesDP.handle(), distancesBytes, cudaMemcpyDeviceToHost()); + checkCudaError(returnValue, "cudaMemcpy"); + + return TieredSearchResultsImpl.create( + neighborsLayout, + distancesLayout, + neighborsSeg, + distancesSeg, + topK, + query.getMapping(), + numQueries); } - - return TieredSearchResultsImpl.create( - neighborsLayout, - distancesLayout, - neighborsSeg, - distancesSeg, - topK, - query.getMapping(), - numQueries); } } } @@ -333,25 +337,25 @@ private void performExtend(CuVSMatrix extendDataset) { // Allocate device memory for extend data long dataSize = C_FLOAT_BYTE_SIZE * rows * cols; - MemorySegment datasetDP = allocateRMMSegment(cuvsRes, dataSize); - - // Copy host to device - checkCudaError( - cudaMemcpy(datasetDP, hostDataSeg, dataSize, cudaMemcpyHostToDevice()), "cudaMemcpy"); - - // Create tensor from device memory - long[] datasetShape = {rows, cols}; - MemorySegment datasetTensor = - prepareTensor(localArena, datasetDP, datasetShape, kDLFloat(), 32, kDLCUDA(), 1); + try (var datasetDP = allocateRMMSegment(cuvsRes, dataSize)) { + // Copy host to device + checkCudaError( + cudaMemcpy(datasetDP.handle(), hostDataSeg, dataSize, cudaMemcpyHostToDevice()), + "cudaMemcpy"); - checkCuVSError(cuvsStreamSync(cuvsRes), 
"cuvsStreamSync"); + // Create tensor from device memory + long[] datasetShape = {rows, cols}; + MemorySegment datasetTensor = + prepareTensor( + localArena, datasetDP.handle(), datasetShape, kDLFloat(), 32, kDLCUDA(), 1); - checkCuVSError( - cuvsTieredIndexExtend(cuvsRes, datasetTensor, tieredIndexReference.getMemorySegment()), - "cuvsTieredIndexExtend"); + checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync"); - // Clean up device memory - checkCuVSError(cuvsRMMFree(cuvsRes, datasetDP, dataSize), "cuvsRMMFree"); + checkCuVSError( + cuvsTieredIndexExtend( + cuvsRes, datasetTensor, tieredIndexReference.getMemorySegment()), + "cuvsTieredIndexExtend"); + } } } } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/CloseableRMMAllocation.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/CloseableRMMAllocation.java new file mode 100644 index 0000000000..6dcc4c892f --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/CloseableRMMAllocation.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package com.nvidia.cuvs.internal.common;

import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER;
import static com.nvidia.cuvs.internal.common.Util.checkCuVSError;
import static com.nvidia.cuvs.internal.panama.headers_h.cuvsRMMAlloc;
import static com.nvidia.cuvs.internal.panama.headers_h.cuvsRMMFree;

import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;

/**
 * A closeable handle for an RMM (RAPIDS Memory Manager) device allocation, usable with
 * try-with-resources. Owns the device memory it wraps and releases it exactly once via
 * {@code cuvsRMMFree} when {@link #close()} is invoked; subsequent {@code close()} calls
 * are no-ops, matching the {@link AutoCloseable} recommendation of idempotent close.
 *
 * <p>Not thread-safe: ownership transfer and close must be externally synchronized if the
 * handle is shared across threads.
 */
public class CloseableRMMAllocation implements CloseableHandle {

  /** Shared sentinel representing "no allocation"; its {@code close()} is a no-op. */
  public static final CloseableRMMAllocation EMPTY =
      new CloseableRMMAllocation(0, 0, MemorySegment.NULL);

  private final long cuvsResourceHandle;
  private long numBytes;
  private MemorySegment pointer;

  private CloseableRMMAllocation(long cuvsResourceHandle, long numBytes, MemorySegment pointer) {
    this.cuvsResourceHandle = cuvsResourceHandle;
    this.numBytes = numBytes;
    this.pointer = pointer;
  }

  /**
   * Copy constructor that transfers ownership of {@code other}'s device pointer to this
   * object. After this call {@code other} is empty and closing it does nothing.
   *
   * @param other the allocation whose ownership is taken over
   */
  public CloseableRMMAllocation(CloseableRMMAllocation other) {
    this.cuvsResourceHandle = other.cuvsResourceHandle;
    this.numBytes = other.numBytes;
    this.pointer = other.release();
  }

  /**
   * Allocates {@code numBytes} of device memory through RMM and wraps the resulting pointer
   * in an owning handle.
   *
   * @param cuvsResourceHandle native cuVS resources handle used for the allocation
   * @param numBytes number of bytes to allocate
   * @return an owning handle that frees the memory on {@link #close()}
   */
  public static CloseableRMMAllocation allocateRMMSegment(long cuvsResourceHandle, long numBytes) {
    try (var localArena = Arena.ofConfined()) {
      // cuvsRMMAlloc writes the device pointer into this out-parameter slot.
      MemorySegment pointerSlot = localArena.allocate(C_POINTER);
      checkCuVSError(
          cuvsRMMAlloc(cuvsResourceHandle, pointerSlot, numBytes), "cuvsRMMAlloc");
      return new CloseableRMMAllocation(
          cuvsResourceHandle, numBytes, pointerSlot.get(C_POINTER, 0));
    }
  }

  /** Returns the raw device pointer, or {@code MemorySegment.NULL} if empty or released. */
  @Override
  public MemorySegment handle() {
    return pointer;
  }

  /** Relinquishes ownership: returns the current pointer and marks this handle empty. */
  private MemorySegment release() {
    var released = pointer;
    pointer = MemorySegment.NULL;
    numBytes = 0;
    return released;
  }

  /**
   * Frees the device memory if this handle still owns it. Idempotent: the pointer is
   * cleared before the free so a second {@code close()} cannot double-free the allocation
   * (the original implementation left {@code pointer} set after freeing, making repeated
   * close calls a double-free).
   */
  @Override
  public void close() {
    if (pointer != MemorySegment.NULL) {
      // Capture size before release() zeroes it, then clear ownership before freeing.
      long bytesToFree = numBytes;
      MemorySegment toFree = release();
      checkCuVSError(cuvsRMMFree(cuvsResourceHandle, toFree, bytesToFree), "cuvsRMMFree");
    }
  }
}
static com.nvidia.cuvs.internal.panama.headers_h.cudaMemGetInfo; import static com.nvidia.cuvs.internal.panama.headers_h.cudaSetDevice; -import static com.nvidia.cuvs.internal.panama.headers_h.cuvsRMMAlloc; import static com.nvidia.cuvs.internal.panama.headers_h.size_t; import com.nvidia.cuvs.GPUInfo; @@ -351,15 +349,4 @@ public static MemorySegment prepareTensor( return tensor; } - - public static MemorySegment allocateRMMSegment(long resourceHandle, long datasetBytes) { - try (var localArena = Arena.ofConfined()) { - MemorySegment datasetMemorySegment = localArena.allocate(C_POINTER); - - var returnValue = cuvsRMMAlloc(resourceHandle, datasetMemorySegment, datasetBytes); - checkCuVSError(returnValue, "cuvsRMMAlloc"); - - return datasetMemorySegment.get(C_POINTER, 0); - } - } }