1818import static com .nvidia .cuvs .internal .common .Util .*;
1919import static com .nvidia .cuvs .internal .panama .headers_h .cuvsVersionGet ;
2020import static com .nvidia .cuvs .internal .panama .headers_h .uint16_t ;
21+ import static com .nvidia .cuvs .internal .panama .headers_h_1 .cudaStreamSynchronize ;
2122
2223import com .nvidia .cuvs .*;
2324import com .nvidia .cuvs .internal .*;
25+ import com .nvidia .cuvs .internal .common .PinnedMemoryBuffer ;
2426import com .nvidia .cuvs .internal .common .Util ;
2527import java .io .IOException ;
2628import java .lang .foreign .Arena ;
@@ -216,7 +218,7 @@ public CuVSHostMatrix build() {
216218 public CuVSMatrix .Builder <CuVSDeviceMatrix > newDeviceMatrixBuilder (
217219 CuVSResources resources , long size , long columns , CuVSMatrix .DataType dataType )
218220 throws UnsupportedOperationException {
219- return new HeapSegmentBuilder (resources , size , columns , dataType );
221+ return new BufferedSegmentBuilder (resources , size , columns , dataType );
220222 }
221223
222224 @ Override
@@ -227,7 +229,7 @@ public CuVSMatrix.Builder<CuVSDeviceMatrix> newDeviceMatrixBuilder(
227229 int rowStride ,
228230 int columnStride ,
229231 CuVSMatrix .DataType dataType ) {
230- return new HeapSegmentBuilder (resources , size , columns , rowStride , columnStride , dataType );
232+ return new BufferedSegmentBuilder (resources , size , columns , rowStride , columnStride , dataType );
231233 }
232234
233235 @ Override
@@ -279,28 +281,38 @@ public CuVSMatrix newMatrixFromArray(byte[][] vectors) {
279281
/**
 * This {@link CuVSDeviceMatrix} builder implementation returns a {@link CuVSDeviceMatrix} backed by managed RMM
 * device memory. It uses a {@link PinnedMemoryBuffer} to stage rows in pinned host memory and batch them into
 * fewer, larger host-to-device copies, instead of issuing one {@code cudaMemcpyAsync} per row.
 */
287- private static class HeapSegmentBuilder implements CuVSMatrix .Builder <CuVSDeviceMatrix > {
286+ private static class BufferedSegmentBuilder implements CuVSMatrix .Builder <CuVSDeviceMatrix > {
287+
288288 private final long columns ;
289289 private final long size ;
290290 private final CuVSDeviceMatrixImpl matrix ;
291291 private final MemorySegment stream ;
292- private int current ;
293292
294- private HeapSegmentBuilder (
293+ private final long rowBytes ;
294+ private int currentRow ;
295+
296+ private final PinnedMemoryBuffer hostBuffer ;
297+ private final long bufferRowCount ;
298+ private int currentBufferRow ;
299+
/**
 * Creates a builder for a {@code size} x {@code columns} device matrix with default strides.
 * Allocates the RMM-backed device matrix up front and a pinned host staging buffer sized by
 * {@link PinnedMemoryBuffer}.
 *
 * @param resources the CuVS resources owning the CUDA stream
 * @param size      number of rows
 * @param columns   number of columns
 * @param dataType  element type of the matrix
 * @throws IllegalStateException if the pinned buffer cannot hold even a single row — in that
 *     case {@code internalAddVector} would never trigger a flush ({@code currentBufferRow}
 *     can never equal a {@code bufferRowCount} of 0 after being incremented) and would write
 *     past the end of the pinned allocation
 */
private BufferedSegmentBuilder(
    CuVSResources resources, long size, long columns, CuVSMatrix.DataType dataType) {
  this.columns = columns;
  this.size = size;
  this.matrix = CuVSDeviceMatrixRMMImpl.create(resources, size, columns, dataType);
  this.stream = Util.getStream(resources);
  this.currentRow = 0;

  this.hostBuffer = new PinnedMemoryBuffer(size, columns, matrix.valueLayout());

  this.rowBytes = columns * matrix.valueLayout().byteSize();
  this.bufferRowCount = Math.min((hostBuffer.size() / rowBytes), size);
  if (size > 0 && this.bufferRowCount == 0) {
    // Fail fast instead of silently overflowing the staging buffer on the first addVector.
    hostBuffer.close();
    throw new IllegalStateException(
        "pinned staging buffer (" + hostBuffer.size() + " bytes) is smaller than one row (" + rowBytes + " bytes)");
  }
  this.currentBufferRow = 0;
}
302314
303- private HeapSegmentBuilder (
315+ private BufferedSegmentBuilder (
304316 CuVSResources resources ,
305317 long size ,
306318 long columns ,
@@ -313,7 +325,13 @@ private HeapSegmentBuilder(
313325 CuVSDeviceMatrixRMMImpl .create (
314326 resources , size , columns , rowStride , columnStride , dataType );
315327 this .stream = Util .getStream (resources );
316- this .current = 0 ;
328+ this .currentRow = 0 ;
329+
330+ this .hostBuffer = new PinnedMemoryBuffer (size , columns , matrix .valueLayout ());
331+
332+ this .rowBytes = columns * matrix .valueLayout ().byteSize ();
333+ this .bufferRowCount = Math .min ((hostBuffer .size () / rowBytes ), size );
334+ this .currentBufferRow = 0 ;
317335 }
318336
319337 @ Override
@@ -347,19 +365,38 @@ public void addVector(int[] vector) {
347365 }
348366
349367 private void internalAddVector (MemorySegment vector ) {
350- if (current >= size ) {
368+ if (currentRow >= size ) {
351369 throw new ArrayIndexOutOfBoundsException ();
352370 }
371+ var hostBufferOffset = currentBufferRow * rowBytes ;
372+ MemorySegment .copy (vector , 0 , hostBuffer .address (), hostBufferOffset , rowBytes );
353373
354- long rowBytes = columns * matrix .valueLayout ().byteSize ();
374+ currentRow ++;
375+ currentBufferRow ++;
376+ if (currentBufferRow == bufferRowCount ) {
377+ flushBuffer ();
378+ }
379+ }
355380
356- var dstOffset = ((current ++) * rowBytes );
357- var dst = matrix .memorySegment ().asSlice (dstOffset );
358- cudaMemcpyAsync (dst , vector , rowBytes , CudaMemcpyKind .HOST_TO_DEVICE , stream );
381+ private void flushBuffer () {
382+ if (currentBufferRow > 0 ) {
383+ var deviceMemoryOffset = (currentRow - currentBufferRow ) * rowBytes ;
384+ var dst = matrix .memorySegment ().asSlice (deviceMemoryOffset );
385+ cudaMemcpyAsync (
386+ dst ,
387+ hostBuffer .address (),
388+ currentBufferRow * rowBytes ,
389+ CudaMemcpyKind .HOST_TO_DEVICE ,
390+ stream );
391+ currentBufferRow = 0 ;
392+ checkCudaError (cudaStreamSynchronize (stream ), "cudaStreamSynchronize" );
393+ }
359394 }
360395
361396 @ Override
362397 public CuVSDeviceMatrix build () {
398+ flushBuffer ();
399+ hostBuffer .close ();
363400 return matrix ;
364401 }
365402 }
0 commit comments