Skip to content

Commit 3e3212f

Browse files
authored
Merge branch 'branch-25.10' into java/cagra-index-int8
2 parents 077d855 + c07ab70 commit 3e3212f

3 files changed

Lines changed: 145 additions & 59 deletions

File tree

java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSDeviceMatrixImpl.java

Lines changed: 12 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -15,21 +15,17 @@
1515
*/
1616
package com.nvidia.cuvs.internal;
1717

18-
import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER;
1918
import static com.nvidia.cuvs.internal.common.Util.*;
2019
import static com.nvidia.cuvs.internal.panama.headers_h.*;
2120

2221
import com.nvidia.cuvs.*;
22+
import com.nvidia.cuvs.internal.common.PinnedMemoryBuffer;
2323
import com.nvidia.cuvs.internal.panama.DLManagedTensor;
2424
import com.nvidia.cuvs.internal.panama.DLTensor;
2525
import java.lang.foreign.*;
2626

2727
public class CuVSDeviceMatrixImpl extends CuVSMatrixBaseImpl implements CuVSDeviceMatrix {
2828

29-
private static final int CHUNK_BYTES =
30-
8 * 1024 * 1024; // Based on benchmarks, 8MB seems the minimum size to optimize PCIe bandwidth
31-
private final long hostBufferBytes;
32-
3329
private long bufferedMatrixRowStart = 0;
3430
private long bufferedMatrixRowEnd = 0;
3531

@@ -38,7 +34,7 @@ public class CuVSDeviceMatrixImpl extends CuVSMatrixBaseImpl implements CuVSDevi
3834
private final long rowStride;
3935
private final long columnStride;
4036

41-
private MemorySegment hostBuffer = MemorySegment.NULL;
37+
private final PinnedMemoryBuffer hostBuffer;
4238

4339
protected CuVSDeviceMatrixImpl(
4440
CuVSResources resources,
@@ -63,18 +59,7 @@ protected CuVSDeviceMatrixImpl(
6359
this.resources = resources;
6460
this.rowStride = rowStride;
6561
this.columnStride = columnStride;
66-
67-
long rowBytes = columns * valueLayout.byteSize();
68-
long matrixBytes = size * rowBytes;
69-
if (matrixBytes < CHUNK_BYTES) {
70-
this.hostBufferBytes = matrixBytes;
71-
} else if (rowBytes > CHUNK_BYTES) {
72-
// We need to buffer at least one row at time
73-
this.hostBufferBytes = rowBytes;
74-
} else {
75-
var rowCount = (CHUNK_BYTES / rowBytes);
76-
this.hostBufferBytes = rowBytes * rowCount;
77-
}
62+
this.hostBuffer = new PinnedMemoryBuffer(size, columns, valueLayout);
7863
}
7964

8065
@Override
@@ -84,27 +69,10 @@ public MemorySegment toTensor(Arena arena) {
8469
arena, memorySegment, new long[] {size, columns}, strides, code(), bits(), kDLCUDA());
8570
}
8671

87-
private static MemorySegment createPinnedBuffer(long bufferBytes) {
88-
try (var localArena = Arena.ofConfined()) {
89-
MemorySegment pointer = localArena.allocate(C_POINTER);
90-
checkCudaError(cudaMallocHost(pointer, bufferBytes), "cudaMallocHost");
91-
return pointer.get(C_POINTER, 0);
92-
}
93-
}
94-
95-
private static void destroyPinnedBuffer(MemorySegment bufferSegment) {
96-
checkCudaError(cudaFreeHost(bufferSegment), "cudaFreeHost");
97-
}
98-
9972
private void populateBuffer(long startRow) {
100-
if (hostBuffer == MemorySegment.NULL) {
101-
// System.out.println("Creating a buffer of size " + hostBufferBytes);
102-
hostBuffer = createPinnedBuffer(hostBufferBytes);
103-
}
104-
10573
try (var localArena = Arena.ofConfined()) {
10674
long rowBytes = columns * valueLayout.byteSize();
107-
var endRow = Math.min(startRow + (hostBufferBytes / rowBytes), size);
75+
var endRow = Math.min(startRow + (hostBuffer.size() / rowBytes), size);
10876
var rowCount = endRow - startRow;
10977

11078
// System.out.printf(
@@ -123,7 +91,12 @@ private void populateBuffer(long startRow) {
12391

12492
MemorySegment bufferTensor =
12593
prepareTensor(
126-
localArena, hostBuffer, new long[] {rowCount, columns}, code(), bits(), kDLCPU());
94+
localArena,
95+
hostBuffer.address(),
96+
new long[] {rowCount, columns},
97+
code(),
98+
bits(),
99+
kDLCPU());
127100

128101
try (var resourceAccess = resources.access()) {
129102
checkCuVSError(
@@ -146,7 +119,7 @@ public RowView getRow(long row) {
146119
var startRow = row - bufferedMatrixRowStart;
147120

148121
return new SliceRowView(
149-
hostBuffer.asSlice(startRow * columns * valueByteSize, columns * valueByteSize),
122+
hostBuffer.address().asSlice(startRow * columns * valueByteSize, columns * valueByteSize),
150123
columns,
151124
valueLayout,
152125
dataType,
@@ -248,10 +221,7 @@ public void toDevice(CuVSDeviceMatrix targetMatrix, CuVSResources cuVSResources)
248221

249222
@Override
250223
public void close() {
251-
if (hostBuffer != MemorySegment.NULL) {
252-
destroyPinnedBuffer(hostBuffer);
253-
hostBuffer = MemorySegment.NULL;
254-
}
224+
hostBuffer.close();
255225
}
256226

257227
private static class CuVSDeviceMatrixDelegate implements CuVSDeviceMatrix, CuVSMatrixInternal {
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.nvidia.cuvs.internal.common;
17+
18+
import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER;
19+
import static com.nvidia.cuvs.internal.common.Util.checkCudaError;
20+
import static com.nvidia.cuvs.internal.panama.headers_h.*;
21+
22+
import java.lang.foreign.Arena;
23+
import java.lang.foreign.MemorySegment;
24+
import java.lang.foreign.ValueLayout;
25+
26+
public class PinnedMemoryBuffer implements AutoCloseable {
27+
28+
private static final int CHUNK_BYTES =
29+
8 * 1024 * 1024; // Based on benchmarks, 8MB seems the minimum size to optimize PCIe bandwidth
30+
private final long hostBufferBytes;
31+
32+
private MemorySegment hostBuffer = MemorySegment.NULL;
33+
34+
public PinnedMemoryBuffer(long rows, long columns, ValueLayout valueLayout) {
35+
36+
long rowBytes = columns * valueLayout.byteSize();
37+
long matrixBytes = rows * rowBytes;
38+
if (matrixBytes < CHUNK_BYTES) {
39+
this.hostBufferBytes = matrixBytes;
40+
} else if (rowBytes > CHUNK_BYTES) {
41+
// We need to buffer at least one row at time
42+
this.hostBufferBytes = rowBytes;
43+
} else {
44+
var rowCount = (CHUNK_BYTES / rowBytes);
45+
this.hostBufferBytes = rowBytes * rowCount;
46+
}
47+
}
48+
49+
private static MemorySegment createPinnedBuffer(long bufferBytes) {
50+
try (var localArena = Arena.ofConfined()) {
51+
MemorySegment pointer = localArena.allocate(C_POINTER);
52+
checkCudaError(cudaMallocHost(pointer, bufferBytes), "cudaMallocHost");
53+
return pointer.get(C_POINTER, 0);
54+
}
55+
}
56+
57+
private static void destroyPinnedBuffer(MemorySegment bufferSegment) {
58+
checkCudaError(cudaFreeHost(bufferSegment), "cudaFreeHost");
59+
}
60+
61+
public MemorySegment address() {
62+
if (hostBuffer == MemorySegment.NULL) {
63+
hostBuffer = createPinnedBuffer(hostBufferBytes);
64+
}
65+
return hostBuffer;
66+
}
67+
68+
public long size() {
69+
return hostBufferBytes;
70+
}
71+
72+
@Override
73+
public void close() {
74+
if (hostBuffer != MemorySegment.NULL) {
75+
destroyPinnedBuffer(hostBuffer);
76+
hostBuffer = MemorySegment.NULL;
77+
}
78+
}
79+
}

java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
import static com.nvidia.cuvs.internal.common.Util.*;
1919
import static com.nvidia.cuvs.internal.panama.headers_h.cuvsVersionGet;
2020
import static com.nvidia.cuvs.internal.panama.headers_h.uint16_t;
21+
import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamSynchronize;
2122

2223
import com.nvidia.cuvs.*;
2324
import com.nvidia.cuvs.internal.*;
25+
import com.nvidia.cuvs.internal.common.PinnedMemoryBuffer;
2426
import com.nvidia.cuvs.internal.common.Util;
2527
import java.io.IOException;
2628
import java.lang.foreign.Arena;
@@ -216,7 +218,7 @@ public CuVSHostMatrix build() {
216218
public CuVSMatrix.Builder<CuVSDeviceMatrix> newDeviceMatrixBuilder(
217219
CuVSResources resources, long size, long columns, CuVSMatrix.DataType dataType)
218220
throws UnsupportedOperationException {
219-
return new HeapSegmentBuilder(resources, size, columns, dataType);
221+
return new BufferedSegmentBuilder(resources, size, columns, dataType);
220222
}
221223

222224
@Override
@@ -227,7 +229,7 @@ public CuVSMatrix.Builder<CuVSDeviceMatrix> newDeviceMatrixBuilder(
227229
int rowStride,
228230
int columnStride,
229231
CuVSMatrix.DataType dataType) {
230-
return new HeapSegmentBuilder(resources, size, columns, rowStride, columnStride, dataType);
232+
return new BufferedSegmentBuilder(resources, size, columns, rowStride, columnStride, dataType);
231233
}
232234

233235
@Override
@@ -279,28 +281,38 @@ public CuVSMatrix newMatrixFromArray(byte[][] vectors) {
279281

280282
/**
281283
* This {@link CuVSDeviceMatrix} builder implementation returns a {@link CuVSDeviceMatrix} backed by managed RMM
282-
* device memory. It uses a non-native {@link MemorySegment} created directly from on-heap java arrays to avoid
283-
* an intermediate allocation and copy to a native (off-heap) segment.
284-
* It requires the copy function ({@code cudaMemcpyAsync}) to have the {@code Critical} linker option in order
285-
* to allow the access to on-heap memory (see {@link Util#cudaMemcpyAsync}).
284+
* device memory. It uses a {@link PinnedMemoryBuffer} to batch data before copying it to the GPU.
286285
*/
287-
private static class HeapSegmentBuilder implements CuVSMatrix.Builder<CuVSDeviceMatrix> {
286+
private static class BufferedSegmentBuilder implements CuVSMatrix.Builder<CuVSDeviceMatrix> {
287+
288288
private final long columns;
289289
private final long size;
290290
private final CuVSDeviceMatrixImpl matrix;
291291
private final MemorySegment stream;
292-
private int current;
293292

294-
private HeapSegmentBuilder(
293+
private final long rowBytes;
294+
private int currentRow;
295+
296+
private final PinnedMemoryBuffer hostBuffer;
297+
private final long bufferRowCount;
298+
private int currentBufferRow;
299+
300+
private BufferedSegmentBuilder(
295301
CuVSResources resources, long size, long columns, CuVSMatrix.DataType dataType) {
296302
this.columns = columns;
297303
this.size = size;
298304
this.matrix = CuVSDeviceMatrixRMMImpl.create(resources, size, columns, dataType);
299305
this.stream = Util.getStream(resources);
300-
this.current = 0;
306+
this.currentRow = 0;
307+
308+
this.hostBuffer = new PinnedMemoryBuffer(size, columns, matrix.valueLayout());
309+
310+
this.rowBytes = columns * matrix.valueLayout().byteSize();
311+
this.bufferRowCount = Math.min((hostBuffer.size() / rowBytes), size);
312+
this.currentBufferRow = 0;
301313
}
302314

303-
private HeapSegmentBuilder(
315+
private BufferedSegmentBuilder(
304316
CuVSResources resources,
305317
long size,
306318
long columns,
@@ -313,7 +325,13 @@ private HeapSegmentBuilder(
313325
CuVSDeviceMatrixRMMImpl.create(
314326
resources, size, columns, rowStride, columnStride, dataType);
315327
this.stream = Util.getStream(resources);
316-
this.current = 0;
328+
this.currentRow = 0;
329+
330+
this.hostBuffer = new PinnedMemoryBuffer(size, columns, matrix.valueLayout());
331+
332+
this.rowBytes = columns * matrix.valueLayout().byteSize();
333+
this.bufferRowCount = Math.min((hostBuffer.size() / rowBytes), size);
334+
this.currentBufferRow = 0;
317335
}
318336

319337
@Override
@@ -347,19 +365,38 @@ public void addVector(int[] vector) {
347365
}
348366

349367
private void internalAddVector(MemorySegment vector) {
350-
if (current >= size) {
368+
if (currentRow >= size) {
351369
throw new ArrayIndexOutOfBoundsException();
352370
}
371+
var hostBufferOffset = currentBufferRow * rowBytes;
372+
MemorySegment.copy(vector, 0, hostBuffer.address(), hostBufferOffset, rowBytes);
353373

354-
long rowBytes = columns * matrix.valueLayout().byteSize();
374+
currentRow++;
375+
currentBufferRow++;
376+
if (currentBufferRow == bufferRowCount) {
377+
flushBuffer();
378+
}
379+
}
355380

356-
var dstOffset = ((current++) * rowBytes);
357-
var dst = matrix.memorySegment().asSlice(dstOffset);
358-
cudaMemcpyAsync(dst, vector, rowBytes, CudaMemcpyKind.HOST_TO_DEVICE, stream);
381+
private void flushBuffer() {
382+
if (currentBufferRow > 0) {
383+
var deviceMemoryOffset = (currentRow - currentBufferRow) * rowBytes;
384+
var dst = matrix.memorySegment().asSlice(deviceMemoryOffset);
385+
cudaMemcpyAsync(
386+
dst,
387+
hostBuffer.address(),
388+
currentBufferRow * rowBytes,
389+
CudaMemcpyKind.HOST_TO_DEVICE,
390+
stream);
391+
currentBufferRow = 0;
392+
checkCudaError(cudaStreamSynchronize(stream), "cudaStreamSynchronize");
393+
}
359394
}
360395

361396
@Override
362397
public CuVSDeviceMatrix build() {
398+
flushBuffer();
399+
hostBuffer.close();
363400
return matrix;
364401
}
365402
}

0 commit comments

Comments
 (0)