Skip to content
Merged
Show file tree
Hide file tree
Changes from 212 commits
Commits
Show all changes
214 commits
Select commit Hold shift + click to select a range
cfa308f
fix of code format
linhu-nv Apr 15, 2025
d0881c0
fix bugs about ipc_memory_handle
linhu-nv Apr 15, 2025
7d63547
fix logic problems in client
linhu-nv Apr 15, 2025
dc00c77
quick fix to client-server example
linhu-nv Apr 15, 2025
45bc9f8
fix some bugs to run compile
linhu-nv Apr 16, 2025
4414e83
modify blockmeta definition & impl mempool
zhuofan1123 Apr 10, 2025
3cbb19d
impl index
zhuofan1123 Apr 16, 2025
409ef8a
add index benchmark
zhuofan1123 Apr 16, 2025
7d90ee2
faster hash
zhuofan1123 Apr 16, 2025
d4c5a39
add test for storage_engine + transfer_engine
linhu-nv Apr 17, 2025
ca90a73
optim index
zhuofan1123 Apr 16, 2025
6489c3b
fix a few bugs about performance, should have normal perf now
linhu-nv Apr 17, 2025
0866932
add mempool benchmark
zhuofan1123 Apr 17, 2025
9550fa5
optim mempool
zhuofan1123 Apr 17, 2025
15bfaff
optimize index.insert
zhuofan1123 Apr 18, 2025
cbea324
fix mempool
zhuofan1123 Apr 18, 2025
117ad1a
eviction implementation and optimization
linhu-nv Apr 22, 2025
69167f7
fix evictor
zhuofan1123 Apr 22, 2025
fcc4e87
flatten free ids tensor
zhuofan1123 Apr 18, 2025
be2737e
impl get/put pipeline
zhuofan1123 Apr 21, 2025
7ae1876
add reset for cache engine
zhuofan1123 Apr 21, 2025
143c261
global id allocator
zhuofan1123 Apr 21, 2025
0bff033
list to tensor
zhuofan1123 Apr 21, 2025
7c66cc7
init kvmanager
zhuofan1123 Apr 21, 2025
69410b6
run the pipeline
linhu-nv Apr 25, 2025
e6ba7d8
print cpu-gpu transfer info
zhuofan1123 Apr 25, 2025
4d26d16
refactor index
zhuofan1123 Apr 27, 2025
3bd3119
refactor kvmanager and cache engine
zhuofan1123 Apr 27, 2025
5cf26ff
add insert_length for insert
zhuofan1123 Apr 27, 2025
6ce0002
cleanup buffer
zhuofan1123 Apr 28, 2025
bd7a07d
Layer wise
linhu-nv Apr 29, 2025
242c557
Added the xxhash
May 6, 2025
4af7c81
add cmake
May 6, 2025
958bc6d
adapt use xxhash
May 7, 2025
46655e0
fix bug
zhuofan1123 May 8, 2025
aea67fe
refactor
zhuofan1123 May 8, 2025
bedd8b7
fix
zhuofan1123 May 8, 2025
2e3ae02
remove comment
zhuofan1123 May 9, 2025
dbfc238
add benchmark and refactor code
zhuofan1123 May 9, 2025
030a0ac
fix bug
zhuofan1123 May 9, 2025
bffe028
radix tree
zhuofan1123 May 9, 2025
c6561d0
update
zhuofan1123 May 10, 2025
0d5da6d
update
zhuofan1123 May 10, 2025
717dfd6
fix
zhuofan1123 May 12, 2025
99eca31
quick fix to ssd file allocation
linhu-nv May 12, 2025
1695c96
remove batch transfer
zhuofan1123 May 12, 2025
3147216
quick fix to transfer_op initialization
linhu-nv May 12, 2025
358dc46
check tensor type
zhuofan1123 May 13, 2025
fa820f1
support ce transfer
zhuofan1123 May 13, 2025
fc59950
modify graph creation
zhuofan1123 May 13, 2025
2bdf835
remove dataclass
zhuofan1123 May 13, 2025
6d64ab1
fix to illegal memory access
linhu-nv May 14, 2025
a8c4970
fix to transfer_op default values
linhu-nv May 14, 2025
6f00ee9
remove gitlab-ci yml
zhuofan1123 May 14, 2025
44f33ef
submodule install automatically
zhuofan1123 May 14, 2025
afa85a3
init README
zhuofan1123 May 14, 2025
ef7e3c4
add cython
May 19, 2025
4d570c1
use multiple processes for transfer workers to bypass GIL
linhu-nv May 19, 2025
cc27d0b
fix multi queue import
linhu-nv May 20, 2025
70013cd
Merge pull request #2 from nvidia-china-sae/linhu/mp4worker
zhuofan1123 May 20, 2025
39acc7c
quick fix
zhuofan1123 May 20, 2025
f7cac98
Merge pull request #4 from nvidia-china-sae/linhu/mp4worker
linhu-nv May 20, 2025
91a0c63
support different layout && multiple ssd (#3)
zhuofan1123 May 21, 2025
b4c9af6
add debug mode
May 22, 2025
6544a58
Merge branch 'dev' into pz/cython
May 22, 2025
efd682a
default to debug
May 22, 2025
2d3778d
default to debug
May 22, 2025
8fe52fe
Merge pull request #1 from nvidia-china-sae/pz/cython
zhuofan1123 May 22, 2025
8bf8ddb
NVTX Integration && I/O Optimization (#6)
zhuofan1123 May 23, 2025
855ff9e
include remote cache engine, use ssd as fake remote file for test
linhu-nv May 21, 2025
9c9eff8
some case works now
linhu-nv May 22, 2025
081a035
fix
zhuofan1123 May 23, 2025
a671fad
support parallel remote read/write
zhuofan1123 May 23, 2025
33ffcaf
add nvtx info
zhuofan1123 May 25, 2025
f517378
Merge pull request #5 from nvidia-china-sae/linhu/remote-support
zhuofan1123 May 25, 2025
0ba3bb7
cpu-only
zhuofan1123 May 26, 2025
2b08441
reduce polling sleep time
zhuofan1123 May 26, 2025
9bb42b1
remove unused link
zhuofan1123 May 26, 2025
eda8249
Merge pull request #7 from nvidia-china-sae/zhuofanl/dev
zhuofan1123 May 26, 2025
d84b292
Support for multi-SSD read/write and round-robin layout (#8)
zhuofan1123 May 29, 2025
2e59b07
share tensor (#9)
zhuofan1123 May 30, 2025
561f3a0
test tensor sharing by zmq (#10)
zhuofan1123 May 30, 2025
acf3183
Implement Server Client Mode (#11)
gz944367214 Jun 5, 2025
0523c73
merge leolingli/dev-2 to dev (#13)
peaceforeverCN Jun 5, 2025
6c359eb
add FLEXKV_ENABLE_CFS env var to control build/link flags (#14)
linhu-nv Jun 6, 2025
14c9e38
some optimizations about transfer graph and task status trackers
linhu-nv Jun 3, 2025
a8727f7
rebased on the latest dev branch, draft
linhu-nv Jun 5, 2025
5d01219
tp enabled, draft, commited for multi-gpu test
linhu-nv Jun 7, 2025
04c2a2b
tp&dp support works now on test_kvmanager
Jun 9, 2025
4da4644
quick fix of multi_process test, the return masks is ok now, results …
Jun 9, 2025
218ee58
some small fixes, deal with empty graphs
Jun 10, 2025
c0b9807
Merge pull request #12 from nvidia-china-sae/linhu/tp-worker
zhuofan1123 Jun 10, 2025
acd2331
debug for tp and dp (#15)
gz944367214 Jun 11, 2025
db4f32c
fix corner case && add pytest (#16)
zhuofan1123 Jun 11, 2025
4874c4d
Standardize import & add .so to system path to avoid export manaually
linhu-nv Jun 11, 2025
fb64e7a
refactor transferWorkers
linhu-nv Jun 11, 2025
97a1b44
quick fix
linhu-nv Jun 11, 2025
c37edff
add dtype into the modelConfig so that we can support different dtypes
linhu-nv Jun 11, 2025
bb4b284
use ordered_dict to implement a expiring dict
linhu-nv Jun 11, 2025
5cb2dd0
quick fix
linhu-nv Jun 11, 2025
c96936b
Merge pull request #17 from nvidia-china-sae/linhu/code-clean-refactor
zhuofan1123 Jun 11, 2025
49aaa68
quick fix to build.sh
linhu-nv Jun 11, 2025
d43085a
Merge pull request #18 from nvidia-china-sae/linhu/code-clean-refactor
zhuofan1123 Jun 11, 2025
cf04ef7
refactor get/put (#19)
zhuofan1123 Jun 12, 2025
340f37a
Support (async) query interface. (#20)
axxx03 Jun 12, 2025
a1fea60
in the return mask of put op, only set tokens that are really transfe…
linhu-nv Jun 12, 2025
edf35f0
use GPUCPUTransferWorker when the tp size is only 1 & fix a bug of te…
linhu-nv Jun 12, 2025
fa1e8ca
Merge pull request #21 from nvidia-china-sae/linhu/code-clean-refactor
zhuofan1123 Jun 12, 2025
0fc9e10
add pytest unit for transfer_engine
linhu-nv Jun 13, 2025
9f6664a
Merge pull request #23 from nvidia-china-sae/linhu/code-clean-refactor
zhuofan1123 Jun 13, 2025
3ac17f3
add mypy (#22)
zhuofan1123 Jun 16, 2025
e3af33c
refactor tensor handle (#25)
zhuofan1123 Jun 16, 2025
aaa9117
small fixes
linhu-nv Jun 16, 2025
098a4b3
now we accept gpu_kv_layout as a parameter, and automically infer oth…
linhu-nv Jun 16, 2025
6935dd3
fix of kvlayout generation, manually launch transfer_engine
linhu-nv Jun 16, 2025
28781c4
Merge pull request #24 from nvidia-china-sae/linhu/code-clean-refactor
zhuofan1123 Jun 16, 2025
fb16af1
quick fix
linhu-nv Jun 16, 2025
17a8f8a
Merge pull request #26 from nvidia-china-sae/linhu/code-clean-refactor
zhuofan1123 Jun 16, 2025
46d1921
add mla support
linhu-nv Jun 16, 2025
ac75777
Merge pull request #27 from nvidia-china-sae/linhu/mla
zhuofan1123 Jun 17, 2025
625e548
refactor graph generation (#28)
zhuofan1123 Jun 18, 2025
567fa59
change remote cache: (#30)
peaceforeverCN Jun 24, 2025
6d1e6bf
quick fix for mla + tp
linhu-nv Jun 25, 2025
7396de7
Merge pull request #31 from nvidia-china-sae/linhu/quickfix-mla-tp
zhuofan1123 Jun 25, 2025
acab171
support both layer-wise and block-wise storage for cpu-blocks (#29)
linhu-nv Jun 27, 2025
e93848c
quick fix for len(token_ids) < tokens_per_blocks and len(True in mask…
linhu-nv Jun 30, 2025
7d254c7
skip unready block in put (#34)
zhuofan1123 Jul 1, 2025
a0dedb4
io_uring support for ssd cache (#32)
charliecgxu Jul 3, 2025
cb3bf52
add tracer that can record all requests of flexKV, and the replay scr…
linhu-nv Jul 7, 2025
65de614
fix to radixtree (#38)
linhu-nv Jul 7, 2025
db6897b
repair kvmanager verify logic to fit no remote cache situation (#39)
peaceforeverCN Jul 8, 2025
f440067
quick fix for radix tree (#40)
linhu-nv Jul 8, 2025
026f030
add header file dependency (#41)
charliecgxu Jul 10, 2025
8627757
fix precommit format issues (#42)
linhu-nv Jul 10, 2025
15f3f84
change try_wait and only return the finished request (#43)
peaceforeverCN Jul 10, 2025
f0fe215
use numpy instead of tensor for zmq communication (#45)
linhu-nv Jul 11, 2025
1ea33a2
use fadvise correctly (#44)
charliecgxu Jul 11, 2025
9dbe1b6
task id from client for faster get/put_async (#47)
linhu-nv Jul 15, 2025
5616b59
fix some improper variable names (#48)
charliecgxu Jul 15, 2025
3ba4f97
add config (#46)
zhuofan1123 Jul 15, 2025
d229ec1
spread IO to as many files as possible (#49)
charliecgxu Jul 17, 2025
dce28f6
refactor pytest: add test_utils; add server-client mode
Jul 19, 2025
fe0eea5
Merge pull request #51 from nvidia-china-sae/linhu/refactor-test
zhuofan1123 Jul 21, 2025
39fb3dd
print input parameters correctly in the error case of iouring
charliecgxu Jul 21, 2025
99c3269
remove the restriction of pin memory for iouring
charliecgxu Jul 21, 2025
c9aa940
add unit benchmark for workers (#52)
zhuofan1123 Jul 21, 2025
055513c
format adjusted; delete two test scripts
Jul 21, 2025
ac611d3
Merge pull request #54 from nvidia-china-sae/linhu/format-fix
zhuofan1123 Jul 21, 2025
4144d95
remove direct flag for ssd write
charliecgxu Jul 21, 2025
4d50b43
Merge pull request #53 from nvidia-china-sae/charliecgxu/dev
zhuofan1123 Jul 21, 2025
872c6fa
refactor benchmark_cache_engine and fix some issues (#55)
zhuofan1123 Jul 22, 2025
542e8fa
avoid opening ssd files per io request
charliecgxu Jul 23, 2025
58d22dd
fix bug, ssd_layer_stride_in_bytes compute error
peaceforeverCN Jul 23, 2025
8c51029
Merge pull request #56 from nvidia-china-sae/charliecgxu/dev
zhuofan1123 Jul 23, 2025
cbfb017
Merge pull request #57 from nvidia-china-sae/flex-taco/dev4
zhuofan1123 Jul 23, 2025
bb363e7
add e2e benchmark for kvmanager (#58)
zhuofan1123 Jul 24, 2025
8aa68c7
fix default params
zhuofan1123 Jul 24, 2025
067c2b6
add server_schedular for reduce process communication overhead (#59)
linhu-nv Jul 28, 2025
508a1b1
Unify the interface of the flexkv server (#60)
zhuofan1123 Jul 28, 2025
6d8efa3
select direct_io fds only in read && direct mode
charliecgxu Jul 28, 2025
92b05a9
integrate expiring_dict
zhuofan1123 Jul 28, 2025
6fae3f6
Merge pull request #61 from nvidia-china-sae/charliecgxu/dev
zhuofan1123 Jul 28, 2025
e39e791
Merge pull request #62 from nvidia-china-sae/zhuofanl/dev
zhuofan1123 Jul 28, 2025
fb63fb8
use Pipe instead of Queue for comm in transferEngine and worker
linhu-nv Jul 28, 2025
8a600ec
Merge pull request #63 from nvidia-china-sae/linhu/mp-pipe
zhuofan1123 Jul 29, 2025
2a9a663
limit the id range
zhuofan1123 Jul 29, 2025
47fbee9
fallback to preadv/pwritev when iouring inflight request over limit (…
charliecgxu Jul 29, 2025
f80c836
reduce bubble between op launch (#65)
zhuofan1123 Jul 30, 2025
49c0681
some small fixes (#66)
linhu-nv Aug 5, 2025
e9481fa
remove worker_init_timeout_minutes (#67)
zhuofan1123 Aug 11, 2025
459e787
quick fix that the is_mla is not given to tp_client
linhu-nv Aug 12, 2025
740231f
Merge pull request #69 from nvidia-china-sae/server_client_fix
zhuofan1123 Aug 12, 2025
324eb5c
add a message when error (#71)
peaceforeverCN Aug 14, 2025
5cb99f7
radix tree c++ impl (#70)
charliecgxu Aug 19, 2025
af7f6ee
sync kernel launch
Luis-xu Aug 20, 2025
74c4620
Merge pull request #76 from nvidia-china-sae/sync_kernel_launch
zhuofan1123 Aug 20, 2025
b679970
kvmanager refactor (#73)
zhuofan1123 Aug 21, 2025
830c7a8
feat: add support release wheel (#77)
ggaaooppeenngg Aug 22, 2025
6ae5a78
add evict_ratio in cache config, default is 0
peaceforeverCN Aug 21, 2025
da3ddf4
update unit tests for new version (#79)
zhuofan1123 Aug 22, 2025
82277ce
rename functions
zhuofan1123 Aug 22, 2025
a0f18d3
enable profile in release build
charliecgxu Aug 22, 2025
610133e
Merge pull request #80 from nvidia-china-sae/charliecgxu/dev
zhuofan1123 Aug 25, 2025
a32bb20
Merge pull request #81 from nvidia-china-sae/zhuofanl/dev
zhuofan1123 Aug 25, 2025
abe5e38
Merge pull request #78 from nvidia-china-sae/leolingli/dev
zhuofan1123 Aug 25, 2025
d8a55c2
update benchmark worker (#82)
zhuofan1123 Aug 25, 2025
8283fb5
ci: trigger on main and dev
Aug 25, 2025
d7acfd7
fix broken cpp radix tree support for cache engine (#84)
charliecgxu Aug 25, 2025
7be0aeb
Merge pull request #83 from nvidia-china-sae/lilgao/feat/ci-branch
zhuofan1123 Aug 25, 2025
e6d6143
fix direct io
zhuofan1123 Aug 26, 2025
5a41280
Merge pull request #85 from nvidia-china-sae/zhuofanl/dev
zhuofan1123 Aug 27, 2025
d9cdea5
Using ring buffer in transfer engine to manage the src and dst block …
Luis-xu Aug 27, 2025
efc9da2
quickfix for return type of reduce_tensor
linhu-nv Aug 27, 2025
8c1663b
Merge pull request #87 from nvidia-china-sae/memory_handle_fix
zhuofan1123 Aug 28, 2025
5890621
fix bug
gz944367214 Aug 28, 2025
545dc8b
Merge remote-tracking branch 'origin/dev' into main_process_buffer_ma…
Luis-xu Aug 28, 2025
140aedc
refine ring_buffer and apply it to all workers
Luis-xu Aug 28, 2025
47fdd70
rename PinnedMemoryRing to SharedMemoryRing
Luis-xu Aug 28, 2025
969e0bf
fix status bug
zhuofan1123 Aug 29, 2025
0876171
Merge pull request #89 from nvidia-china-sae/zuogan/dev
zhuofan1123 Aug 29, 2025
5ace688
allow to exceed the max_block_num
zhuofan1123 Aug 29, 2025
f5b6a94
refactor: use hash to allocate buffer && no wait for free slot
zhuofan1123 Aug 29, 2025
7d6232b
Merge pull request #86 from nvidia-china-sae/main_process_buffer_manager
zhuofan1123 Sep 1, 2025
2d70104
add gds
Jul 1, 2025
287f572
fix batch sync
Jul 1, 2025
bdd3e0d
add gds transfer worker support
Jul 17, 2025
c65de2e
add gds worker & test
wenpengw-nv Oct 15, 2025
e12fe9c
gdsput changed to original ssdtransfer
wenpengw-nv Oct 15, 2025
0a3067b
Merge branch 'dev' into wenpengw/gds
wenpengw-nv Oct 20, 2025
51e7fbf
fix callback bug
wenpengw-nv Oct 20, 2025
3b71a34
add gds docs
wenpengw-nv Oct 20, 2025
68bbb6f
remove redundant code
wenpengw-nv Oct 21, 2025
e61c6c6
build flag & assert
wenpengw-nv Oct 27, 2025
9c51cbb
refactor gds transfer thread
wenpengw-nv Oct 27, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,4 @@ FlexKV performs:
- **In-Process Cache Engine Integration**: In the dev branch, the implementation, integration, and invocation of the Cache Engine will be further optimized, along with synchronized updates to related APIs.
- **Framework Integration**: Support works for vLLM, SGLang, and other acceleration frameworks will be updated soon.
- **Distributed Query Support**: Enable scalable, distributed KVCache lookup.
- **Latency Optimization**: Further reduce *get* latency via smarter prefetching and compression.
- **Latency Optimization**: Further reduce *get* latency via smarter prefetching and compression.
212 changes: 212 additions & 0 deletions csrc/bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
#include "cache_utils.h"
#include "pcfs/pcfs.h"
#include "tp_transfer_thread_group.h"
#include "gds/tp_gds_transfer_thread_group.h"
#include "transfer.cuh"
#include "transfer_ssd.h"
#include "radix_tree.h"
#include "gds/gds_manager.h"

namespace py = pybind11;

Expand Down Expand Up @@ -105,6 +107,155 @@ void transfer_kv_blocks_remote(
}
#endif

// pybind-facing wrapper for transfer_kv_blocks_gds: validates tensor dtypes,
// converts the Python list of GDS file paths into C++ strings, and forwards
// every argument to the C++ transfer routine.
void transfer_kv_blocks_gds_binding(
    GDSManager& gds_manager,
    const py::list& gds_filepaths_py,
    const torch::Tensor& gpu_layer_id_list,
    const torch::Tensor& gpu_layer_ptrs_tensor,
    const torch::Tensor& gds_block_ids,
    const torch::Tensor& gpu_block_ids,
    int64_t gpu_kv_stride_in_bytes,
    int64_t gds_layer_stride_in_bytes,
    int64_t gds_block_stride_in_bytes,
    int64_t gds_kv_stride_in_bytes,
    int64_t block_size_in_bytes,
    int num_blocks_per_file,
    int64_t total_layers,
    bool is_read,
    bool verbose = false,
    bool is_mla = false
) {
  // Reject mistyped tensors up front so the failure surfaces in Python with a
  // clear message rather than as a corrupted transfer.
  const auto require_dtype = [](const torch::Tensor& t, torch::ScalarType want,
                                const char* what) {
    TORCH_CHECK(t.dtype() == want, what);
  };
  require_dtype(gpu_layer_ptrs_tensor, torch::kInt64, "gpu_layer_ptrs must be int64");
  require_dtype(gds_block_ids, torch::kInt64, "gds_block_ids must be int64");
  require_dtype(gpu_block_ids, torch::kInt64, "gpu_block_ids must be int64");
  require_dtype(gpu_layer_id_list, torch::kInt32, "gpu_layer_id_list must be int32");

  // Materialize the Python list of file paths as C++ strings.
  std::vector<std::string> gds_filepaths;
  gds_filepaths.reserve(gds_filepaths_py.size());
  for (const auto& path_obj : gds_filepaths_py) {
    gds_filepaths.emplace_back(path_obj.cast<std::string>());
  }

  transfer_kv_blocks_gds(
      gds_manager, gds_filepaths, gpu_layer_id_list, gpu_layer_ptrs_tensor,
      gds_block_ids, gpu_block_ids, gpu_kv_stride_in_bytes,
      gds_layer_stride_in_bytes, gds_block_stride_in_bytes,
      gds_kv_stride_in_bytes, block_size_in_bytes,
      // NOTE(review): literal 0 preserved from the original call; presumably a
      // base file offset — confirm against transfer_kv_blocks_gds's signature.
      0,
      num_blocks_per_file, total_layers, is_read, verbose, is_mla);
}

// GDS Manager Python bindings
py::list gds_batch_write_binding(GDSManager& manager,
py::list operations_list) {
size_t batch_size = operations_list.size();
std::vector<BatchWriteOp> operations(batch_size);
std::vector<ssize_t> results(batch_size);

for (size_t i = 0; i < batch_size; ++i) {
py::dict op_dict = operations_list[i].cast<py::dict>();
operations[i].filename = op_dict["filename"].cast<std::string>().c_str();
operations[i].gpu_data = op_dict["gpu_data"].cast<torch::Tensor>().data_ptr();
operations[i].size = op_dict["size"].cast<size_t>();
operations[i].file_offset = op_dict["file_offset"].cast<size_t>();
operations[i].result = &results[i];
}

int batch_id = manager.batch_write(operations.data(), batch_size);

py::list result_list;
result_list.append(batch_id);
for (size_t i = 0; i < batch_size; ++i) {
result_list.append(results[i]);
}

return result_list;
}

// Build a BatchReadOp array from a list of Python dicts
// ({"filename", "gpu_buffer", "size", "file_offset"}) and submit them as one
// batched GDS read. Returns [batch_id, result_0, ..., result_{n-1}].
py::list gds_batch_read_binding(GDSManager& manager,
                                py::list operations_list) {
  size_t batch_size = operations_list.size();
  std::vector<BatchReadOp> operations(batch_size);
  std::vector<ssize_t> results(batch_size);
  // Keep the filename strings alive for the duration of the batch_read call.
  // The previous code used op_dict["filename"].cast<std::string>().c_str(),
  // which stored a pointer into a temporary std::string destroyed at the end
  // of the statement — every op carried a dangling filename pointer.
  std::vector<std::string> filenames(batch_size);

  for (size_t i = 0; i < batch_size; ++i) {
    py::dict op_dict = operations_list[i].cast<py::dict>();
    filenames[i] = op_dict["filename"].cast<std::string>();
    operations[i].filename = filenames[i].c_str();
    operations[i].gpu_buffer = op_dict["gpu_buffer"].cast<torch::Tensor>().data_ptr();
    operations[i].size = op_dict["size"].cast<size_t>();
    operations[i].file_offset = op_dict["file_offset"].cast<size_t>();
    operations[i].result = &results[i];
  }

  int batch_id = manager.batch_read(operations.data(), batch_size);

  // NOTE(review): results are read back immediately after submission. If
  // batch_read is asynchronous (a batch_id + batch_synchronize API suggests
  // it may be), these values — and the &results[i] pointers — may not be
  // valid yet; confirm batch_read copies or completes before returning.
  py::list result_list;
  result_list.append(batch_id);
  for (size_t i = 0; i < batch_size; ++i) {
    result_list.append(results[i]);
  }

  return result_list;
}

// Synchronous GDS write: push the tensor's entire storage
// (numel * element_size bytes) to `filename` starting at `file_offset`.
// Returns whatever GDSManager::write reports (presumably a byte count or
// negative error — confirm against GDSManager docs).
ssize_t gds_write_binding(GDSManager& manager,
                          const std::string& filename,
                          torch::Tensor gpu_data,
                          size_t file_offset = 0) {
  const size_t nbytes =
      static_cast<size_t>(gpu_data.numel()) * gpu_data.element_size();
  return manager.write(filename.c_str(), gpu_data.data_ptr(), nbytes,
                       file_offset);
}

// Synchronous GDS read: fill the tensor's entire storage
// (numel * element_size bytes) from `filename` starting at `file_offset`.
// Returns whatever GDSManager::read reports (presumably a byte count or
// negative error — confirm against GDSManager docs).
ssize_t gds_read_binding(GDSManager& manager,
                         const std::string& filename,
                         torch::Tensor gpu_buffer,
                         size_t file_offset = 0) {
  const size_t nbytes =
      static_cast<size_t>(gpu_buffer.numel()) * gpu_buffer.element_size();
  return manager.read(filename.c_str(), gpu_buffer.data_ptr(), nbytes,
                      file_offset);
}

// Asynchronous GDS write of the tensor's entire storage
// (numel * element_size bytes) to `filename` at `file_offset`.
// Thin forwarder to GDSManager::write_async; completion is presumably
// observed via synchronize() — confirm against GDSManager docs.
ssize_t gds_write_async_binding(GDSManager& manager,
                                const std::string& filename,
                                torch::Tensor gpu_data,
                                size_t file_offset = 0) {
  const size_t nbytes =
      static_cast<size_t>(gpu_data.numel()) * gpu_data.element_size();
  return manager.write_async(filename.c_str(), gpu_data.data_ptr(), nbytes,
                             file_offset);
}

// Asynchronous GDS read into the tensor's entire storage
// (numel * element_size bytes) from `filename` at `file_offset`.
// Thin forwarder to GDSManager::read_async; completion is presumably
// observed via synchronize() — confirm against GDSManager docs.
ssize_t gds_read_async_binding(GDSManager& manager,
                               const std::string& filename,
                               torch::Tensor gpu_buffer,
                               size_t file_offset = 0) {
  const size_t nbytes =
      static_cast<size_t>(gpu_buffer.numel()) * gpu_buffer.element_size();
  return manager.read_async(filename.c_str(), gpu_buffer.data_ptr(), nbytes,
                            file_offset);
}

// Helper function to create and initialize a GDS file with specified size.
// Creates/truncates the regular file, extends it to `file_size` bytes, flushes
// the size change, then registers it with the GDS manager. Returns false on
// any filesystem failure or if GDSManager::add_file rejects the file.
bool create_gds_file_binding(GDSManager& manager,
                             const std::string& filename,
                             size_t file_size) {
  // Create/truncate the file with rw-r--r-- permissions.
  int fd = open(filename.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
  if (fd < 0) {
    return false;
  }

  // Extend the file to the requested size. Note: ftruncate makes a sparse
  // file — it sets the size but does not physically allocate blocks.
  if (ftruncate(fd, static_cast<off_t>(file_size)) != 0) {
    close(fd);
    return false;
  }

  // Flush the size change before handing the file to GDS. The previous code
  // ignored fsync's return value, which could silently pass GDS a file whose
  // metadata never reached disk.
  if (fsync(fd) != 0) {
    close(fd);
    return false;
  }
  close(fd);

  // Now add the file to GDS manager (this will open it with O_DIRECT and
  // register with cuFile).
  return manager.add_file(filename.c_str());
}

PYBIND11_MODULE(c_ext, m) {
m.def("transfer_kv_blocks", &transfer_kv_blocks_binding,
"Transfer multi-layer KV-cache between CPU and GPU");
Expand Down Expand Up @@ -133,6 +284,14 @@ PYBIND11_MODULE(c_ext, m) {
py::arg("num_remote_blocks_per_file"), py::arg("use_mmap") = false,
py::arg("num_threads_per_file") = 16, py::arg("is_mla") = false);
#endif
m.def("transfer_kv_blocks_gds", &transfer_kv_blocks_gds_binding,
"Transfer KV blocks between GPU and GDS storage", py::arg("gds_manager"),
py::arg("gds_filepaths"), py::arg("gpu_layer_id_list"), py::arg("gpu_layer_ptrs_tensor"),
py::arg("gds_block_ids"), py::arg("gpu_block_ids"),
py::arg("gpu_kv_stride_in_bytes"), py::arg("gds_layer_stride_in_bytes"),
py::arg("gds_block_stride_in_bytes"), py::arg("gds_kv_stride_in_bytes"),
py::arg("block_size_in_bytes"), py::arg("num_blocks_per_file"), py::arg("total_layers"),
py::arg("is_read"), py::arg("verbose") = false, py::arg("is_mla") = false);
m.def("get_hash_size", &flexkv::get_hash_size,
"Get the size of the hash result");
m.def("gen_hashes", &flexkv::gen_hashes, "Generate hashes for a tensor",
Expand All @@ -156,6 +315,19 @@ PYBIND11_MODULE(c_ext, m) {
py::arg("layer_id"), py::arg("layer_granularity"),
py::arg("is_mla"));

py::class_<flexkv::TPGDSTransferThreadGroup>(m, "TPGDSTransferThreadGroup")
.def(py::init<int, const std::vector<std::vector<torch::Tensor>> &,
const std::vector<std::string> &, int>())
.def("tp_group_transfer",
&flexkv::TPGDSTransferThreadGroup::tp_group_transfer,
py::arg("gpu_block_id_tensor"), py::arg("gds_block_id_tensor"),
py::arg("gpu_kv_stride_in_bytes"), py::arg("gpu_block_stride_in_bytes"),
py::arg("gpu_chunk_size_in_bytes"), py::arg("gds_layer_stride_in_bytes"),
py::arg("gds_kv_stride_in_bytes"), py::arg("gds_block_stride_in_bytes"),
py::arg("gds_chunk_size_in_bytes"), py::arg("num_blocks_per_file"),
py::arg("is_read"), py::arg("layer_id"), py::arg("layer_granularity"),
py::arg("is_mla"));

// Add Hasher class binding
py::class_<flexkv::Hasher>(m, "Hasher")
.def(py::init<>())
Expand Down Expand Up @@ -226,4 +398,44 @@ PYBIND11_MODULE(c_ext, m) {
.def_readonly("num_ready_matched_blocks", &flexkv::CMatchResult::num_ready_matched_blocks)
.def_readonly("num_matched_blocks", &flexkv::CMatchResult::num_matched_blocks)
.def_readonly("last_node_matched_length", &flexkv::CMatchResult::last_node_matched_length);
// Add GDS Manager class binding
py::class_<GDSManager>(m, "GDSManager")
.def(py::init<const std::vector<std::string>&>(),
"Initialize GDS Manager with file list", py::arg("filenames"))
.def(py::init<>(), "Initialize GDS Manager without files")
.def("is_ready", &GDSManager::is_ready,
"Check if GDS manager is ready for operations")
.def("get_last_error", &GDSManager::get_last_error,
"Get the last error message")
.def("add_file", &GDSManager::add_file,
"Add and register a file with GDS (creates with O_DIRECT)", py::arg("filename"))
.def("remove_file", &GDSManager::remove_file,
"Remove and unregister a file from GDS", py::arg("filename"))
.def("write", &gds_write_binding,
"Write data from GPU memory to file",
py::arg("filename"), py::arg("gpu_data"), py::arg("file_offset") = 0)
.def("read", &gds_read_binding,
"Read data from file to GPU memory",
py::arg("filename"), py::arg("gpu_buffer"), py::arg("file_offset") = 0)
.def("write_async", &gds_write_async_binding,
"Write data from GPU memory to file asynchronously",
py::arg("filename"), py::arg("gpu_data"), py::arg("file_offset") = 0)
.def("read_async", &gds_read_async_binding,
"Read data from file to GPU memory asynchronously",
py::arg("filename"), py::arg("gpu_buffer"), py::arg("file_offset") = 0)
.def("batch_write", &gds_batch_write_binding,
"Batch write operations", py::arg("operations"))
.def("batch_read", &gds_batch_read_binding,
"Batch read operations", py::arg("operations"))
.def("batch_synchronize", &GDSManager::batch_synchronize,
"Wait for batch operations to complete", py::arg("batch_id"))
.def("synchronize", &GDSManager::synchronize,
"Synchronize all internal CUDA streams")
.def("get_file_count", &GDSManager::get_file_count,
"Get number of files currently managed")
.def("get_managed_files", &GDSManager::get_managed_files,
"Get list of all managed files")
.def("create_gds_file", &create_gds_file_binding,
"Create and register a GDS file with specified size",
py::arg("filename"), py::arg("file_size"));
}
Loading