Skip to content

Commit 4bd84c9

Browse files
authored
Merge pull request #1 from zxr-creator/24b09b6
Merge the wrapper modification with the v1
2 parents 87d7664 + 66237d7 commit 4bd84c9

File tree

6 files changed

+22
-33
lines changed

6 files changed

+22
-33
lines changed

examples/verify_aim24.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,6 @@ def generate_requests(dataset: Dataset, field_name: str, data_format: str, trial
4444
return requests
4545

4646

47-
48-
49-
50-
51-
5247
def main():
5348
model_name = "Qwen/Qwen3-0.6B"
5449
llm = sgl.Engine(model_path=model_name,

examples/verify_algo.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22
set -e
3-
# export CUDA_VISIBLE_DEVICES=0
3+
export CUDA_VISIBLE_DEVICES=1
44

55
sparse_algos=(
66
"block_sparse_attention"

examples/verify_algo_int8.sh

Lines changed: 0 additions & 25 deletions
This file was deleted.
Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env bash
22
set -e
3-
# export CUDA_VISIBLE_DEVICES=0
3+
export CUDA_VISIBLE_DEVICES=0
44

55
sparse_algos=(
66
"block_sparse_attention"
@@ -11,6 +11,20 @@ mkdir -p "${RESULTS_DIR}"
1111
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
1212

1313
for algo in "${sparse_algos[@]}"; do
14+
OUTFILE="${RESULTS_DIR}/${algo}_int8_${TIMESTAMP}.log"
15+
echo ">>> Running verify_algo.py with --vortex-module-name ${algo} --kv-cache-dtype int8"
16+
echo ">>> Saving results to ${OUTFILE}"
17+
{ time python verify_algo.py \
18+
--trials 8 \
19+
--topk-val 30 \
20+
--vortex-module-name "${algo}" \
21+
--model-name Qwen/Qwen3-1.7B \
22+
--kv-cache-dtype int8 \
23+
--mem 0.7 ; } \
24+
2>&1 | tee "${OUTFILE}"
25+
done
26+
27+
for algo in "${sparse_algos[@]}"; do
1428
OUTFILE="${RESULTS_DIR}/${algo}_fp8_${TIMESTAMP}.log"
1529
echo ">>> Running verify_algo.py with --vortex-module-name ${algo} --kv-cache-dtype fp8_e4m3"
1630
echo ">>> Saving results to ${OUTFILE}"
@@ -22,4 +36,4 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
2236
--kv-cache-dtype fp8_e4m3 \
2337
--mem 0.7 ; } \
2438
2>&1 | tee "${OUTFILE}"
25-
done
39+
done

vortex_torch/cache/context.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ class Context(ContextBase):
2424

2525
# Quantization: quant_type (0=none, 1=int8, 2=e4m3, 3=e5m2),
2626
# kv_scale (per-tensor fp8 scale), kv_scale_ptr (per-token int8 scale tensor)
27+
# fp8_type: 0=none, 1=e4m3, 2=e5m2 (encoding for Triton kernels)
2728
"quant_type",
2829
"kv_scale",
2930
"kv_scale_ptr",
31+
"fp8_type",
3032
)
3133

3234

@@ -49,6 +51,8 @@ def __init__(self) -> None:
4951
object.__setattr__(self, name, 1.0) # identity scale for bf16
5052
elif name == "kv_scale_ptr":
5153
object.__setattr__(self, name, None) # per-token scale tensor (int8 only)
54+
elif name == "fp8_type":
55+
object.__setattr__(self, name, 0) # 0 = none (bf16 default)
5256
else:
5357
object.__setattr__(self, name, UNSET)
5458

vortex_torch/flow/flow.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ def run_indexer_virtual(self, group_size: int, page_size: int, head_dim: int):
431431
ctx.page_size = page_size
432432
ctx.max_num_pages = 0
433433
ctx.max_num_pages_per_request = 0
434+
ctx.topk_type = "naive"
434435

435436
device = "cuda"
436437
dtype = torch.bfloat16

0 commit comments

Comments
 (0)