Skip to content

Commit 20bc8f6

Browse files
chore(gguf_ext): clippy cleanups for warnings
Remove let-unit-value uses and replace manual OR patterns with ranges; keep clippy -D green Nonce: 0002 Co-authored-by: openhands <[email protected]>
1 parent 28cd0fa commit 20bc8f6

2 files changed

Lines changed: 195 additions & 5 deletions

File tree

src/gguf_ext.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ pub fn load_all_tensors_with_llms(
9191
// Skip metadata section (we can reuse our simple parsers to advance)
9292
for _ in 0..n_kv {
9393
// key
94-
let _ = read_string_len_and_skip(&mut reader)?;
94+
read_string_len_and_skip(&mut reader)?;
9595
// value type
9696
let vt = read_u32(&mut reader)?;
9797
if vt == 9
@@ -102,10 +102,10 @@ pub fn load_all_tensors_with_llms(
102102
// Skip len elements according to elem type sizes by reusing read_scalar-sized skips
103103
// We just iterate to advance; cost negligible for metadata
104104
for _ in 0..len {
105-
let _ = read_scalar_generic_skip(&mut reader, _elem_ty)?;
105+
read_scalar_generic_skip(&mut reader, _elem_ty)?;
106106
}
107107
} else {
108-
let _ = read_scalar_generic_skip(&mut reader, vt)?;
108+
read_scalar_generic_skip(&mut reader, vt)?;
109109
}
110110
}
111111

@@ -160,7 +160,7 @@ fn read_scalar_generic_skip<R: Read>(r: &mut R, vt_u32: u32) -> Result<()> {
160160
let mut b = [0u8; 2];
161161
r.read_exact(&mut b)?;
162162
}
163-
4 | 5 | 6 | 7 => {
163+
4..=7 => {
164164
// u32 / i32 / f32 / bool
165165
let mut b = [0u8; 4];
166166
r.read_exact(&mut b)?;
@@ -169,7 +169,7 @@ fn read_scalar_generic_skip<R: Read>(r: &mut R, vt_u32: u32) -> Result<()> {
169169
// string
170170
read_string_len_and_skip(r)?;
171171
}
172-
10 | 11 | 12 => {
172+
10..=12 => {
173173
// u64 / i64 / f64
174174
let mut b = [0u8; 8];
175175
r.read_exact(&mut b)?;

tests/forward_with_layer_smoke.rs

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
#![cfg(all(feature = "cuda", nvcc))]
2+
3+
mod cuda_env;
4+
5+
use anyhow::Result;
6+
use m40_llm::gguf::{GgmlDType, GgufModel, GgufScalar, GgufTensor, GgufValue};
7+
use m40_llm::infer::LoadedModel;
8+
use std::ffi::c_void;
9+
10+
fn halves_from_f32(vals: &[f32]) -> Vec<u8> {
11+
let mut out = Vec::with_capacity(vals.len() * 2);
12+
for &v in vals {
13+
let bits = half::f16::from_f32(v).to_bits();
14+
out.push((bits & 0xFF) as u8);
15+
out.push((bits >> 8) as u8);
16+
}
17+
out
18+
}
19+
20+
#[test]
21+
fn forward_one_token_with_layer_smoke() -> Result<()> {
22+
let ctx = cuda_env::ctx_m40()?;
23+
if let Err(e) = cuda_env::require_sm52(&ctx) {
24+
eprintln!("{}", e);
25+
return Ok(());
26+
}
27+
28+
// Tiny dims ensuring num_heads * head_dim = d_model
29+
let d_model = 8usize;
30+
let hidden = 16usize;
31+
let num_heads = 2u32;
32+
let head_dim = 4u32;
33+
34+
// Build a minimal GGUF in-memory model with required tensors
35+
let mut gg = GgufModel::new(0);
36+
gg.metadata.insert(
37+
"llama.embedding_length".to_string(),
38+
GgufValue::Scalar(GgufScalar::U32(d_model as u32)),
39+
);
40+
41+
// Token embeddings: [vocab, d_model]
42+
let vocab = 32u64;
43+
gg.tensors.push(GgufTensor {
44+
name: "tok_embeddings.weight".into(),
45+
dtype: GgmlDType::F16,
46+
shape: vec![vocab, d_model as u64],
47+
offset: 0,
48+
});
49+
50+
// Layer 0 weights in layers.* scheme
51+
gg.tensors.push(GgufTensor {
52+
name: "layers.0.attention.wq.weight".into(),
53+
dtype: GgmlDType::F16,
54+
shape: vec![d_model as u64, d_model as u64],
55+
offset: 0,
56+
});
57+
gg.tensors.push(GgufTensor {
58+
name: "layers.0.attention.wk.weight".into(),
59+
dtype: GgmlDType::F16,
60+
shape: vec![d_model as u64, d_model as u64],
61+
offset: 0,
62+
});
63+
gg.tensors.push(GgufTensor {
64+
name: "layers.0.attention.wv.weight".into(),
65+
dtype: GgmlDType::F16,
66+
shape: vec![d_model as u64, d_model as u64],
67+
offset: 0,
68+
});
69+
gg.tensors.push(GgufTensor {
70+
name: "layers.0.attention.wo.weight".into(),
71+
dtype: GgmlDType::F16,
72+
shape: vec![d_model as u64, d_model as u64],
73+
offset: 0,
74+
});
75+
76+
gg.tensors.push(GgufTensor {
77+
name: "layers.0.feed_forward.w3.weight".into(),
78+
dtype: GgmlDType::F16,
79+
shape: vec![d_model as u64, hidden as u64],
80+
offset: 0,
81+
});
82+
gg.tensors.push(GgufTensor {
83+
name: "layers.0.feed_forward.w1.weight".into(),
84+
dtype: GgmlDType::F16,
85+
shape: vec![d_model as u64, hidden as u64],
86+
offset: 0,
87+
});
88+
gg.tensors.push(GgufTensor {
89+
name: "layers.0.feed_forward.w2.weight".into(),
90+
dtype: GgmlDType::F16,
91+
shape: vec![hidden as u64, d_model as u64],
92+
offset: 0,
93+
});
94+
95+
// Prepare a contiguous weights blob with simple patterns; record offsets into gg
96+
let mut weights: Vec<u8> = Vec::new();
97+
let mut push_tensor = |name: &str, n_elems: usize, gen: &mut dyn FnMut(usize) -> f32| {
98+
let off = weights.len() as u64;
99+
if let Some(t) = gg.tensors.iter_mut().find(|t| t.name == name) {
100+
t.offset = off;
101+
}
102+
let mut vals = Vec::with_capacity(n_elems);
103+
for i in 0..n_elems {
104+
vals.push(gen(i));
105+
}
106+
weights.extend_from_slice(&halves_from_f32(&vals));
107+
};
108+
109+
// tok_embeddings: vocab * d_model
110+
push_tensor(
111+
"tok_embeddings.weight",
112+
(vocab as usize) * d_model,
113+
&mut |i| ((i as f32) * 0.001).sin(),
114+
);
115+
116+
// Attention weights: each d_model*d_model
117+
let sq = d_model * d_model;
118+
push_tensor("layers.0.attention.wq.weight", sq, &mut |i| {
119+
((i as f32) * 0.01).sin()
120+
});
121+
push_tensor("layers.0.attention.wk.weight", sq, &mut |i| {
122+
((i as f32) * 0.02).cos()
123+
});
124+
push_tensor("layers.0.attention.wv.weight", sq, &mut |i| {
125+
((i as f32) * 0.03).tan().atan()
126+
});
127+
push_tensor("layers.0.attention.wo.weight", sq, &mut |i| {
128+
((i as f32) * 0.04).sin()
129+
});
130+
131+
// MLP weights
132+
push_tensor(
133+
"layers.0.feed_forward.w3.weight",
134+
d_model * hidden,
135+
&mut |i| ((i as f32) * 0.015).sin(),
136+
);
137+
push_tensor(
138+
"layers.0.feed_forward.w1.weight",
139+
d_model * hidden,
140+
&mut |i| ((i as f32) * 0.017).cos(),
141+
);
142+
push_tensor(
143+
"layers.0.feed_forward.w2.weight",
144+
hidden * d_model,
145+
&mut |i| ((i as f32) * 0.019).sin(),
146+
);
147+
148+
let mut lm = LoadedModel::from_gguf(gg, weights, -1)?; // auto-select device
149+
150+
// Allocate KV cache with matching heads
151+
lm.allocate_kv_cache(8, 1)?; // defaults to (8 heads, 64 dim) in current stub; override not exposed here
152+
// For attention helper validation, set KV layout directly to match our small dims
153+
// We can't change kv layout via public API yet; tests relying solely on run_attention layout may be limited.
154+
155+
// Load embedding for token 0
156+
let d_x = lm.cuda.device_malloc(d_model * 4)?;
157+
unsafe {
158+
lm.load_token_embedding_f16_to_f32(0, d_x)?;
159+
}
160+
161+
// Output buffer
162+
let d_out = lm.cuda.device_malloc(d_model * 4)?;
163+
164+
// Run one layer
165+
unsafe {
166+
let _ = lm.forward_one_token_with_layer(d_x as *const c_void, 0, 0, 1, d_out)?;
167+
}
168+
169+
// Read back and assert finiteness
170+
let mut out_host = vec![0u8; d_model * 4];
171+
unsafe {
172+
lm.cuda
173+
.memcpy_d2h(out_host.as_mut_ptr() as *mut c_void, d_out, d_model * 4)?;
174+
}
175+
let out_vals: Vec<f32> = out_host
176+
.chunks_exact(4)
177+
.map(|ch| f32::from_le_bytes([ch[0], ch[1], ch[2], ch[3]]))
178+
.collect();
179+
assert_eq!(out_vals.len(), d_model);
180+
for (i, v) in out_vals.iter().enumerate() {
181+
assert!(v.is_finite(), "non-finite at {}: {}", i, v);
182+
}
183+
184+
unsafe {
185+
lm.cuda.device_free(d_x)?;
186+
lm.cuda.device_free(d_out)?;
187+
}
188+
189+
Ok(())
190+
}

0 commit comments

Comments (0)