|
| 1 | +#![cfg(all(feature = "cuda", nvcc))] |
| 2 | + |
| 3 | +mod cuda_env; |
| 4 | + |
| 5 | +use anyhow::Result; |
| 6 | +use m40_llm::gguf::{GgmlDType, GgufModel, GgufScalar, GgufTensor, GgufValue}; |
| 7 | +use m40_llm::infer::LoadedModel; |
| 8 | +use std::ffi::c_void; |
| 9 | + |
| 10 | +fn halves_from_f32(vals: &[f32]) -> Vec<u8> { |
| 11 | + let mut out = Vec::with_capacity(vals.len() * 2); |
| 12 | + for &v in vals { |
| 13 | + let bits = half::f16::from_f32(v).to_bits(); |
| 14 | + out.push((bits & 0xFF) as u8); |
| 15 | + out.push((bits >> 8) as u8); |
| 16 | + } |
| 17 | + out |
| 18 | +} |
| 19 | + |
/// Smoke test: run a single transformer-layer forward pass for one token on
/// a CUDA device and assert that every element of the output is finite.
///
/// Builds a minimal in-memory GGUF model (token embeddings plus one layer of
/// attention + MLP weights in the `layers.*` naming scheme), loads it via
/// `LoadedModel::from_gguf`, runs `forward_one_token_with_layer`, and copies
/// the result back to host memory for the finiteness check.
#[test]
fn forward_one_token_with_layer_smoke() -> Result<()> {
    // Skip gracefully (return Ok) when no sm_52-capable device is available,
    // so the test suite passes on machines without the target GPU.
    let ctx = cuda_env::ctx_m40()?;
    if let Err(e) = cuda_env::require_sm52(&ctx) {
        eprintln!("{}", e);
        return Ok(());
    }

    // Tiny dims; the attention path requires num_heads * head_dim == d_model.
    let d_model = 8usize;
    let hidden = 16usize;
    let num_heads = 2u32;
    let head_dim = 4u32;
    // Enforce the invariant instead of leaving `num_heads`/`head_dim` as
    // unused (warning-producing) documentation-only variables.
    assert_eq!(
        (num_heads * head_dim) as usize,
        d_model,
        "num_heads * head_dim must equal d_model"
    );

    // Build a minimal GGUF in-memory model with the required tensors.
    let mut gg = GgufModel::new(0);
    gg.metadata.insert(
        "llama.embedding_length".to_string(),
        GgufValue::Scalar(GgufScalar::U32(d_model as u32)),
    );

    // Token embeddings: [vocab, d_model]. Offsets are placeholders here and
    // are patched by `push_tensor` once the weight blob layout is known.
    let vocab = 32u64;
    gg.tensors.push(GgufTensor {
        name: "tok_embeddings.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![vocab, d_model as u64],
        offset: 0,
    });

    // Layer 0 attention weights in the `layers.*` scheme, each [d_model, d_model].
    gg.tensors.push(GgufTensor {
        name: "layers.0.attention.wq.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![d_model as u64, d_model as u64],
        offset: 0,
    });
    gg.tensors.push(GgufTensor {
        name: "layers.0.attention.wk.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![d_model as u64, d_model as u64],
        offset: 0,
    });
    gg.tensors.push(GgufTensor {
        name: "layers.0.attention.wv.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![d_model as u64, d_model as u64],
        offset: 0,
    });
    gg.tensors.push(GgufTensor {
        name: "layers.0.attention.wo.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![d_model as u64, d_model as u64],
        offset: 0,
    });

    // Layer 0 MLP weights: w1/w3 are [d_model, hidden], w2 is [hidden, d_model].
    gg.tensors.push(GgufTensor {
        name: "layers.0.feed_forward.w3.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![d_model as u64, hidden as u64],
        offset: 0,
    });
    gg.tensors.push(GgufTensor {
        name: "layers.0.feed_forward.w1.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![d_model as u64, hidden as u64],
        offset: 0,
    });
    gg.tensors.push(GgufTensor {
        name: "layers.0.feed_forward.w2.weight".into(),
        dtype: GgmlDType::F16,
        shape: vec![hidden as u64, d_model as u64],
        offset: 0,
    });

    // Fill a contiguous weights blob with simple deterministic patterns and
    // record each tensor's byte offset back into `gg`.
    let mut weights: Vec<u8> = Vec::new();
    let mut push_tensor = |name: &str, n_elems: usize, gen: &mut dyn FnMut(usize) -> f32| {
        let off = weights.len() as u64;
        // Fail loudly on a typo'd tensor name instead of silently leaving its
        // offset at 0, which would make it alias the embedding table.
        let t = gg
            .tensors
            .iter_mut()
            .find(|t| t.name == name)
            .unwrap_or_else(|| panic!("tensor {} was not registered in gg.tensors", name));
        t.offset = off;
        let vals: Vec<f32> = (0..n_elems).map(|i| gen(i)).collect();
        weights.extend_from_slice(&halves_from_f32(&vals));
    };

    // tok_embeddings: vocab * d_model
    push_tensor(
        "tok_embeddings.weight",
        (vocab as usize) * d_model,
        &mut |i| ((i as f32) * 0.001).sin(),
    );

    // Attention weights: each d_model * d_model, with distinct value patterns
    // per tensor so a mixed-up tensor mapping would change the output.
    let sq = d_model * d_model;
    push_tensor("layers.0.attention.wq.weight", sq, &mut |i| {
        ((i as f32) * 0.01).sin()
    });
    push_tensor("layers.0.attention.wk.weight", sq, &mut |i| {
        ((i as f32) * 0.02).cos()
    });
    push_tensor("layers.0.attention.wv.weight", sq, &mut |i| {
        ((i as f32) * 0.03).tan().atan()
    });
    push_tensor("layers.0.attention.wo.weight", sq, &mut |i| {
        ((i as f32) * 0.04).sin()
    });

    // MLP weights
    push_tensor(
        "layers.0.feed_forward.w3.weight",
        d_model * hidden,
        &mut |i| ((i as f32) * 0.015).sin(),
    );
    push_tensor(
        "layers.0.feed_forward.w1.weight",
        d_model * hidden,
        &mut |i| ((i as f32) * 0.017).cos(),
    );
    push_tensor(
        "layers.0.feed_forward.w2.weight",
        hidden * d_model,
        &mut |i| ((i as f32) * 0.019).sin(),
    );

    let mut lm = LoadedModel::from_gguf(gg, weights, -1)?; // -1 = auto-select device

    // Allocate KV cache. NOTE(review): the current stub hard-codes (8 heads,
    // 64 dim) and the layout cannot yet be overridden via the public API, so
    // this does NOT match our small dims; tests relying solely on the
    // run_attention layout may be limited until that API exists.
    lm.allocate_kv_cache(8, 1)?;

    // Load the embedding for token 0 into a device buffer of d_model f32s.
    // (On any early `?` error below the device buffers leak; acceptable for a
    // test process that exits immediately.)
    let d_x = lm.cuda.device_malloc(d_model * 4)?;
    unsafe {
        // SAFETY: d_x is a freshly allocated device buffer of d_model * 4
        // bytes, the size this helper writes for a d_model-wide embedding.
        lm.load_token_embedding_f16_to_f32(0, d_x)?;
    }

    // Output buffer, same size as the input activation.
    let d_out = lm.cuda.device_malloc(d_model * 4)?;

    // Run one layer (layer 0, position 0, sequence length 1).
    unsafe {
        // SAFETY: d_x holds d_model valid f32 activations and d_out has room
        // for d_model f32 results; both were allocated above on this context.
        let _ = lm.forward_one_token_with_layer(d_x as *const c_void, 0, 0, 1, d_out)?;
    }

    // Read back the result and assert every element is finite (no NaN/inf
    // from the attention/MLP math).
    let mut out_host = vec![0u8; d_model * 4];
    unsafe {
        // SAFETY: out_host is exactly d_model * 4 bytes, matching the copy size.
        lm.cuda
            .memcpy_d2h(out_host.as_mut_ptr() as *mut c_void, d_out, d_model * 4)?;
    }
    let out_vals: Vec<f32> = out_host
        .chunks_exact(4)
        .map(|ch| f32::from_le_bytes([ch[0], ch[1], ch[2], ch[3]]))
        .collect();
    assert_eq!(out_vals.len(), d_model);
    for (i, v) in out_vals.iter().enumerate() {
        assert!(v.is_finite(), "non-finite at {}: {}", i, v);
    }

    unsafe {
        // SAFETY: both pointers came from device_malloc above and are freed once.
        lm.cuda.device_free(d_x)?;
        lm.cuda.device_free(d_out)?;
    }

    Ok(())
}
0 commit comments