Skip to content

Commit b6f873c

Browse files
authored
feat: add wgebra docs (#7)
1 parent 70166d9 commit b6f873c

File tree

11 files changed

+921
-22
lines changed

11 files changed

+921
-22
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,5 +34,8 @@ build
3434

3535
# Ignore these because they are generated automatically from docs/user_guides/templates
3636
docs/user_guides/wgcore
37+
docs/user_guides/wgebra
38+
docs/user_guides/wgparry
39+
docs/user_guides/wgrapier
3740
docs/user_guides/templates_injected
3841
tmp_*

docs-examples/inject_file/Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs-examples/inject_file/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@ edition = "2021"
66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
77

88
[dependencies]
9-
strinject = { path = "../../../strinject", features = ["download"] }
9+
strinject = { version = "0.2", features = ["download"] }

docs/about_wgmath.mdx

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,3 @@ welcome!
2323

2424
The **wgcore** crate part of the **wgmath** ecosystem exposes a set of proc-macros to facilitate sharing and composing
2525
shaders across Rust libraries.
26-
27-
**wgmath** is developed by the [Dimforge](https://dimforge.com) open-source company. You can support us by sponsoring us
28-
on [GitHub sponsor](https://github.com/sponsors/dimforge).
29-
30-
![dimforge_logo](https://www.dimforge.com/img/logo/logo_dimforge_full)
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
---
2+
id: blas_operations
3+
title: Matrix multiplication, BLAS
4+
sidebar_label: Matrix multiplication, BLAS
5+
---
6+
7+
import Tabs from '@theme/Tabs';
8+
import TabItem from '@theme/TabItem';
9+
10+
## Matrix multiplication
11+
12+
WGebra implements matrix-matrix and matrix-vector multiplications. It also supports 3-tensors where each element along
13+
the third dimension is seen as an individual matrix. These shaders are designed for large matrices and are not
14+
composable. Instead, they expose compute pipelines for dispatching the operation to the GPU.
15+
16+
The non-composable shaders are `Gemm` (for matrix-matrix multiplication) and `Gemv` (for matrix-vector multiplication),
17+
following the well-known BLAS terminology. Each of these shaders actually contains several compute pipelines, also
18+
providing variants where the left-hand-side matrix is transposed, as well as variants with some work-in-progress
19+
optimizations.
20+
21+
:::tip Quantized matrices
22+
WGebra only implements linear algebra on **f32 matrices**. If you are looking for linear algebra on **quantized matrices**
23+
(for example for AI), see the [wgml](https://github.com/dimforge/wgml) crate instead.
24+
:::
25+
26+
<Tabs
27+
groupId="wgebra"
28+
defaultValue="gemm"
29+
values={[
30+
{label: 'gemm.rs', value: 'gemm'},
31+
{label: 'gemv.rs', value: 'gemv'},
32+
]}>
33+
<TabItem value="gemm">
34+
35+
```rust
36+
async fn gpu_gemm() {
37+
let gpu = GpuInstance::new().await.unwrap();
38+
let gemm = super::Gemm::from_device(gpu.device()).unwrap();
39+
let shapes = ViewShapeBuffers::new();
40+
41+
const NROWS: u32 = 256;
42+
const NCOLS: u32 = 256;
43+
44+
// Create some random matrices using nalgebra.
45+
let m1_cpu = DMatrix::<f32>::new_random(NROWS as usize, NCOLS as usize);
46+
let m2_cpu = DMatrix::<f32>::new_random(NCOLS as usize, NROWS as usize);
47+
48+
// Convert our nalgebra matrices to GPU tensors.
49+
let m1 = TensorBuilder::matrix(NROWS, NCOLS, BufferUsages::STORAGE)
50+
.build_init(gpu.device(), m1_cpu.as_slice());
51+
let m2 = TensorBuilder::matrix(NCOLS, NROWS, BufferUsages::STORAGE)
52+
.build_init(gpu.device(), m2_cpu.as_slice());
53+
// GPU matrix that will contain the result.
54+
let result =
55+
TensorBuilder::matrix(NROWS, NROWS, BufferUsages::STORAGE | BufferUsages::COPY_SRC)
56+
.build_init(gpu.device(), lhs_cpu.as_slice());
57+
// Buffer for reading back the operation result into RAM.
58+
let staging = TensorBuilder::matrix(
59+
NROWS,
60+
NROWS,
61+
BufferUsages::MAP_READ | BufferUsages::COPY_DST,
62+
)
63+
.build(gpu.device());
64+
65+
for variant in [
66+
// m1 * m2
67+
GemmVariant::Gemm,
68+
// transpose(m1) * m2
69+
GemmVariant::GemmTr,
70+
// m1 * m2 using experimental optimizations.
71+
GemmVariant::GemmFast,
72+
// transpose(m1) * m2 using experimental optimizations.
73+
GemmVariant::GemmTrFast,
74+
] {
75+
let mut encoder = gpu.device().create_command_encoder(&Default::default());
76+
let mut pass = encoder.compute_pass("test", None);
77+
78+
// Dispatch the matrix multiplication operation for running it on the gpu.
79+
gemm.dispatch_generic(
80+
gpu.device(),
81+
&shapes,
82+
&mut pass,
83+
result.as_embedded_view(),
84+
m1.as_embedded_view(),
85+
m2.as_embedded_view(),
86+
variant,
87+
);
88+
drop(pass); // Ensure the pass is ended before the encoder is borrowed again.
89+
90+
staging.copy_from(&mut encoder, &result);
91+
92+
gpu.queue().submit(Some(encoder.finish()));
93+
94+
// Read the result and compare with the value computed on the CPU.
95+
let gpu_result = staging.read(gpu.device()).await.unwrap();
96+
let cpu_result = match variant {
97+
GemmVariant::Gemm | GemmVariant::GemmFast => &m1_cpu * &m2_cpu,
98+
GemmVariant::GemmTr | GemmVariant::GemmTrFast => m1_cpu.tr_mul(&m2_cpu),
99+
};
100+
101+
let gpu_result = DMatrix::from_vec(NROWS as usize, NROWS as usize, gpu_result);
102+
assert_relative_eq!(gpu_result, cpu_result, epsilon = 1.0e-3);
103+
}
104+
}
105+
```
106+
</TabItem>
107+
108+
109+
<TabItem value="gemv">
110+
111+
```rust
112+
async fn gpu_gemv() {
113+
let gpu = GpuInstance::new().await.unwrap();
114+
let gemv = super::Gemv::from_device(gpu.device()).unwrap();
115+
let shapes = ViewShapeBuffers::new();
116+
117+
const NROWS: u32 = 1024;
118+
const NCOLS: u32 = 1024;
119+
120+
// Create some random matrices/vectors using nalgebra.
121+
let m_cpu = DMatrix::<f32>::new_random(NROWS as usize, NCOLS as usize);
122+
let v_cpu = DVector::<f32>::new_random(NCOLS as usize);
123+
let lhs_cpu = DVector::<f32>::new_random(NROWS as usize);
124+
125+
// Convert our nalgebra matrices/vectors to GPU tensors.
126+
let m = TensorBuilder::matrix(NROWS, NCOLS, BufferUsages::STORAGE)
127+
.build_init(gpu.device(), m_cpu.as_slice());
128+
let v = TensorBuilder::vector(v_cpu.nrows() as u32, BufferUsages::STORAGE)
129+
.build_init(gpu.device(), v_cpu.as_slice());
130+
// GPU vector that will contain the result.
131+
let result = TensorBuilder::vector(NROWS, BufferUsages::STORAGE | BufferUsages::COPY_SRC)
132+
.build_init(gpu.device(), lhs_cpu.as_slice());
133+
// Buffer for reading back the operation result into RAM.
134+
let staging = TensorBuilder::vector(NROWS, BufferUsages::MAP_READ | BufferUsages::COPY_DST)
135+
.build(gpu.device());
136+
137+
for variant in [
138+
// m * v
139+
GemvVariant::Gemv,
140+
// transpose(m) * v
141+
GemvVariant::GemvTr,
142+
// m * v using experimental optimizations.
143+
GemvVariant::GemvFast,
144+
// transpose(m) * v using experimental optimizations.
145+
GemvVariant::GemvTrFast,
146+
] {
147+
let mut encoder = gpu.device().create_command_encoder(&Default::default());
148+
let mut pass = encoder.compute_pass("test", None);
149+
// Dispatch the matrix multiplication operation for running it on the gpu.
150+
gemv.dispatch_generic(gpu.device(), &shapes, &mut pass, &result, &m, &v, variant);
151+
drop(pass); // Ensure the pass is ended before the encoder is borrowed again.
152+
153+
staging.copy_from(&mut encoder, &result);
154+
155+
gpu.queue().submit(Some(encoder.finish()));
156+
157+
// Read the result and compare with the value computed on the CPU.
158+
let gpu_result = staging.read(gpu.device()).await.unwrap();
159+
let cpu_result = match variant {
160+
GemvVariant::Gemv | GemvVariant::GemvFast => &m_cpu * &v_cpu,
161+
GemvVariant::GemvTr | GemvVariant::GemvTrFast => m_cpu.tr_mul(&v_cpu),
162+
};
163+
164+
approx::assert_relative_eq!(DVector::from(gpu_result), cpu_result, epsilon = 1.0e-3);
165+
}
166+
}
167+
```
168+
</TabItem>
169+
</Tabs>
170+
171+
## Componentwise operations
172+
173+
The `OpAssign` shader provides componentwise operations between two vectors. The first (left-hand-side) vector is
174+
overwritten with the result of the operation. This can be used for calculating the **sum** or **difference** of two vectors,
175+
as well as their **componentwise product** and **division**. It can also be configured so that the first vector is simply overwritten
176+
with a **copy** of the second vector.
177+
178+
<Tabs
179+
groupId="wgebra"
180+
defaultValue="op_assign"
181+
values={[
182+
{label: 'op_assign.rs', value: 'op_assign'},
183+
]}>
184+
<TabItem value="op_assign">
185+
186+
```rust
187+
async fn gpu_op_assign() {
188+
let ops = [
189+
// a += b
190+
OpAssignVariant::Add,
191+
// a -= b
192+
OpAssignVariant::Sub,
193+
// a[i] *= b[i]
194+
OpAssignVariant::Mul,
195+
// a[i] /= b[i]
196+
OpAssignVariant::Div,
197+
// a = b
198+
OpAssignVariant::Copy,
199+
];
200+
let gpu = GpuInstance::new().await.unwrap();
201+
let shapes = ViewShapeBuffers::new();
202+
203+
for op in ops {
204+
let op_assign = OpAssign::new(gpu.device(), op).unwrap();
205+
let mut encoder = gpu.device().create_command_encoder(&Default::default());
206+
207+
const LEN: u32 = 1757;
208+
209+
// Generate two random vectors.
210+
let v0 = DVector::from_fn(LEN as usize, |i, _| i as f32 + 0.1);
211+
let v1 = DVector::from_fn(LEN as usize, |i, _| i as f32 * 10.0 + 0.1);
212+
// Convert the vectors to gpu 1-tensors.
213+
// Note that `gpu_v0` is the one that will be overwritten with the result of the operation.
214+
let gpu_v0 = TensorBuilder::vector(LEN, BufferUsages::STORAGE | BufferUsages::COPY_SRC)
215+
.build_init(gpu.device(), v0.as_slice());
216+
let gpu_v1 = TensorBuilder::vector(LEN, BufferUsages::STORAGE)
217+
.build_init(gpu.device(), v1.as_slice());
218+
219+
let mut pass = encoder.compute_pass("test", None);
220+
op_assign.dispatch(gpu.device(), &shapes, &mut pass, &gpu_v0, &gpu_v1);
221+
drop(pass); // Ensure the pass is ended before the encoder is borrowed again.
222+
223+
gpu.queue().submit(Some(encoder.finish()));
224+
}
225+
}
226+
```
227+
</TabItem>
228+
</Tabs>
229+
230+
## Vector reductions
231+
232+
The `Reduce` shader provides calculations that combine all the components of a single vector to compute their
233+
**minimum**, **maximum**, **sum**, **product**, or **squared norm**. The selected operation is specified when
234+
instantiating the shader with `Reduce::new`.
235+
236+
<Tabs
237+
groupId="wgebra"
238+
defaultValue="reductions"
239+
values={[
240+
{label: 'reductions.rs', value: 'reductions'},
241+
]}>
242+
<TabItem value="reductions">
243+
244+
```rust
245+
async fn gpu_reduce() {
246+
let gpu = GpuInstance::new().await.unwrap();
247+
let shapes = ViewShapeBuffers::new();
248+
249+
let ops = [
250+
// The minimum value among all the vector’s elements.
251+
ReduceOp::Min,
252+
// The maximum value among all the vector’s elements.
253+
ReduceOp::Max,
254+
// The sum of all the vector’s elements.
255+
ReduceOp::Sum,
256+
// Squared magnitude of the vector.
257+
ReduceOp::SqNorm,
258+
// The product of all the vector’s elements.
259+
ReduceOp::Prod,
260+
];
261+
262+
for op in ops {
263+
// Instantiate the shader (and compute pipeline) with the desired operation `op`.
264+
let reduce = super::Reduce::new(gpu.device(), op).unwrap();
265+
let mut encoder = gpu.device().create_command_encoder(&Default::default());
266+
267+
const LEN: usize = 345;
268+
let numbers: DVector<f32> = DVector::new_random(LEN);
269+
270+
// Convert the vector to a GPU 1-tensor.
271+
let vector = TensorBuilder::vector(numbers.len() as u32, BufferUsages::STORAGE)
272+
.build_init(gpu.device(), numbers.as_slice());
273+
// A single-element tensor that contains the result of the reduction.
274+
let result = TensorBuilder::scalar(BufferUsages::STORAGE)
275+
.build(gpu.device());
276+
277+
let mut pass = encoder.compute_pass("test", None);
278+
reduce.dispatch(gpu.device(), &shapes, &mut pass, &vector, &result);
279+
drop(pass); // Ensure the pass is ended before the encoder is borrowed again.
280+
281+
gpu.queue().submit(Some(encoder.finish()));
282+
}
283+
}
284+
```
285+
</TabItem>
286+
</Tabs>

0 commit comments

Comments
 (0)