Files
rle-encoded-q4km/benches/matmul.rs
2026-04-12 15:40:19 -07:00

317 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across two weight distributions:
//!
//! | Group | What is timed |
//! |--------------|--------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation for all three paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
//! Consecutive bytes almost never repeat, so each block produces ~128
//! single-byte runs. At 2 bytes per pair that would require ~256 bytes,
//! which exceeds the 128-byte raw payload, so `encode` always keeps these
//! blocks in **raw mode** (IS_RLE = 0). This is representative of typical
//! unstructured LLM weight matrices.
//!
//! **rle_optimal** — every byte in a block's qs field is the same value.
//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
//! and sets IS_RLE = 1. This is the theoretical compression maximum, and
//! is representative of highly sparse or dead-neuron weight matrices.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
dequantize_block_q4k, matmul_q4k_fp16,
rle::{dequantize_block_q4k_rle, encode, matmul_q4k_rle_fp16, BlockQ4KRle},
BlockQ4K, K_SCALE_SIZE, QK_K,
};
// ---------------------------------------------------------------------------
// Minimal 64-bit LCG — no external dependencies needed
// ---------------------------------------------------------------------------
/// Deterministic pseudo-random generator using Knuth / PCG constants.
struct Lcg(u64);
impl Lcg {
    /// Wrap a fixed seed; identical seeds yield identical byte streams.
    fn new(seed: u64) -> Self {
        Self(seed)
    }
    /// Advance the LCG state and return bits 33..41 of the new state.
    ///
    /// The high bits of an LCG are the well-mixed ones, so we shift before
    /// truncating to a byte.
    fn next_u8(&mut self) -> u8 {
        const MUL: u64 = 6_364_136_223_846_793_005;
        const INC: u64 = 1_442_695_040_888_963_407;
        self.0 = self.0.wrapping_mul(MUL).wrapping_add(INC);
        (self.0 >> 33) as u8
    }
}
// ---------------------------------------------------------------------------
// Fixture helpers
// ---------------------------------------------------------------------------
/// Lossily encode a finite, non-subnormal f32 to its fp16 bit pattern.
///
/// Only used for block header fields (d, dmin); values must lie within the
/// fp16 normal range [~6.1e-5, 65504]. No overflow / underflow checks, and
/// the mantissa is truncated (not rounded).
fn f32_to_fp16(f: f32) -> u16 {
    // Both +0.0 and -0.0 map to the all-zero bit pattern.
    if f == 0.0 {
        return 0;
    }
    let bits = f.to_bits();
    // Sign: bit 31 of f32 → bit 15 of fp16.
    let sign = ((bits >> 16) & 0x8000) as u16;
    // Exponent: rebias from f32 (127) to fp16 (15).
    let exp_fp16 = ((bits >> 23) & 0xFF) as i32 - 127 + 15;
    // Mantissa: keep the top 10 of the 23 fraction bits.
    let frac = ((bits >> 13) & 0x03FF) as u16;
    sign | ((exp_fp16 as u16) << 10) | frac
}
/// Build a 12-byte `scales` array where all 8 sub-blocks share the same
/// `scale` and `min` (both must be < 16, matching the test helper in lib.rs).
fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
    let mut out = [0u8; K_SCALE_SIZE];
    // Bytes 0..4 carry the scale, bytes 4..8 the min.
    out[..4].fill(scale);
    out[4..8].fill(min);
    // Bytes 8..12 pack scale (low nibble) and min (high nibble).
    let packed = (scale & 0x0F) | ((min & 0x0F) << 4);
    out[8..12].fill(packed);
    out
}
/// Return `count` blocks whose qs bytes are pseudo-random.
///
/// With uniformly distributed bytes, consecutive bytes match with probability
/// 1/256 ≈ 0.4%, yielding ~128 runs per block. Storing those as (value,
/// count) pairs would need ~256 bytes — more than the 128-byte raw payload —
/// so `encode` will always select **raw mode** (IS_RLE = 0).
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
    let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
    // Shared header fields: every block uses the same scales / d / dmin.
    let scales = make_scales(7, 2);
    let d = f32_to_fp16(0.01);
    let dmin = f32_to_fp16(0.001);
    let mut blocks = Vec::with_capacity(count);
    for _ in 0..count {
        let mut qs = [0u8; QK_K / 2];
        for slot in qs.iter_mut() {
            *slot = rng.next_u8();
        }
        blocks.push(BlockQ4K { d, dmin, scales, qs });
    }
    blocks
}
/// Return `count` blocks where every qs byte is the same value.
///
/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes
/// instead of 128. `encode` will always select **RLE mode** (IS_RLE = 1).
/// Each block uses a fresh pseudo-random byte so no two blocks are identical,
/// avoiding degenerate cache-warm effects across the batch.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
    let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
    // Shared header fields, matching `uniform_blocks`.
    let scales = make_scales(7, 2);
    let d = f32_to_fp16(0.01);
    let dmin = f32_to_fp16(0.001);
    let mut blocks = Vec::with_capacity(count);
    for _ in 0..count {
        // One fresh byte per block, replicated across the whole payload.
        let fill = rng.next_u8();
        blocks.push(BlockQ4K { d, dmin, scales, qs: [fill; QK_K / 2] });
    }
    blocks
}
/// Build a K×N FP16 matrix (raw u16 bits) where every element is 1.0.
fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
    // Convert once, then replicate the bit pattern across all k*n elements.
    let one = f32_to_fp16(1.0);
    vec![one; k * n]
}
// ---------------------------------------------------------------------------
// Group 1 — encode
// ---------------------------------------------------------------------------
/// Number of blocks encoded per iteration in `bench_encode`.
const ENCODE_BATCH: usize = 512;
/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output.
///
/// Both distributions perform the same O(128) run-length scan. The only
/// divergence is at the output stage:
/// * **uniform** — run count > 63 → fall through to memcpy of 128 bytes.
/// * **rle_optimal** — run count = 1 → write 2 bytes and set IS_RLE.
fn bench_encode(c: &mut Criterion) {
    // (benchmark label, input batch) pairs, registered in this order.
    let fixtures = [
        ("uniform", uniform_blocks(ENCODE_BATCH)),
        ("rle_optimal", rle_optimal_blocks(ENCODE_BATCH)),
    ];
    let mut group = c.benchmark_group("encode");
    // Throughput = blocks encoded per second.
    group.throughput(Throughput::Elements(ENCODE_BATCH as u64));
    for (label, batch) in &fixtures {
        group.bench_function(*label, |b| {
            b.iter(|| {
                for blk in batch {
                    black_box(encode(black_box(blk)));
                }
            });
        });
    }
    group.finish();
}
// ---------------------------------------------------------------------------
// Group 2 — dequantize (single block)
// ---------------------------------------------------------------------------
/// Compares the three single-block dequantisation code paths.
///
/// | Variant          | Block type  | Encoding | Extra work vs baseline        |
/// |------------------|-------------|----------|-------------------------------|
/// | `q4k_baseline`   | BlockQ4K    | —        | none                          |
/// | `rle_raw_mode`   | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) |
/// | `rle_rle_mode`   | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf  |
///
/// Throughput is the number of dequantised weights produced per second.
fn bench_dequantize(c: &mut Criterion) {
    // One fixture block per distribution; swap_remove(0) takes ownership.
    let plain_uniform = uniform_blocks(1).swap_remove(0);
    let plain_rle_opt = rle_optimal_blocks(1).swap_remove(0);
    let enc_raw = encode(&plain_uniform); // IS_RLE = 0
    let enc_rle = encode(&plain_rle_opt); // IS_RLE = 1
    // Confirm the fixtures ended up in the right encoding modes.
    assert!(!enc_raw.is_rle(), "uniform block should encode to raw mode");
    assert!(enc_rle.is_rle(), "rle-optimal block should encode to rle mode");
    let mut group = c.benchmark_group("dequantize");
    // Throughput = QK_K (256) weights dequantised per second.
    group.throughput(Throughput::Elements(QK_K as u64));
    group.bench_function("q4k_baseline", |b| {
        b.iter(|| {
            let mut weights = [0.0f32; QK_K];
            dequantize_block_q4k(black_box(&plain_uniform), &mut weights);
            black_box(weights)
        });
    });
    group.bench_function("rle_raw_mode", |b| {
        b.iter(|| {
            let mut weights = [0.0f32; QK_K];
            dequantize_block_q4k_rle(black_box(&enc_raw), &mut weights);
            black_box(weights)
        });
    });
    group.bench_function("rle_rle_mode", |b| {
        b.iter(|| {
            let mut weights = [0.0f32; QK_K];
            dequantize_block_q4k_rle(black_box(&enc_rle), &mut weights);
            black_box(weights)
        });
    });
    group.finish();
}
// ---------------------------------------------------------------------------
// Group 3 — matmul
// ---------------------------------------------------------------------------
/// Matrix size configurations as (M rows, blocks-per-row, N output cols).
///
/// The shared dimension K = blocks_per_row × QK_K.
///
/// | Label  | A shape    | B shape     | total MACs |
/// |--------|------------|-------------|------------|
/// | tiny   | 4 × 256    | 256 × 32    | 32 768     |
/// | medium | 16 × 1024  | 1024 × 64   | 1 048 576  |
/// | large  | 64 × 2048  | 2048 × 128  |16 777 216  |
const CONFIGS: &[(usize, usize, usize)] = &[
    ( 4, 1,  32), // tiny
    (16, 4,  64), // medium
    (64, 8, 128), // large
];
/// Full matrix-multiply benchmark across weight distributions and matrix sizes.
///
/// Four variants per size:
///
/// | Label                | A type      | RLE mode? |
/// |----------------------|-------------|-----------|
/// | `baseline/uniform`   | BlockQ4K    | —         |
/// | `rle/uniform`        | BlockQ4KRle | raw       |
/// | `baseline/rle_opt`   | BlockQ4K    | —         |
/// | `rle/rle_opt`        | BlockQ4KRle | rle       |
///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second, allowing fair cross-size comparison.
///
/// The A and B matrices are pre-built outside `iter()` so fixture construction
/// is not timed. Output Vec allocation/deallocation is included because it is
/// an inherent part of the current API's real-world cost.
fn bench_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("matmul");
    for &(m, blocks_per_row, n) in CONFIGS {
        let k = blocks_per_row * QK_K;
        let size = format!("{m}x{k}x{n}");
        // Build all four A variants and the shared B matrix for this config.
        let q4k_uniform = uniform_blocks(m * blocks_per_row);
        let rle_uniform: Vec<BlockQ4KRle> = q4k_uniform.iter().map(encode).collect();
        let q4k_rle_opt = rle_optimal_blocks(m * blocks_per_row);
        let rle_rle_opt: Vec<BlockQ4KRle> = q4k_rle_opt.iter().map(encode).collect();
        let b_mat = fp16_ones(k, n);
        // Set throughput (multiply-accumulates) for all four benchmarks.
        group.throughput(Throughput::Elements((m * k * n) as u64));
        group.bench_function(format!("baseline/uniform/{size}"), |bench| {
            bench.iter(|| {
                matmul_q4k_fp16(black_box(&q4k_uniform), black_box(&b_mat), m, k, n)
            });
        });
        group.bench_function(format!("rle/uniform/{size}"), |bench| {
            bench.iter(|| {
                matmul_q4k_rle_fp16(black_box(&rle_uniform), black_box(&b_mat), m, k, n)
            });
        });
        group.bench_function(format!("baseline/rle_opt/{size}"), |bench| {
            bench.iter(|| {
                matmul_q4k_fp16(black_box(&q4k_rle_opt), black_box(&b_mat), m, k, n)
            });
        });
        group.bench_function(format!("rle/rle_opt/{size}"), |bench| {
            bench.iter(|| {
                matmul_q4k_rle_fp16(black_box(&rle_rle_opt), black_box(&b_mat), m, k, n)
            });
        });
    }
    group.finish();
}
// ---------------------------------------------------------------------------
// Registration
// ---------------------------------------------------------------------------
// Criterion entry point: registers the three benchmark groups in run order
// and expands to the harness `main` function.
criterion_group!(benches, bench_encode, bench_dequantize, bench_matmul);
criterion_main!(benches);