Files
rle-encoded-q4km/benches/matmul.rs
2026-04-12 21:26:36 -07:00

358 lines
14 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! # Benchmark: BlockQ4K vs BlockQ4KRle
//!
//! Measures three operations across three weight distributions, encoded with
//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
//! runs to use RLE mode).
//!
//! | Group | What is timed |
//! |--------------|-----------------------------------------------------|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
//! | `dequantize` | Single-block dequantisation across all four paths |
//! | `matmul` | Full A×B multiply at three matrix sizes |
//!
//! ## Weight distributions
//!
//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
//!
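//! **rle_optimal** — every qs byte in a block is the same value, so all low
//! nibbles are identical and all high nibbles are identical (the two values
//! may differ from each other, because the fill byte is arbitrary). This gives
//! 100 % coverage and just 16 nibble entries. This is the theoretical RLE
//! maximum and represents highly structured weight blocks.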
//!
//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %;
//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
//! threshold. Used only in the `dequantize` group to benchmark the raw path.
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use matrix_testing::{
dequantize_block_q4k, matmul_q4k_fp16,
rle::{dequantize_block_q4k_rle, encode, matmul_q4k_rle_fp16, BlockQ4KRle},
BlockQ4K, K_SCALE_SIZE, QK_K,
};
// ---------------------------------------------------------------------------
// Minimal 64-bit LCG — no external dependencies needed
// ---------------------------------------------------------------------------
/// Tiny deterministic 64-bit linear congruential generator.
///
/// The wrapped `u64` is the full generator state. Every step computes
/// `state * MULTIPLIER + INCREMENT` with wrapping arithmetic, so a given seed
/// always reproduces the same byte sequence — fixtures stay identical from
/// run to run without pulling in an RNG crate.
struct Lcg(u64);

impl Lcg {
    /// Multiplier applied to the state on every step.
    const MULTIPLIER: u64 = 6_364_136_223_846_793_005;
    /// Additive constant applied to the state on every step.
    const INCREMENT: u64 = 1_442_695_040_888_963_407;

    /// Creates a generator whose initial state is exactly `seed`.
    fn new(seed: u64) -> Self {
        Lcg(seed)
    }

    /// Advances the state by one step and returns bits 33..=40 of the new
    /// state as a byte (the low 33 bits are discarded).
    fn next_u8(&mut self) -> u8 {
        let advanced = self
            .0
            .wrapping_mul(Self::MULTIPLIER)
            .wrapping_add(Self::INCREMENT);
        self.0 = advanced;
        (advanced >> 33) as u8
    }
}
// ---------------------------------------------------------------------------
// Fixture helpers
// ---------------------------------------------------------------------------
/// Convert an `f32` to the bit pattern of an IEEE-754 half-precision float,
/// truncating (not rounding) the mantissa.
///
/// Intended only for the block header fields (`d`, `dmin`) of the fixtures.
/// The input must be finite and within the fp16 *normal* range
/// [~6.1e-5, 65504]: the exponent is re-biased without any overflow or
/// underflow check, so values outside that range produce a meaningless bit
/// pattern rather than infinity or a subnormal.
///
/// Any zero (including `-0.0`) maps to `0`.
fn f32_to_fp16(f: f32) -> u16 {
    if f == 0.0 {
        return 0;
    }

    let raw = f.to_bits();

    // Bit 31 of the f32 becomes bit 15 of the fp16.
    let sign_bit = ((raw >> 16) & 0x8000) as u16;
    // Re-bias the 8-bit exponent from 127 (f32) to 15 (fp16): -127 + 15 = -112.
    let rebiased = ((raw >> 23) & 0xFF) as i32 - 112;
    // Keep the top 10 of the 23 mantissa bits; the low 13 are dropped.
    let fraction = ((raw >> 13) & 0x03FF) as u16;

    sign_bit | ((rebiased as u16) << 10) | fraction
}
/// Build a `scales` array in which every sub-block uses the same `scale` and
/// `min`.
///
/// Layout written here:
/// * bytes `0..4`  — `scale`
/// * bytes `4..8`  — `min`
/// * bytes `8..12` — low nibble of `scale` in bits 0-3, low nibble of `min`
///   in bits 4-7
///
/// Both arguments must be < 16 (matching the test helper in lib.rs): the
/// first eight bytes store them unmasked, while the last four keep only the
/// low nibble of each. Any bytes past index 11 stay zero.
fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
    let packed_pair = (scale & 0x0F) | ((min & 0x0F) << 4);

    let mut packed = [0u8; K_SCALE_SIZE];
    packed[..4].fill(scale);
    packed[4..8].fill(min);
    packed[8..12].fill(packed_pair);
    packed
}
/// Produce `count` blocks whose `qs` bytes come from a fixed-seed [`Lcg`].
///
/// The seed is constant, so every call yields the same sequence of blocks.
/// All blocks share the same header: `d` = fp16(0.01), `dmin` = fp16(0.001),
/// and `make_scales(7, 2)`.
///
/// With pseudo-random data, adjacent nibbles match with probability 1/16,
/// giving each block roughly 12 % nibble coverage. At `min_coverage = 0.01`
/// these blocks are expected to encode to **RLE mode** (IS_RLE = 1) with
/// ~230–240 nibble entries per block; `bench_dequantize` asserts the RLE-mode
/// part for the first block.
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
    let scales = make_scales(7, 2);
    let (d, dmin) = (f32_to_fp16(0.01), f32_to_fp16(0.001));
    let mut lcg = Lcg::new(0xDEAD_BEEF_CAFE_1234);

    let mut blocks = Vec::with_capacity(count);
    for _ in 0..count {
        let mut qs = [0u8; QK_K / 2];
        // Bytes are drawn in index order, one generator step per byte.
        qs.fill_with(|| lcg.next_u8());
        blocks.push(BlockQ4K { d, dmin, scales, qs });
    }
    blocks
}
/// Produce `count` blocks in which all `qs` bytes of a block hold one value.
///
/// Within a block every low nibble is therefore identical and every high
/// nibble is identical (the two may differ from each other, since the fill
/// byte is arbitrary). The fixture is intended to give 100 % nibble coverage,
/// so blocks always encode to **RLE mode** with exactly 16 entries
/// (256 nibbles / 16 per entry); `bench_dequantize` asserts the RLE-mode part.
///
/// Each block draws a fresh byte from a fixed-seed [`Lcg`] to avoid
/// cache-warm artifacts. All blocks share the same header: `d` = fp16(0.01),
/// `dmin` = fp16(0.001), and `make_scales(7, 2)`.
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
    let scales = make_scales(7, 2);
    let d = f32_to_fp16(0.01);
    let dmin = f32_to_fp16(0.001);
    let mut lcg = Lcg::new(0x1234_5678_9ABC_DEF0);

    // One generator step per block, taken lazily in block order.
    std::iter::repeat_with(|| lcg.next_u8())
        .take(count)
        .map(|fill| BlockQ4K {
            d,
            dmin,
            scales,
            qs: [fill; QK_K / 2],
        })
        .collect()
}
/// Allocate a `k * n`-element buffer of fp16 bit patterns, all equal to 1.0.
///
/// Serves as the dense K×N right-hand operand in the matmul benchmarks.
fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
    let element_count = k * n;
    let one = f32_to_fp16(1.0);
    vec![one; element_count]
}
/// Build a single block whose nibble stream never repeats a value twice in a
/// row (in output-sequential order), i.e. 0 % nibble coverage.
///
/// For byte `i`, the low nibble is `i % 16` and the high nibble is
/// `(i + 8) % 16`. Adding 8 modulo 16 just flips bit 3, so the high nibble is
/// computed as `lo ^ 8`.
///
/// Within every 32-byte group the lo and hi streams each step through all 16
/// values twice without repetition, and across group boundaries the last
/// nibble of one stream differs from the first nibble of the next.
///
/// At any `min_coverage > 0.0`, `encode` is expected to keep this block in
/// **raw mode**; `bench_dequantize` asserts this.
fn zero_coverage_block() -> BlockQ4K {
    let mut qs = [0u8; QK_K / 2];
    // `(0..16).cycle()` yields 0, 1, …, 15, 0, 1, … — exactly `i % 16`.
    for (slot, lo) in qs.iter_mut().zip((0u8..16).cycle()) {
        let hi = lo ^ 0x8;
        *slot = (hi << 4) | lo;
    }

    BlockQ4K {
        d: f32_to_fp16(0.01),
        dmin: f32_to_fp16(0.001),
        scales: make_scales(7, 2),
        qs,
    }
}
// ---------------------------------------------------------------------------
// Group 1 — encode
// ---------------------------------------------------------------------------
/// Number of blocks encoded per iteration in `bench_encode`.
///
/// The same value is passed to `Throughput::Elements` in that benchmark, so
/// the "encode" group reports throughput in blocks per second.
const ENCODE_BATCH: usize = 512;
/// Benchmark group `encode`: time `encode` over a batch of `ENCODE_BATCH`
/// blocks at `min_coverage = 0.01`.
///
/// Both fixtures go through the same O(256) nibble scan; what differs is the
/// output that has to be written:
/// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written.
/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
///
/// Fixtures are built before timing starts. Throughput is reported as blocks
/// encoded per second.
fn bench_encode(c: &mut Criterion) {
    let fixtures = [
        ("uniform", uniform_blocks(ENCODE_BATCH)),
        ("rle_optimal", rle_optimal_blocks(ENCODE_BATCH)),
    ];

    let mut group = c.benchmark_group("encode");
    // One element = one encoded block.
    group.throughput(Throughput::Elements(ENCODE_BATCH as u64));

    for (name, blocks) in &fixtures {
        group.bench_function(*name, |bencher| {
            bencher.iter(|| {
                blocks.iter().for_each(|blk| {
                    // black_box on both sides keeps the call from being
                    // optimised away or specialised on the input.
                    black_box(encode(black_box(blk), 0.01));
                });
            });
        });
    }

    group.finish();
}
// ---------------------------------------------------------------------------
// Group 2 — dequantize (single block)
// ---------------------------------------------------------------------------
/// Compares four single-block dequantisation code paths.
///
/// | Variant | Block type | Encoding | IS_RLE | Entries |
/// |--------------------|-------------|-----------|--------|---------|
/// | `q4k_baseline` | BlockQ4K | — | — | — |
/// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — |
/// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 |
/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 |
///
/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
/// stays in raw mode at any positive threshold.
/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
/// representative of actual trained Q4_K weight blocks.
/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
///
/// Throughput is the number of dequantised weights produced per second.
fn bench_dequantize(c: &mut Criterion) {
let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
let q4k_zero_cov = zero_coverage_block();
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage)
let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage)
let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage)
assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode");
assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold");
assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode");
let mut group = c.benchmark_group("dequantize");
// Throughput = QK_K (256) weights dequantised per second.
group.throughput(Throughput::Elements(QK_K as u64));
group.bench_function("q4k_baseline", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
black_box(out)
});
});
group.bench_function("rle_raw_mode", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_raw), &mut out);
black_box(out)
});
});
group.bench_function("rle_sparse", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
black_box(out)
});
});
group.bench_function("rle_dense", |b| {
b.iter(|| {
let mut out = [0.0f32; QK_K];
dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
black_box(out)
});
});
group.finish();
}
// ---------------------------------------------------------------------------
// Group 3 — matmul
// ---------------------------------------------------------------------------
/// Matrix size configurations as (M rows, blocks-per-row, N output cols).
///
/// The shared dimension K = blocks_per_row × QK_K. `bench_matmul` builds
/// `M × blocks_per_row` blocks for the A operand and a K × N fp16 matrix for
/// the B operand from each entry.
///
/// The shapes and MAC counts below assume QK_K = 256.
///
/// | Label | A shape | B shape | total MACs |
/// |--------|------------|-------------|------------|
/// | tiny | 4 × 256 | 256 × 32 | 32 768 |
/// | medium | 16 × 1024 | 1024 × 64 | 1 048 576 |
/// | large | 64 × 2048 | 2048 × 128 |16 777 216 |
const CONFIGS: &[(usize, usize, usize)] = &[
( 4, 1, 32), // tiny
(16, 4, 64), // medium
(64, 8, 128), // large
];
/// Benchmark group `matmul`: full A×B multiply for every entry of `CONFIGS`,
/// across two weight distributions and both block formats.
///
/// Four variants are registered per size (`min_coverage = 0.01`), in this
/// order:
///
/// | Label                | A type      | IS_RLE | Entries/block |
/// |----------------------|-------------|--------|---------------|
/// | `baseline/uniform`   | BlockQ4K    | —      | —             |
/// | `rle/uniform`        | BlockQ4KRle | 1      | ~235          |
/// | `baseline/rle_opt`   | BlockQ4K    | —      | —             |
/// | `rle/rle_opt`        | BlockQ4KRle | 1      | 16            |
///
/// Each benchmark ID is suffixed with the size as `{M}x{K}x{N}`.
///
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
/// second, which allows fair comparison across sizes.
///
/// The A and B operands are built outside `iter()`, so fixture construction
/// is not timed. Whatever the matmul functions allocate for their result is
/// timed, because it is an inherent part of the current API's real-world
/// cost.
fn bench_matmul(c: &mut Criterion) {
    let mut group = c.benchmark_group("matmul");

    for (m, blocks_per_row, n) in CONFIGS.iter().copied() {
        let k = blocks_per_row * QK_K;
        let label = format!("{m}x{k}x{n}");
        let block_count = m * blocks_per_row;

        // Fixtures for this size: the shared right-hand operand, then the
        // left-hand operand in both formats for each distribution.
        let rhs = fp16_ones(k, n);
        let uniform_q4k: Vec<BlockQ4K> = uniform_blocks(block_count);
        let optimal_q4k: Vec<BlockQ4K> = rle_optimal_blocks(block_count);
        let uniform_rle: Vec<BlockQ4KRle> =
            uniform_q4k.iter().map(|blk| encode(blk, 0.01)).collect();
        let optimal_rle: Vec<BlockQ4KRle> =
            optimal_q4k.iter().map(|blk| encode(blk, 0.01)).collect();

        // One element = one multiply-accumulate; applies to the four
        // benchmarks registered below.
        group.throughput(Throughput::Elements((m * k * n) as u64));

        group.bench_function(format!("baseline/uniform/{label}"), |bencher| {
            bencher.iter(|| {
                matmul_q4k_fp16(black_box(&uniform_q4k), black_box(&rhs), m, k, n)
            });
        });

        group.bench_function(format!("rle/uniform/{label}"), |bencher| {
            bencher.iter(|| {
                matmul_q4k_rle_fp16(black_box(&uniform_rle), black_box(&rhs), m, k, n)
            });
        });

        group.bench_function(format!("baseline/rle_opt/{label}"), |bencher| {
            bencher.iter(|| {
                matmul_q4k_fp16(black_box(&optimal_q4k), black_box(&rhs), m, k, n)
            });
        });

        group.bench_function(format!("rle/rle_opt/{label}"), |bencher| {
            bencher.iter(|| {
                matmul_q4k_rle_fp16(black_box(&optimal_rle), black_box(&rhs), m, k, n)
            });
        });
    }

    group.finish();
}
// ---------------------------------------------------------------------------
// Registration
// ---------------------------------------------------------------------------
// Collect the three benchmark functions into the `benches` group, then hand
// that group to Criterion's harness entry-point macro.
criterion_group!(benches, bench_encode, bench_dequantize, bench_matmul);
criterion_main!(benches);