2026-04-12 15:40:19 -07:00
|
|
|
|
//! # Benchmark: BlockQ4K vs BlockQ4KRle
|
|
|
|
|
|
//!
|
2026-04-12 21:26:36 -07:00
|
|
|
|
//! Measures three operations across three weight distributions, encoded with
|
|
|
|
|
|
//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
|
|
|
|
|
|
//! runs to use RLE mode).
|
2026-04-12 15:40:19 -07:00
|
|
|
|
//!
|
2026-04-12 21:26:36 -07:00
|
|
|
|
//! | Group | What is timed |
|
|
|
|
|
|
//! |--------------|-----------------------------------------------------|
|
|
|
|
|
|
//! | `encode` | BlockQ4K → BlockQ4KRle for a batch of 512 blocks |
|
|
|
|
|
|
//! | `dequantize` | Single-block dequantisation across all four paths |
|
|
|
|
|
|
//! | `matmul` | Full A×B multiply at three matrix sizes |
|
2026-04-12 15:40:19 -07:00
|
|
|
|
//!
|
|
|
|
|
|
//! ## Weight distributions
|
|
|
|
|
|
//!
|
2026-04-12 21:26:36 -07:00
|
|
|
|
//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
|
|
|
|
|
|
//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
|
|
|
|
|
|
//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
|
|
|
|
|
|
//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
|
2026-04-12 15:40:19 -07:00
|
|
|
|
//!
|
2026-04-12 21:26:36 -07:00
|
|
|
|
//! **rle_optimal** — every qs byte in a block is the same value, so all low
//! nibbles are identical and all high nibbles are identical (the two values may
//! differ), giving 100 % coverage and just 16 nibble entries. This is the
|
|
|
|
|
|
//! theoretical RLE maximum and represents highly structured weight blocks.
|
|
|
|
|
|
//!
|
|
|
|
|
|
//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
|
|
|
|
|
|
//! nibbles (in output-sequential order) are ever equal. Coverage = 0 %;
|
|
|
|
|
|
//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
|
|
|
|
|
|
//! threshold. Used only in the `dequantize` group to benchmark the raw path.
|
2026-04-12 15:40:19 -07:00
|
|
|
|
|
|
|
|
|
|
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
|
|
|
|
|
|
use matrix_testing::{
|
|
|
|
|
|
dequantize_block_q4k, matmul_q4k_fp16,
|
|
|
|
|
|
rle::{dequantize_block_q4k_rle, encode, matmul_q4k_rle_fp16, BlockQ4KRle},
|
|
|
|
|
|
BlockQ4K, K_SCALE_SIZE, QK_K,
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Minimal 64-bit LCG — no external dependencies needed
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/// Small deterministic 64-bit linear congruential generator.
///
/// The state advances as `state * MULTIPLIER + INCREMENT` with wrapping
/// arithmetic, so a given seed always yields the same byte sequence. This
/// keeps the benchmark fixtures reproducible without an external RNG crate.
struct Lcg(u64);

impl Lcg {
    /// Multiplier of the recurrence (the Knuth / PCG 64-bit constant).
    const MULTIPLIER: u64 = 6_364_136_223_846_793_005;
    /// Additive increment of the recurrence.
    const INCREMENT: u64 = 1_442_695_040_888_963_407;

    /// Create a generator whose initial state is exactly `seed`.
    fn new(seed: u64) -> Self {
        Lcg(seed)
    }

    /// Advance the state by one step and return one output byte.
    ///
    /// The byte is taken from bits 33..=40 of the *new* state; the low bits
    /// of the state are skipped.
    fn next_u8(&mut self) -> u8 {
        let advanced = self
            .0
            .wrapping_mul(Self::MULTIPLIER)
            .wrapping_add(Self::INCREMENT);
        self.0 = advanced;
        (advanced >> 33) as u8
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Fixture helpers
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/// Lossily convert an `f32` to its IEEE 754 binary16 (fp16) bit pattern.
///
/// Used for block header fields (`d`, `dmin`). The mantissa is truncated
/// (not rounded) to 10 bits, so values inside the fp16 normal range
/// [~6.1e-5, 65504] are encoded with at most one ulp of error.
///
/// Inputs outside that range are mapped to a well-defined pattern rather than
/// letting the exponent spill into neighbouring bit fields:
///
/// * NaN → a quiet NaN carrying the input's sign bit.
/// * Magnitude too large for fp16 (including ±infinity) → ±infinity.
/// * Magnitude below the smallest fp16 normal (including ±0 and f32
///   subnormals) → zero with the input's sign (flush-to-zero; fp16
///   subnormals are never produced).
fn f32_to_fp16(f: f32) -> u16 {
    const F16_INFINITY: u16 = 0x7C00;
    const F16_QUIET_NAN: u16 = 0x7E00;

    let bits = f.to_bits();
    let sign = ((bits >> 31) as u16) << 15;

    if f.is_nan() {
        return sign | F16_QUIET_NAN;
    }

    // Re-bias the exponent from f32 (bias 127) to fp16 (bias 15).
    let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15;

    if exp <= 0 {
        // Too small for an fp16 normal number: flush to signed zero.
        return sign;
    }
    if exp >= 31 {
        // Exponent field would be all ones or wider: saturate to infinity.
        return sign | F16_INFINITY;
    }

    // Keep the top 10 of the 23 mantissa bits (truncation).
    let mantissa = ((bits & 0x007F_FFFF) >> 13) as u16;
    sign | ((exp as u16) << 10) | mantissa
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Build a 12-byte `scales` array where all 8 sub-blocks share the same
|
|
|
|
|
|
/// `scale` and `min` (both must be < 16, matching the test helper in lib.rs).
|
|
|
|
|
|
fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
|
|
|
|
|
|
let mut s = [0u8; K_SCALE_SIZE];
|
|
|
|
|
|
for j in 0..4 {
|
|
|
|
|
|
s[j] = scale;
|
|
|
|
|
|
s[j + 4] = min;
|
|
|
|
|
|
}
|
|
|
|
|
|
for j in 8..12 {
|
|
|
|
|
|
s[j] = (scale & 0x0F) | ((min & 0x0F) << 4);
|
|
|
|
|
|
}
|
|
|
|
|
|
s
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Return `count` blocks whose qs bytes are pseudo-random (LCG).
|
2026-04-12 15:40:19 -07:00
|
|
|
|
///
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Adjacent nibbles match with probability 1/16, giving each block roughly
|
|
|
|
|
|
/// 12 % nibble coverage. At `min_coverage = 0.01` these blocks encode to
|
|
|
|
|
|
/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
|
2026-04-12 15:40:19 -07:00
|
|
|
|
fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
|
|
|
|
|
|
let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
|
|
|
|
|
|
let scales = make_scales(7, 2);
|
|
|
|
|
|
let d = f32_to_fp16(0.01);
|
|
|
|
|
|
let dmin = f32_to_fp16(0.001);
|
|
|
|
|
|
(0..count)
|
|
|
|
|
|
.map(|_| {
|
|
|
|
|
|
let mut qs = [0u8; QK_K / 2];
|
|
|
|
|
|
for b in qs.iter_mut() {
|
|
|
|
|
|
*b = rng.next_u8();
|
|
|
|
|
|
}
|
|
|
|
|
|
BlockQ4K { d, dmin, scales, qs }
|
|
|
|
|
|
})
|
|
|
|
|
|
.collect()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Return `count` blocks where every qs byte is the same value.
|
|
|
|
|
|
///
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// All 256 nibbles are identical → 100 % nibble coverage → always **RLE mode**
|
|
|
|
|
|
/// with exactly 16 entries (256 nibbles / 16 per entry).
|
|
|
|
|
|
/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
|
2026-04-12 15:40:19 -07:00
|
|
|
|
fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
|
|
|
|
|
|
let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
|
|
|
|
|
|
let scales = make_scales(7, 2);
|
|
|
|
|
|
let d = f32_to_fp16(0.01);
|
|
|
|
|
|
let dmin = f32_to_fp16(0.001);
|
|
|
|
|
|
(0..count)
|
|
|
|
|
|
.map(|_| {
|
|
|
|
|
|
let byte = rng.next_u8();
|
|
|
|
|
|
BlockQ4K { d, dmin, scales, qs: [byte; QK_K / 2] }
|
|
|
|
|
|
})
|
|
|
|
|
|
.collect()
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Build a K×N FP16 matrix (raw u16 bits) where every element is 1.0.
|
|
|
|
|
|
fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
|
|
|
|
|
|
vec![f32_to_fp16(1.0); k * n]
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Build one block whose nibbles cycle so that no two consecutive nibbles
|
|
|
|
|
|
/// (in output-sequential order) are ever equal → 0 % nibble coverage.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// Lo nibble of byte `i` = `i % 16`; hi nibble = `(i + 8) % 16`.
|
|
|
|
|
|
/// Within every 32-byte group the lo and hi streams each visit all 16 values
|
|
|
|
|
|
/// twice without repetition, and across group boundaries the last nibble of
|
|
|
|
|
|
/// one stream differs from the first nibble of the next.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// At any `min_coverage > 0.0`, `encode` keeps this block in **raw mode**.
|
|
|
|
|
|
fn zero_coverage_block() -> BlockQ4K {
|
|
|
|
|
|
let scales = make_scales(7, 2);
|
|
|
|
|
|
let d = f32_to_fp16(0.01);
|
|
|
|
|
|
let dmin = f32_to_fp16(0.001);
|
|
|
|
|
|
let mut qs = [0u8; QK_K / 2];
|
|
|
|
|
|
for (i, b) in qs.iter_mut().enumerate() {
|
|
|
|
|
|
let lo = (i % 16) as u8;
|
|
|
|
|
|
let hi = ((i + 8) % 16) as u8;
|
|
|
|
|
|
*b = lo | (hi << 4);
|
|
|
|
|
|
}
|
|
|
|
|
|
BlockQ4K { d, dmin, scales, qs }
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2026-04-12 15:40:19 -07:00
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Group 1 — encode
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/// Number of blocks encoded per iteration in `bench_encode`.
|
|
|
|
|
|
const ENCODE_BATCH: usize = 512;
|
|
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
|
2026-04-12 15:40:19 -07:00
|
|
|
|
///
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Both distributions perform the same O(256) nibble scan. The output differs:
|
|
|
|
|
|
/// * **uniform** — ~12 % coverage → RLE mode, ~230–240 entries written.
|
|
|
|
|
|
/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
|
2026-04-12 15:40:19 -07:00
|
|
|
|
fn bench_encode(c: &mut Criterion) {
|
|
|
|
|
|
let uniform = uniform_blocks(ENCODE_BATCH);
|
|
|
|
|
|
let rle_opt = rle_optimal_blocks(ENCODE_BATCH);
|
|
|
|
|
|
|
|
|
|
|
|
let mut group = c.benchmark_group("encode");
|
|
|
|
|
|
// Throughput = blocks encoded per second.
|
|
|
|
|
|
group.throughput(Throughput::Elements(ENCODE_BATCH as u64));
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function("uniform", |b| {
|
|
|
|
|
|
b.iter(|| {
|
|
|
|
|
|
for blk in &uniform {
|
2026-04-12 21:26:36 -07:00
|
|
|
|
black_box(encode(black_box(blk), 0.01));
|
2026-04-12 15:40:19 -07:00
|
|
|
|
}
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function("rle_optimal", |b| {
|
|
|
|
|
|
b.iter(|| {
|
|
|
|
|
|
for blk in &rle_opt {
|
2026-04-12 21:26:36 -07:00
|
|
|
|
black_box(encode(black_box(blk), 0.01));
|
2026-04-12 15:40:19 -07:00
|
|
|
|
}
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.finish();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Group 2 — dequantize (single block)
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Compares four single-block dequantisation code paths.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// | Variant | Block type | Encoding | IS_RLE | Entries |
|
|
|
|
|
|
/// |--------------------|-------------|-----------|--------|---------|
|
|
|
|
|
|
/// | `q4k_baseline` | BlockQ4K | — | — | — |
|
|
|
|
|
|
/// | `rle_raw_mode` | BlockQ4KRle | raw | 0 | — |
|
|
|
|
|
|
/// | `rle_sparse` | BlockQ4KRle | RLE | 1 | ~235 |
|
|
|
|
|
|
/// | `rle_dense` | BlockQ4KRle | RLE | 1 | 16 |
|
2026-04-12 15:40:19 -07:00
|
|
|
|
///
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
|
|
|
|
|
|
/// stays in raw mode at any positive threshold.
|
|
|
|
|
|
/// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
|
|
|
|
|
|
/// representative of actual trained Q4_K weight blocks.
|
|
|
|
|
|
/// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
|
2026-04-12 15:40:19 -07:00
|
|
|
|
///
|
|
|
|
|
|
/// Throughput is the number of dequantised weights produced per second.
|
|
|
|
|
|
fn bench_dequantize(c: &mut Criterion) {
|
2026-04-12 21:26:36 -07:00
|
|
|
|
let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
|
|
|
|
|
|
let q4k_zero_cov = zero_coverage_block();
|
|
|
|
|
|
let q4k_uniform = uniform_blocks(1).into_iter().next().unwrap();
|
|
|
|
|
|
let q4k_rle_opt = rle_optimal_blocks(1).into_iter().next().unwrap();
|
2026-04-12 15:40:19 -07:00
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
let rle_raw = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0 (0 % coverage)
|
|
|
|
|
|
let rle_sparse = encode(&q4k_uniform, 0.01); // IS_RLE = 1 (~12 % coverage)
|
|
|
|
|
|
let rle_dense = encode(&q4k_rle_opt, 0.01); // IS_RLE = 1 (100 % coverage)
|
2026-04-12 15:40:19 -07:00
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
assert!(!rle_raw.is_rle(), "zero-coverage block must be raw mode");
|
|
|
|
|
|
assert!(rle_sparse.is_rle(), "uniform block must be RLE at 0.01 threshold");
|
|
|
|
|
|
assert!(rle_dense.is_rle(), "rle-optimal block must be RLE mode");
|
2026-04-12 15:40:19 -07:00
|
|
|
|
|
|
|
|
|
|
let mut group = c.benchmark_group("dequantize");
|
|
|
|
|
|
// Throughput = QK_K (256) weights dequantised per second.
|
|
|
|
|
|
group.throughput(Throughput::Elements(QK_K as u64));
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function("q4k_baseline", |b| {
|
|
|
|
|
|
b.iter(|| {
|
|
|
|
|
|
let mut out = [0.0f32; QK_K];
|
2026-04-12 21:26:36 -07:00
|
|
|
|
dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
|
2026-04-12 15:40:19 -07:00
|
|
|
|
black_box(out)
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function("rle_raw_mode", |b| {
|
|
|
|
|
|
b.iter(|| {
|
|
|
|
|
|
let mut out = [0.0f32; QK_K];
|
|
|
|
|
|
dequantize_block_q4k_rle(black_box(&rle_raw), &mut out);
|
|
|
|
|
|
black_box(out)
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
2026-04-12 21:26:36 -07:00
|
|
|
|
group.bench_function("rle_sparse", |b| {
|
|
|
|
|
|
b.iter(|| {
|
|
|
|
|
|
let mut out = [0.0f32; QK_K];
|
|
|
|
|
|
dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
|
|
|
|
|
|
black_box(out)
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function("rle_dense", |b| {
|
2026-04-12 15:40:19 -07:00
|
|
|
|
b.iter(|| {
|
|
|
|
|
|
let mut out = [0.0f32; QK_K];
|
2026-04-12 21:26:36 -07:00
|
|
|
|
dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
|
2026-04-12 15:40:19 -07:00
|
|
|
|
black_box(out)
|
|
|
|
|
|
});
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.finish();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Group 3 — matmul
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
/// Matrix size configurations as (M rows, blocks-per-row, N output cols).
|
|
|
|
|
|
///
|
|
|
|
|
|
/// The shared dimension K = blocks_per_row × QK_K.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// | Label | A shape | B shape | total MACs |
|
|
|
|
|
|
/// |--------|------------|-------------|------------|
|
|
|
|
|
|
/// | tiny | 4 × 256 | 256 × 32 | 32 768 |
|
|
|
|
|
|
/// | medium | 16 × 1024 | 1024 × 64 | 1 048 576 |
|
|
|
|
|
|
/// | large | 64 × 2048 | 2048 × 128 |16 777 216 |
|
|
|
|
|
|
const CONFIGS: &[(usize, usize, usize)] = &[
|
|
|
|
|
|
( 4, 1, 32), // tiny
|
|
|
|
|
|
(16, 4, 64), // medium
|
|
|
|
|
|
(64, 8, 128), // large
|
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
|
|
/// Full matrix-multiply benchmark across weight distributions and matrix sizes.
|
|
|
|
|
|
///
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// Four variants per size (`min_coverage = 0.01`):
|
2026-04-12 15:40:19 -07:00
|
|
|
|
///
|
2026-04-12 21:26:36 -07:00
|
|
|
|
/// | Label | A type | IS_RLE | Entries/block |
|
|
|
|
|
|
/// |----------------------|-------------|--------|---------------|
|
|
|
|
|
|
/// | `baseline/uniform` | BlockQ4K | — | — |
|
|
|
|
|
|
/// | `rle/uniform` | BlockQ4KRle | 1 | ~235 |
|
|
|
|
|
|
/// | `baseline/rle_opt` | BlockQ4K | — | — |
|
|
|
|
|
|
/// | `rle/rle_opt` | BlockQ4KRle | 1 | 16 |
|
2026-04-12 15:40:19 -07:00
|
|
|
|
///
|
|
|
|
|
|
/// Throughput is reported as multiply-accumulate operations (M × K × N) per
|
|
|
|
|
|
/// second, allowing fair cross-size comparison.
|
|
|
|
|
|
///
|
|
|
|
|
|
/// The A and B matrices are pre-built outside `iter()` so fixture construction
|
|
|
|
|
|
/// is not timed. Output Vec allocation/deallocation is included because it is
|
|
|
|
|
|
/// an inherent part of the current API's real-world cost.
|
|
|
|
|
|
fn bench_matmul(c: &mut Criterion) {
|
|
|
|
|
|
let mut group = c.benchmark_group("matmul");
|
|
|
|
|
|
|
|
|
|
|
|
for &(m, bpr, n) in CONFIGS {
|
|
|
|
|
|
let k = bpr * QK_K;
|
|
|
|
|
|
let label = format!("{m}x{k}x{n}");
|
|
|
|
|
|
let macs = (m * k * n) as u64;
|
|
|
|
|
|
|
|
|
|
|
|
// Build all four A variants and the shared B matrix for this config.
|
|
|
|
|
|
let a_q4k_u: Vec<BlockQ4K> = uniform_blocks(m * bpr);
|
2026-04-12 21:26:36 -07:00
|
|
|
|
let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();
|
2026-04-12 15:40:19 -07:00
|
|
|
|
|
|
|
|
|
|
let a_q4k_r: Vec<BlockQ4K> = rle_optimal_blocks(m * bpr);
|
2026-04-12 21:26:36 -07:00
|
|
|
|
let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();
|
2026-04-12 15:40:19 -07:00
|
|
|
|
|
|
|
|
|
|
let b = fp16_ones(k, n);
|
|
|
|
|
|
|
|
|
|
|
|
// Set throughput for all four benchmarks at this matrix size.
|
|
|
|
|
|
group.throughput(Throughput::Elements(macs));
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function(format!("baseline/uniform/{label}"), |bench| {
|
|
|
|
|
|
bench.iter(|| matmul_q4k_fp16(
|
|
|
|
|
|
black_box(&a_q4k_u), black_box(&b), m, k, n,
|
|
|
|
|
|
));
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function(format!("rle/uniform/{label}"), |bench| {
|
|
|
|
|
|
bench.iter(|| matmul_q4k_rle_fp16(
|
|
|
|
|
|
black_box(&a_rle_u), black_box(&b), m, k, n,
|
|
|
|
|
|
));
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function(format!("baseline/rle_opt/{label}"), |bench| {
|
|
|
|
|
|
bench.iter(|| matmul_q4k_fp16(
|
|
|
|
|
|
black_box(&a_q4k_r), black_box(&b), m, k, n,
|
|
|
|
|
|
));
|
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
|
|
group.bench_function(format!("rle/rle_opt/{label}"), |bench| {
|
|
|
|
|
|
bench.iter(|| matmul_q4k_rle_fp16(
|
|
|
|
|
|
black_box(&a_rle_r), black_box(&b), m, k, n,
|
|
|
|
|
|
));
|
|
|
|
|
|
});
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
group.finish();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Registration
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
// Bundle the three benchmark functions into a group named `benches`, which is
// the name passed to `criterion_main!` below.
criterion_group!(benches, bench_encode, bench_dequantize, bench_matmul);
|
|
|
|
|
|
// Entry point of the benchmark binary: runs the `benches` group (per the
// Criterion macro documentation, this macro expands to `fn main`).
criterion_main!(benches);
|