RLE now works on nibbles

Allow variable coverage
Try sorting
2026-04-12 21:26:36 -07:00 · 2026-04-12 20:51:19 -07:00 · 2026-04-12 19:51:59 -07:00
4 changed files with 533 additions and 344 deletions
--- a/benches/matmul.rs
+++ b/benches/matmul.rs
@@ -1,26 +1,30 @@
 //! # Benchmark: BlockQ4K vs BlockQ4KRle
 //!
-//! Measures three operations across two weight distributions:
+//! Measures three operations across three weight distributions, encoded with
+//! `min_coverage = 0.01` (blocks need ≥ 1 % of their 256 nibbles in repeated
+//! runs to use RLE mode).
 //!
 //! | Group        | What is timed                                       |
-//! |--------------|--------------------------------------------------|
+//! |--------------|-----------------------------------------------------|
 //! | `encode`     | BlockQ4K → BlockQ4KRle for a batch of 512 blocks    |
-//! | `dequantize` | Single-block dequantisation for all three paths  |
+//! | `dequantize` | Single-block dequantisation across all four paths   |
 //! | `matmul`     | Full A×B multiply at three matrix sizes             |
 //!
 //! ## Weight distributions
 //!
-//! **uniform** — each qs byte is drawn from a pseudo-random sequence (LCG).
+//! **uniform** — each qs byte is drawn from a pseudo-random LCG sequence.
-//! Consecutive bytes almost never repeat, so each block produces ~128
+//! Adjacent nibbles match with probability 1/16, giving ~12 % nibble coverage.
-//! single-byte runs.  At 2 bytes per pair that would require ~256 bytes,
+//! At `min_coverage = 0.01` these blocks encode to **RLE mode** (IS_RLE = 1)
-//! which exceeds the 128-byte raw payload, so `encode` always keeps these
+//! with ~230–240 nibble entries — a realistic proxy for trained Q4_K weights.
-//! blocks in **raw mode** (IS_RLE = 0).  This is representative of typical
-//! unstructured LLM weight matrices.
 //!
-//! **rle_optimal** — every byte in a block's qs field is the same value.
+//! **rle_optimal** — every qs byte is the same value.  All 256 nibbles are
-//! `encode` stores a single (value, count) pair — 2 bytes instead of 128 —
+//! identical, giving 100 % coverage and just 16 nibble entries.  This is the
-//! and sets IS_RLE = 1.  This is the theoretical compression maximum, and
+//! theoretical RLE maximum and represents highly structured weight blocks.
-//! is representative of highly sparse or dead-neuron weight matrices.
+//!
+//! **zero_coverage** — nibbles cycle deterministically so no two consecutive
+//! nibbles (in output-sequential order) are ever equal.  Coverage = 0 %;
+//! `encode` keeps these blocks in **raw mode** (IS_RLE = 0) at any positive
+//! threshold.  Used only in the `dequantize` group to benchmark the raw path.
 use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 use matrix_testing::{
@@ -83,12 +87,11 @@ fn make_scales(scale: u8, min: u8) -> [u8; K_SCALE_SIZE] {
    s
 }
-/// Return `count` blocks whose qs bytes are pseudo-random.
+/// Return `count` blocks whose qs bytes are pseudo-random (LCG).
 ///
-/// With uniformly distributed bytes, consecutive bytes match with probability
+/// Adjacent nibbles match with probability 1/16, giving each block roughly
-/// 1/256 ≈ 0.4%, yielding ~128 runs per block.  Storing those as (value,
+/// 12 % nibble coverage.  At `min_coverage = 0.01` these blocks encode to
-/// count) pairs would need ~256 bytes — more than the 128-byte raw payload —
+/// **RLE mode** (IS_RLE = 1) with ~230–240 nibble entries per block.
-/// so `encode` will always select **raw mode** (IS_RLE = 0).
 fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
    let mut rng = Lcg::new(0xDEAD_BEEF_CAFE_1234);
    let scales  = make_scales(7, 2);
@@ -107,10 +110,9 @@ fn uniform_blocks(count: usize) -> Vec<BlockQ4K> {
 /// Return `count` blocks where every qs byte is the same value.
 ///
-/// A uniform byte array collapses to one (value, count) RLE pair: 2 bytes
+/// All 256 nibbles are identical → 100 % nibble coverage → always **RLE mode**
-/// instead of 128.  `encode` will always select **RLE mode** (IS_RLE = 1).
+/// with exactly 16 entries (256 nibbles / 16 per entry).
-/// Each block uses a fresh pseudo-random byte so no two blocks are identical,
+/// Each block uses a fresh pseudo-random byte to avoid cache-warm artifacts.
-/// avoiding degenerate cache-warm effects across the batch.
 fn rle_optimal_blocks(count: usize) -> Vec<BlockQ4K> {
    let mut rng = Lcg::new(0x1234_5678_9ABC_DEF0);
    let scales  = make_scales(7, 2);
@@ -129,6 +131,28 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
    vec![f32_to_fp16(1.0); k * n]
 }
 /// Construct a single fixture block intended to have 0 % nibble coverage,
 /// i.e. no nibble equals its predecessor in output-sequential order.
 ///
 /// Byte `i` of `qs` packs `i % 16` into its low nibble and `(i + 8) % 16`
 /// into its high nibble.  Both nibble streams therefore step through all 16
 /// values twice per 32-byte group without an immediate repeat, and the value
 /// that ends one stream differs from the value that starts the next.
 ///
 /// The block is meant to stay in **raw mode** when passed to `encode` with
 /// any `min_coverage > 0.0`.
 fn zero_coverage_block() -> BlockQ4K {
    // Packed payload: the high nibble is the low-nibble counter offset by 8.
    let qs: [u8; QK_K / 2] = std::array::from_fn(|idx| {
        let low  = (idx % 16) as u8;
        let high = ((idx + 8) % 16) as u8;
        (high << 4) | low
    });
    BlockQ4K {
        d: f32_to_fp16(0.01),
        dmin: f32_to_fp16(0.001),
        scales: make_scales(7, 2),
        qs,
    }
 }
 // ---------------------------------------------------------------------------
 // Group 1 — encode
 // ---------------------------------------------------------------------------
@@ -136,12 +160,11 @@ fn fp16_ones(k: usize, n: usize) -> Vec<u16> {
 /// Batch size for the `encode` benchmark group: how many blocks
 /// `bench_encode` builds per fixture and encodes in each timed iteration.
 const ENCODE_BATCH: usize = 512;
-/// Measures the cost of scanning qs bytes and writing the BlockQ4KRle output.
+/// Measures the cost of scanning nibbles and writing the `BlockQ4KRle` output.
 ///
-/// Both distributions perform the same O(128) run-length scan.  The only
+/// Both distributions perform the same O(256) nibble scan.  The output differs:
-/// divergence is at the output stage:
+/// * **uniform**    — ~12 % coverage → RLE mode, ~230–240 entries written.
-/// * **uniform**    — run count > 63 → fall through to memcpy of 128 bytes.
+/// * **rle_optimal** — 100 % coverage → RLE mode, exactly 16 entries written.
-/// * **rle_optimal** — run count = 1  → write 2 bytes and set IS_RLE.
 fn bench_encode(c: &mut Criterion) {
    let uniform  = uniform_blocks(ENCODE_BATCH);
    let rle_opt  = rle_optimal_blocks(ENCODE_BATCH);
@@ -153,7 +176,7 @@ fn bench_encode(c: &mut Criterion) {
    group.bench_function("uniform", |b| {
        b.iter(|| {
            for blk in &uniform {
-                black_box(encode(black_box(blk)));
+                black_box(encode(black_box(blk), 0.01));
            }
        });
    });
@@ -161,7 +184,7 @@ fn bench_encode(c: &mut Criterion) {
    group.bench_function("rle_optimal", |b| {
        b.iter(|| {
            for blk in &rle_opt {
-                black_box(encode(black_box(blk)));
+                black_box(encode(black_box(blk), 0.01));
            }
        });
    });
@@ -173,25 +196,35 @@ fn bench_encode(c: &mut Criterion) {
 // Group 2 — dequantize (single block)
 // ---------------------------------------------------------------------------
-/// Compares the three single-block dequantisation code paths.
+/// Compares four single-block dequantisation code paths.
 ///
-/// | Variant          | Block type  | Encoding | Extra work vs baseline        |
+/// | Variant            | Block type  | Encoding  | IS_RLE | Entries |
-/// |------------------|-------------|----------|-------------------------------|
+/// |--------------------|-------------|-----------|--------|---------|
-/// | `q4k_baseline`   | BlockQ4K    | —        | none                          |
+/// | `q4k_baseline`     | BlockQ4K    | —         | —      | —       |
-/// | `rle_raw_mode`   | BlockQ4KRle | IS_RLE=0 | one branch (`is_rle()` check) |
+/// | `rle_raw_mode`     | BlockQ4KRle | raw       | 0      | —       |
-/// | `rle_rle_mode`   | BlockQ4KRle | IS_RLE=1 | RLE expansion into 128-B buf  |
+/// | `rle_sparse`       | BlockQ4KRle | RLE       | 1      | ~235    |
 /// | `rle_dense`        | BlockQ4KRle | RLE       | 1      | 16      |
 ///
 /// `rle_raw_mode` uses the zero-coverage fixture (0 % nibble coverage), which
 /// stays in raw mode at any positive threshold.
 /// `rle_sparse` uses the LCG uniform fixture (~12 % coverage, ~235 entries),
 /// representative of actual trained Q4_K weight blocks.
 /// `rle_dense` uses the rle_optimal fixture (100 % coverage, 16 entries).
 ///
 /// Throughput is the number of dequantised weights produced per second.
 fn bench_dequantize(c: &mut Criterion) {
    let q4k_baseline_block = uniform_blocks(1).into_iter().next().unwrap();
    let q4k_zero_cov       = zero_coverage_block();
    let q4k_uniform        = uniform_blocks(1).into_iter().next().unwrap();
    let q4k_rle_opt        = rle_optimal_blocks(1).into_iter().next().unwrap();
-    let rle_raw = encode(&q4k_uniform); // IS_RLE = 0
+    let rle_raw    = encode(&q4k_zero_cov, 0.01); // IS_RLE = 0  (0 % coverage)
-    let rle_rle = encode(&q4k_rle_opt); // IS_RLE = 1
+    let rle_sparse = encode(&q4k_uniform,  0.01); // IS_RLE = 1  (~12 % coverage)
    let rle_dense  = encode(&q4k_rle_opt,  0.01); // IS_RLE = 1  (100 % coverage)
-    // Confirm the fixtures ended up in the right encoding modes.
+    assert!(!rle_raw.is_rle(),    "zero-coverage block must be raw mode");
-    assert!(!rle_raw.is_rle(), "uniform block should encode to raw mode");
+    assert!(rle_sparse.is_rle(),  "uniform block must be RLE at 0.01 threshold");
-    assert!(rle_rle.is_rle(),  "rle-optimal block should encode to rle mode");
+    assert!(rle_dense.is_rle(),   "rle-optimal block must be RLE mode");
    let mut group = c.benchmark_group("dequantize");
    // Throughput = QK_K (256) weights dequantised per second.
@@ -200,7 +233,7 @@ fn bench_dequantize(c: &mut Criterion) {
    group.bench_function("q4k_baseline", |b| {
        b.iter(|| {
            let mut out = [0.0f32; QK_K];
-            dequantize_block_q4k(black_box(&q4k_uniform), &mut out);
+            dequantize_block_q4k(black_box(&q4k_baseline_block), &mut out);
            black_box(out)
        });
    });
@@ -213,10 +246,18 @@ fn bench_dequantize(c: &mut Criterion) {
        });
    });
-    group.bench_function("rle_rle_mode", |b| {
+    group.bench_function("rle_sparse", |b| {
        b.iter(|| {
            let mut out = [0.0f32; QK_K];
-            dequantize_block_q4k_rle(black_box(&rle_rle), &mut out);
+            dequantize_block_q4k_rle(black_box(&rle_sparse), &mut out);
            black_box(out)
        });
    });
    group.bench_function("rle_dense", |b| {
        b.iter(|| {
            let mut out = [0.0f32; QK_K];
            dequantize_block_q4k_rle(black_box(&rle_dense), &mut out);
            black_box(out)
        });
    });
@@ -245,14 +286,14 @@ const CONFIGS: &[(usize, usize, usize)] = &[
 /// Full matrix-multiply benchmark across weight distributions and matrix sizes.
 ///
-/// Four variants per size:
+/// Four variants per size (`min_coverage = 0.01`):
 ///
-/// | Label                | A type      | RLE mode? |
+/// | Label                | A type      | IS_RLE | Entries/block |
-/// |----------------------|-------------|-----------|
+/// |----------------------|-------------|--------|---------------|
-/// | `baseline/uniform`   | BlockQ4K    | —         |
+/// | `baseline/uniform`   | BlockQ4K    | —      | —             |
-/// | `rle/uniform`        | BlockQ4KRle | raw       |
+/// | `rle/uniform`        | BlockQ4KRle | 1      | ~235          |
-/// | `baseline/rle_opt`   | BlockQ4K    | —         |
+/// | `baseline/rle_opt`   | BlockQ4K    | —      | —             |
-/// | `rle/rle_opt`        | BlockQ4KRle | rle       |
+/// | `rle/rle_opt`        | BlockQ4KRle | 1      | 16            |
 ///
 /// Throughput is reported as multiply-accumulate operations (M × K × N) per
 /// second, allowing fair cross-size comparison.
@@ -270,10 +311,10 @@ fn bench_matmul(c: &mut Criterion) {
        // Build all four A variants and the shared B matrix for this config.
        let a_q4k_u: Vec<BlockQ4K>    = uniform_blocks(m * bpr);
-        let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(encode).collect();
+        let a_rle_u: Vec<BlockQ4KRle> = a_q4k_u.iter().map(|b| encode(b, 0.01)).collect();
        let a_q4k_r: Vec<BlockQ4K>    = rle_optimal_blocks(m * bpr);
-        let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(encode).collect();
+        let a_rle_r: Vec<BlockQ4KRle> = a_q4k_r.iter().map(|b| encode(b, 0.01)).collect();
        let b = fp16_ones(k, n);
--- a/src/bin/gguf_matmul.rs
+++ b/src/bin/gguf_matmul.rs
@@ -145,7 +145,7 @@ fn main() -> Result<(), Box<dyn Error>> {
    // ── RLE encode (best of `trials`) ────────────────────────────────────────
    let (rle_blocks, t_enc) = bench(trials, || -> Vec<BlockQ4KRle> {
-        blocks.iter().map(encode).collect()
+        blocks.iter().map(|b| encode(b, 0.0)).collect()
    });
    let n_rle     = rle_blocks.iter().filter(|b| b.is_rle()).count();
--- a/src/bin/gguf_scan.rs
+++ b/src/bin/gguf_scan.rs
@@ -101,11 +101,32 @@ fn fixed(s: &str, width: usize) -> String {
 fn main() -> Result<(), Box<dyn Error>> {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
-        eprintln!("usage: {} <model.gguf>", args[0]);
+        eprintln!("usage: {} <model.gguf> [--threshold <0.0..1.0>]", args[0]);
        eprintln!();
        eprintln!("  --threshold  Minimum fraction of qs bytes that must be in runs of");
        eprintln!("               length ≥ 2 for a block to use RLE mode.  Default: 0.0");
        eprintln!("               (use RLE whenever the pair count fits in 64 pairs).");
        std::process::exit(1);
    }
    let path = &args[1];
    // Parse optional --threshold flag from the remaining arguments.
    let mut threshold = 0.0f32;
    let mut idx = 2usize;
    while idx < args.len() {
        if args[idx] == "--threshold" {
            idx += 1;
            threshold = args.get(idx)
                .and_then(|s| s.parse::<f32>().ok())
                .filter(|&v| (0.0..=1.0).contains(&v))
                .unwrap_or_else(|| {
                    eprintln!("error: --threshold requires a value in [0.0, 1.0]");
                    std::process::exit(1);
                });
        }
        idx += 1;
    }
    // ── Parse header ─────────────────────────────────────────────────────────
    eprintln!("Parsing {path} …");
    let (tensors, data_start) = parse_header(path)?;
@@ -122,6 +143,8 @@ fn main() -> Result<(), Box<dyn Error>> {
        q4k_tensors.len(),
        other_count,
    );
    eprintln!("  RLE threshold: {threshold:.2} (blocks need ≥ {:.0}% of bytes in runs)",
        threshold * 100.0);
    eprintln!();
    // ── Header row ───────────────────────────────────────────────────────────
@@ -145,7 +168,7 @@ fn main() -> Result<(), Box<dyn Error>> {
        let mut stats = TensorStats::new();
        for_each_block(&mut file, data_start, tensor, |block| {
-            let rle_block = encode(block);
+            let rle_block = encode(block, threshold);
            stats.observe(rle_block.is_rle(), rle_block.rle_len());
        })?;
@@ -187,10 +210,15 @@ fn main() -> Result<(), Box<dyn Error>> {
    if !any_rle {
        println!();
-        println!("No blocks compressed with RLE — all weights are effectively random at");
+        println!("No blocks used RLE at threshold {threshold:.2}.");
-        println!("the byte level, which is typical for trained Q4_K quantised weights.");
+        if threshold < 0.01 {
-        println!("RLE compression only helps for structured weight matrices (binary,");
+            println!("All weights are effectively random at the byte level — typical for");
-        println!("ternary, heavily pruned, or synthetic).");
+            println!("trained Q4_K weights.  RLE only helps for structured weight matrices");
            println!("(binary, ternary, heavily pruned, or synthetic).");
        } else {
            println!("Try a lower --threshold (e.g. --threshold 0.0) to see whether any");
            println!("blocks have enough run structure to qualify at a looser threshold.");
        }
    }
    Ok(())
--- a/src/rle.rs
+++ b/src/rle.rs
Author	SHA1	Message	Date
charles	bba9db290e	RLE now works on nibbles	2026-04-12 21:26:36 -07:00
charles	3fb10b78e3	Allow variable coverage	2026-04-12 20:51:19 -07:00
charles	59b5eade7e	Try sorting	2026-04-12 19:51:59 -07:00