This commit is contained in:
2026-04-12 15:30:04 -07:00
parent 16d1f37ae5
commit 5d310b8df5
2 changed files with 994 additions and 0 deletions

View File

@@ -18,6 +18,8 @@
//! dequantise each row of A into f32, convert each element of B from fp16 to
//! f32, accumulate dot-products. No SIMD, no tiling, no tricks.
pub mod rle;
// ---------------------------------------------------------------------------
// Constants matching GGML's ggml-common.h
// ---------------------------------------------------------------------------

992
src/rle.rs Normal file
View File

@@ -0,0 +1,992 @@
//! RLE-optional Q4_K super-block encoding.
//!
//! This module provides [`BlockQ4KRle`], a variant of [`crate::BlockQ4K`] that
//! optionally compresses the 128-byte weight payload using **byte-level
//! run-length encoding** (RLE). A flag bit in the [`BlockQ4KRle::flags`]
//! field indicates which mode is active:
//!
//! | `IS_RLE` bit | `qs` interpretation |
//! |--------------|------------------------------------------------------------|
//! | 0 | Raw packed nibbles, identical to [`crate::BlockQ4K::qs`] |
//! | 1 | RLE stream of `(value, count)` byte-pairs |
//!
//! ## RLE format (when `IS_RLE` = 1)
//!
//! - `flags >> 1` gives the number of `(value, count)` pairs stored in `qs`.
//! - For each pair `i`:
//! - `qs[2*i]` — the byte value (two packed 4-bit weights, same packing
//! as the raw format).
//! - `qs[2*i + 1]` — the run length in bytes (1..=255).
//! - The run lengths must sum to exactly 128 (the uncompressed `qs` size).
//!
//! RLE encoding is chosen only when the compressed representation is
//! **strictly shorter** than the 128-byte raw payload, i.e. when
//! `pairs * 2 < 128`. That caps the useful range at ≤ 63 pairs. The 7-bit
//! `flags >> 1` sub-field can hold up to 127, so this ceiling is never a
//! concern in practice.
//!
//! ## Constructing blocks
//!
//! Use [`encode`] to convert an existing [`crate::BlockQ4K`] into a
//! [`BlockQ4KRle`]. The function automatically selects the better mode.
//!
//! ## Adding this module to your crate
//!
//! Add `pub mod rle;` to `lib.rs`.
use crate::{fp16_to_f32, get_scale_min, BlockQ4K, K_SCALE_SIZE, QK_K};
// ---------------------------------------------------------------------------
// Flag constants
// ---------------------------------------------------------------------------
/// Flag bit in [`BlockQ4KRle::flags`]: if set, `qs` contains an RLE stream.
pub const IS_RLE: u8 = 0x01;
// ---------------------------------------------------------------------------
// Block definition
// ---------------------------------------------------------------------------
/// A Q4_K super-block with optional byte-level RLE compression on the weights.
///
/// Identical to [`crate::BlockQ4K`] except for the additional [`flags`](Self::flags)
/// byte inserted between `scales` and `qs`.
///
/// Memory layout (repr C):
///
/// | Offset | Field      | Size  | Notes                          |
/// |--------|------------|-------|--------------------------------|
/// | 0      | `d`        | 2 B   | fp16 super-block scale         |
/// | 2      | `dmin`     | 2 B   | fp16 super-block min scale     |
/// | 4      | `scales`   | 12 B  | packed 6-bit sub-block params  |
/// | 16     | `flags`    | 1 B   | encoding flags (see below)     |
/// | 17     | `qs`       | 128 B | raw nibbles or RLE stream      |
/// | 145    | (padding)  | 1 B   | implicit trailing alignment pad|
///
/// **sizeof = 146 bytes** (padded to 2-byte alignment imposed by `u16` fields).
///
/// ## `flags` bit layout
///
/// | Bits | Meaning                                                       |
/// |------|---------------------------------------------------------------|
/// | 0    | [`IS_RLE`] — 1 = `qs` is RLE-encoded, 0 = raw packed nibbles  |
/// | 1-7  | When `IS_RLE`=1: number of `(value, count)` pairs in `qs`     |
#[repr(C)]
#[derive(Clone, Copy, Debug)]
pub struct BlockQ4KRle {
    /// Super-block scale for quantised sub-block scales (fp16 bits).
    pub d: u16,
    /// Super-block scale for quantised sub-block mins (fp16 bits).
    pub dmin: u16,
    /// Packed 6-bit sub-block scales and mins (same layout as [`crate::BlockQ4K`]).
    pub scales: [u8; K_SCALE_SIZE],
    /// Encoding flags. Bit 0 = [`IS_RLE`]. Bits 1-7 = RLE pair count when
    /// `IS_RLE` is set.
    pub flags: u8,
    /// Raw packed-nibble weights (`IS_RLE` = 0) or RLE byte stream (`IS_RLE` = 1).
    pub qs: [u8; QK_K / 2],
}
impl BlockQ4KRle {
    /// Whether the weight payload in `qs` is stored as an RLE stream
    /// (i.e. the [`IS_RLE`] bit of `flags` is set).
    #[inline]
    pub fn is_rle(&self) -> bool {
        (self.flags & IS_RLE) == IS_RLE
    }
    /// Number of `(value, count)` byte-pairs stored at the start of `qs`,
    /// read from the upper seven bits of `flags`.
    ///
    /// Only meaningful when [`is_rle`](Self::is_rle) returns `true`.
    #[inline]
    pub fn rle_len(&self) -> usize {
        usize::from(self.flags >> 1)
    }
}
// ---------------------------------------------------------------------------
// Encoding
// ---------------------------------------------------------------------------
/// Encode a [`BlockQ4K`] block into a [`BlockQ4KRle`] block.
///
/// The 128-byte `qs` payload is scanned for maximal runs of identical bytes.
/// If the resulting list of `(value, count)` pairs fits in the 128-byte `qs`
/// field **and is strictly shorter** than the raw payload, it is stored with
/// [`IS_RLE`] set. Otherwise the raw bytes are copied unchanged and the flag
/// stays clear.
///
/// The `d`, `dmin`, and `scales` fields are always copied verbatim.
pub fn encode(block: &BlockQ4K) -> BlockQ4KRle {
    let raw = &block.qs;
    // Greedily collect maximal runs of equal bytes. Run lengths saturate at
    // 255 so a count always fits in one byte (moot for a 128-byte payload,
    // but keeps the scan robust).
    let mut pairs: Vec<(u8, u8)> = Vec::with_capacity(64);
    let mut start = 0usize;
    while start < raw.len() {
        let byte = raw[start];
        let mut len = 1usize;
        while start + len < raw.len()
            && raw[start + len] == byte
            && len < usize::from(u8::MAX)
        {
            len += 1;
        }
        pairs.push((byte, len as u8));
        start += len;
    }
    // RLE is chosen only when strictly smaller than the raw payload. Since
    // each pair costs 2 bytes, pairs.len() * 2 < 128 also bounds the pair
    // count at 63, which fits in the seven high bits of `flags`.
    if pairs.len() * 2 >= raw.len() {
        // No space savings — copy raw bytes and leave IS_RLE clear.
        return BlockQ4KRle {
            d: block.d,
            dmin: block.dmin,
            scales: block.scales,
            flags: 0,
            qs: block.qs,
        };
    }
    let n = pairs.len();
    debug_assert!(n <= 63, "RLE pair count {n} unexpectedly exceeds 63");
    // Serialise the pair list into the fixed-size qs field; unused trailing
    // bytes stay zero.
    let mut qs = [0u8; QK_K / 2];
    for (slot, &(byte, len)) in pairs.iter().enumerate() {
        qs[2 * slot] = byte;
        qs[2 * slot + 1] = len;
    }
    BlockQ4KRle {
        d: block.d,
        dmin: block.dmin,
        scales: block.scales,
        flags: IS_RLE | ((n as u8) << 1),
        qs,
    }
}
// ---------------------------------------------------------------------------
// Decoding helpers
// ---------------------------------------------------------------------------
/// Expand the `qs` field of a [`BlockQ4KRle`] block into the 128-byte raw
/// packed-nibble array, handling both raw and RLE modes transparently.
///
/// # Panics (debug builds only)
///
/// Panics if the decoded RLE stream does not sum to exactly 128 bytes.
fn decode_qs(block: &BlockQ4KRle) -> [u8; QK_K / 2] {
    if block.is_rle() {
        let mut raw = [0u8; QK_K / 2];
        let mut pos = 0usize;
        // Each (value, count) pair occupies two consecutive bytes of qs.
        for pair in block.qs[..2 * block.rle_len()].chunks_exact(2) {
            let byte = pair[0];
            let count = pair[1] as usize;
            raw[pos..pos + count].fill(byte);
            pos += count;
        }
        debug_assert_eq!(
            pos,
            QK_K / 2,
            "RLE run lengths sum to {pos}, expected {}",
            QK_K / 2
        );
        raw
    } else {
        // Raw mode: the payload is already the packed-nibble array.
        block.qs
    }
}
// ---------------------------------------------------------------------------
// Dequantisation
// ---------------------------------------------------------------------------
/// Dequantise one [`BlockQ4KRle`] super-block into [`QK_K`] (256) `f32` values.
///
/// When `IS_RLE` is set the RLE stream is first expanded into a 128-byte raw
/// buffer; thereafter the dequantisation is identical to
/// [`crate::dequantize_block_q4k`]:
///
/// ```text
/// out[i] = d * scale[s] * nibble[i] - dmin * min[s]
/// ```
///
/// where `s` is the sub-block index (0..8) that the element belongs to.
pub fn dequantize_block_q4k_rle(block: &BlockQ4KRle, out: &mut [f32; QK_K]) {
    let d = fp16_to_f32(block.d);
    let dmin = fp16_to_f32(block.dmin);
    let qs = decode_qs(block);
    // Each group consumes 32 payload bytes and produces 64 outputs: the low
    // nibbles feed sub-block 2*g, the high nibbles sub-block 2*g + 1.
    for g in 0..QK_K / 64 {
        let (sc_lo, mn_lo) = get_scale_min(2 * g, &block.scales);
        let (sc_hi, mn_hi) = get_scale_min(2 * g + 1, &block.scales);
        let d_lo = d * sc_lo as f32;
        let m_lo = dmin * mn_lo as f32;
        let d_hi = d * sc_hi as f32;
        let m_hi = dmin * mn_hi as f32;
        let bytes = &qs[32 * g..32 * (g + 1)];
        let base = 64 * g;
        for (l, &byte) in bytes.iter().enumerate() {
            out[base + l] = d_lo * (byte & 0x0F) as f32 - m_lo;
            out[base + 32 + l] = d_hi * (byte >> 4) as f32 - m_hi;
        }
    }
}
// ---------------------------------------------------------------------------
// Matrix multiplication C = A × B
// ---------------------------------------------------------------------------
/// Multiply a Q4_K_RLE matrix **A** by an FP16 matrix **B**, producing an f32
/// matrix **C**.
///
/// Identical semantics to [`crate::matmul_q4k_fp16`] but accepts
/// [`BlockQ4KRle`] blocks. Each block is dequantised on the fly via
/// [`dequantize_block_q4k_rle`], transparently handling mixed raw/RLE blocks
/// within the same matrix.
///
/// # Arguments
///
/// * `a` Row-major slice of [`BlockQ4KRle`]. Row `i` occupies blocks
///   `a[i * blocks_per_row .. (i+1) * blocks_per_row]`.
/// * `b` Row-major fp16 matrix stored as raw `u16` bits, shape \[K, N\].
///   Element `(ki, j)` is at index `ki * n + j`.
/// * `m` Number of rows in A (and C).
/// * `k` Number of columns in A = number of rows in B.
///   **Must** be a multiple of [`QK_K`] (256).
/// * `n` Number of columns in B (and C).
///
/// # Returns
///
/// A flat row-major `Vec<f32>` of shape \[M, N\].
///
/// # Panics
///
/// Panics if `k` is not a multiple of `QK_K`, or if the lengths of `a` or `b`
/// do not match the declared dimensions.
pub fn matmul_q4k_rle_fp16(
    a: &[BlockQ4KRle],
    b: &[u16],
    m: usize,
    k: usize,
    n: usize,
) -> Vec<f32> {
    assert_eq!(
        k % QK_K,
        0,
        "k ({k}) must be a multiple of QK_K ({QK_K})"
    );
    let blocks_per_row = k / QK_K;
    assert_eq!(
        a.len(),
        m * blocks_per_row,
        "A block count mismatch: expected {} blocks, got {}",
        m * blocks_per_row,
        a.len()
    );
    assert_eq!(
        b.len(),
        k * n,
        "B element count mismatch: expected {}, got {}",
        k * n,
        b.len()
    );
    let mut c = vec![0.0f32; m * n];
    // Scratch buffers reused across rows to avoid per-row allocation.
    let mut a_row = vec![0.0f32; k];
    let mut scratch = [0.0f32; QK_K];
    for row in 0..m {
        // Dequantise row `row` of A into a_row (f32), one block at a time.
        let row_blocks = &a[row * blocks_per_row..(row + 1) * blocks_per_row];
        for (dst, blk) in a_row.chunks_exact_mut(QK_K).zip(row_blocks.iter()) {
            dequantize_block_q4k_rle(blk, &mut scratch);
            dst.copy_from_slice(&scratch);
        }
        // Dot-product of the dequantised row with each column of B.
        for col in 0..n {
            let mut acc = 0.0f32;
            for (ki, &av) in a_row.iter().enumerate() {
                acc += av * fp16_to_f32(b[ki * n + col]);
            }
            c[row * n + col] = acc;
        }
    }
    c
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{dequantize_block_q4k, matmul_q4k_fp16, BlockQ4K};
    // -------------------------------------------------------------------------
    // Test helpers
    // -------------------------------------------------------------------------
    /// Convert a normal finite f32 to its IEEE 754 fp16 bit pattern.
    ///
    /// Panics if the value falls outside the representable fp16 normal range.
    ///
    /// Notes: the mantissa is truncated (`mant >> 13`), not round-to-nearest,
    /// which is fine for the exactly-representable values used in these tests.
    /// Also, `-0.0 == 0.0` in IEEE comparison, so -0.0 maps to the +0 pattern.
    fn f32_to_fp16_bits(f: f32) -> u16 {
        if f == 0.0 { return 0x0000; }
        if f == f32::INFINITY { return 0x7C00; }
        if f == f32::NEG_INFINITY { return 0xFC00; }
        if f.is_nan() { return 0x7E00; }
        let bits = f.to_bits();
        let sign = ((bits >> 31) as u16) << 15;
        let exp = (bits >> 23) & 0xFF;
        let mant = bits & 0x007F_FFFF;
        // Re-bias the exponent from f32 (bias 127) to fp16 (bias 15).
        let fp16_exp = exp as i32 - 127 + 15;
        assert!(
            fp16_exp > 0 && fp16_exp < 31,
            "f32 value {f} is outside the representable fp16 normal range"
        );
        sign | ((fp16_exp as u16) << 10) | ((mant >> 13) as u16)
    }
    /// Build a [`BlockQ4K`] where all 8 sub-blocks share the same `scale` and
    /// `min` (both < 16), and every byte in `qs` is `qs_byte`.
    ///
    /// With scale/min < 16 the high bits of `scales[0..8]` are zero, so the
    /// packed entries `scales[8..12]` reduce to `scale | (min << 4)` and
    /// `get_scale_min` recovers the same (scale, min) for every sub-block.
    fn make_block(d: f32, dmin: f32, scale: u8, min: u8, qs_byte: u8) -> BlockQ4K {
        assert!(
            scale < 16 && min < 16,
            "make_block: scale ({scale}) and min ({min}) must both be < 16"
        );
        let mut scales = [0u8; K_SCALE_SIZE];
        for j in 0..4 {
            scales[j] = scale;
            scales[j + 4] = min;
        }
        for j in 8..12 {
            scales[j] = (scale & 0x0F) | ((min & 0x0F) << 4);
        }
        BlockQ4K {
            d: f32_to_fp16_bits(d),
            dmin: f32_to_fp16_bits(dmin),
            scales,
            qs: [qs_byte; QK_K / 2],
        }
    }
    /// Build a [`BlockQ4K`] with a custom `qs` array.
    ///
    /// Same scale/min packing as [`make_block`]; only the weight payload
    /// differs.
    fn make_block_with_qs(
        d: f32,
        dmin: f32,
        scale: u8,
        min: u8,
        qs: [u8; QK_K / 2],
    ) -> BlockQ4K {
        assert!(
            scale < 16 && min < 16,
            "make_block_with_qs: scale ({scale}) and min ({min}) must both be < 16"
        );
        let mut scales = [0u8; K_SCALE_SIZE];
        for j in 0..4 {
            scales[j] = scale;
            scales[j + 4] = min;
        }
        for j in 8..12 {
            scales[j] = (scale & 0x0F) | ((min & 0x0F) << 4);
        }
        BlockQ4K {
            d: f32_to_fp16_bits(d),
            dmin: f32_to_fp16_bits(dmin),
            scales,
            qs,
        }
    }
    /// Assert `got` is within `tol` of `expected`.
    fn assert_close(got: f32, expected: f32, tol: f32) {
        assert!(
            (got - expected).abs() <= tol,
            "got {got}, expected {expected} (tol {tol})"
        );
    }
    /// Assert every element of `got` is within `tol` of `expected_scalar`.
    fn assert_all_close(got: &[f32], expected_scalar: f32, tol: f32) {
        for (i, &g) in got.iter().enumerate() {
            assert!(
                (g - expected_scalar).abs() <= tol,
                "element {i}: got {g}, expected {expected_scalar} (tol {tol})"
            );
        }
    }
    /// Assert two slices are element-wise within `tol` of each other.
    fn assert_slices_close(got: &[f32], expected: &[f32], tol: f32) {
        assert_eq!(got.len(), expected.len(), "slice length mismatch");
        for (i, (&g, &e)) in got.iter().zip(expected.iter()).enumerate() {
            assert!(
                (g - e).abs() <= tol,
                "element {i}: got {g}, expected {e} (tol {tol})"
            );
        }
    }
    /// Build a k×n fp16 matrix (as raw bits) filled with a single value.
    fn fp16_uniform(k: usize, n: usize, value: f32) -> Vec<u16> {
        vec![f32_to_fp16_bits(value); k * n]
    }
    // =========================================================================
    // Struct layout
    // =========================================================================
    #[test]
    fn block_q4k_rle_size_is_146_bytes() {
        // d(2) + dmin(2) + scales(12) + flags(1) + qs(128) = 145 raw bytes,
        // rounded up to 146 by the 2-byte alignment imposed by the u16 fields.
        assert_eq!(core::mem::size_of::<BlockQ4KRle>(), 146);
    }
    #[test]
    fn block_q4k_rle_is_two_bytes_larger_than_block_q4k() {
        // The extra `flags` byte plus one padding byte of 2-byte alignment.
        assert_eq!(
            core::mem::size_of::<BlockQ4KRle>(),
            core::mem::size_of::<BlockQ4K>() + 2,
        );
    }
    // =========================================================================
    // is_rle / rle_len
    // =========================================================================
    #[test]
    fn is_rle_false_when_flag_clear() {
        let b = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE], flags: 0, qs: [0; QK_K / 2],
        };
        assert!(!b.is_rle());
    }
    #[test]
    fn rle_len_zero_when_flag_clear() {
        let b = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE], flags: 0, qs: [0; QK_K / 2],
        };
        assert_eq!(b.rle_len(), 0);
    }
    #[test]
    fn is_rle_true_when_flag_set() {
        let b = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
            flags: IS_RLE | (5u8 << 1),
            qs: [0; QK_K / 2],
        };
        assert!(b.is_rle());
    }
    #[test]
    fn rle_len_reports_pair_count_from_flags() {
        // 63 is the largest pair count `encode` can ever produce.
        for n in [0usize, 1, 7, 31, 63] {
            let b = BlockQ4KRle {
                d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
                flags: IS_RLE | ((n as u8) << 1),
                qs: [0; QK_K / 2],
            };
            assert_eq!(b.rle_len(), n, "expected rle_len {n}");
        }
    }
    // =========================================================================
    // encode: mode selection
    // =========================================================================
    #[test]
    fn encode_uniform_qs_uses_rle() {
        // 128 identical bytes → 1 pair → 2 bytes < 128 raw.
        let src = make_block(1.0, 0.0, 1, 0, 0x77);
        let rle = encode(&src);
        assert!(rle.is_rle(), "uniform qs should trigger RLE mode");
    }
    #[test]
    fn encode_uniform_qs_rle_len_is_one() {
        let src = make_block(1.0, 0.0, 1, 0, 0x55);
        let rle = encode(&src);
        assert_eq!(rle.rle_len(), 1);
    }
    #[test]
    fn encode_uniform_qs_rle_entry_is_correct() {
        let src = make_block(1.0, 0.0, 1, 0, 0xAB);
        let rle = encode(&src);
        assert_eq!(rle.qs[0], 0xAB, "RLE value byte should equal the repeated byte");
        assert_eq!(rle.qs[1], 128, "RLE run length should be 128 bytes");
    }
    #[test]
    fn encode_alternating_bytes_stays_raw() {
        // 128 single-byte runs → 128 pairs → 256 bytes ≥ 128 raw → raw mode.
        let mut qs = [0u8; QK_K / 2];
        for (i, b) in qs.iter_mut().enumerate() {
            *b = if i % 2 == 0 { 0xAA } else { 0x55 };
        }
        let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
        let rle = encode(&src);
        assert!(!rle.is_rle(), "alternating bytes cannot be compressed → raw mode");
    }
    #[test]
    fn encode_raw_mode_copies_qs_verbatim() {
        let mut qs = [0u8; QK_K / 2];
        for (i, b) in qs.iter_mut().enumerate() {
            // Three-byte cycle of distinct values → 128 runs of 1 byte each.
            *b = match i % 3 { 0 => 0x11, 1 => 0x22, _ => 0x33 };
        }
        let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
        let rle = encode(&src);
        assert!(!rle.is_rle());
        assert_eq!(rle.qs, qs, "raw mode must preserve qs bytes unchanged");
    }
    #[test]
    fn encode_two_runs_uses_rle_and_stores_correct_pairs() {
        // Two distinct runs: 64 bytes of 0x11 followed by 64 bytes of 0x22.
        // → 2 pairs = 4 bytes < 128 bytes raw.
        let mut qs = [0u8; QK_K / 2];
        qs[..64].fill(0x11);
        qs[64..].fill(0x22);
        let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
        let rle = encode(&src);
        assert!(rle.is_rle());
        assert_eq!(rle.rle_len(), 2);
        assert_eq!(rle.qs[0], 0x11, "first pair: value");
        assert_eq!(rle.qs[1], 64, "first pair: run length");
        assert_eq!(rle.qs[2], 0x22, "second pair: value");
        assert_eq!(rle.qs[3], 64, "second pair: run length");
    }
    #[test]
    fn encode_63_pairs_uses_rle() {
        // Build 62 runs of 2 bytes each (124 bytes) + 1 run of 4 bytes = 128 bytes.
        // 63 pairs × 2 = 126 bytes < 128 → RLE should be chosen.
        let mut qs = [0u8; QK_K / 2];
        let mut pos = 0usize;
        for run in 0..62usize {
            // Use a stride-3 sequence so consecutive values are always distinct.
            let v = (run as u8).wrapping_mul(3).wrapping_add(1);
            qs[pos] = v;
            qs[pos + 1] = v;
            pos += 2;
        }
        // Final run: 4 bytes, value chosen to differ from the previous one.
        qs[pos..].fill(0xFE);
        let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
        let rle = encode(&src);
        assert!(rle.is_rle(), "63 pairs should use RLE");
        assert_eq!(rle.rle_len(), 63);
    }
    #[test]
    fn encode_64_pairs_stays_raw() {
        // 64 runs of 2 bytes each = 128 bytes total.
        // 64 pairs × 2 = 128 bytes, which is NOT strictly less than 128 → raw.
        let mut qs = [0u8; QK_K / 2];
        let mut pos = 0usize;
        for run in 0..64usize {
            let v = (run as u8).wrapping_mul(3).wrapping_add(1);
            qs[pos] = v;
            qs[pos + 1] = v;
            pos += 2;
        }
        let src = make_block_with_qs(1.0, 0.0, 1, 0, qs);
        let rle = encode(&src);
        assert!(!rle.is_rle(), "64 pairs offers no saving → raw mode");
    }
    #[test]
    fn encode_preserves_d_dmin_scales() {
        let src = make_block(2.0, 0.5, 3, 2, 0x00);
        let rle = encode(&src);
        assert_eq!(rle.d, src.d);
        assert_eq!(rle.dmin, src.dmin);
        assert_eq!(rle.scales, src.scales);
    }
    // =========================================================================
    // decode_qs (tested indirectly through dequantise, but also directly)
    // =========================================================================
    #[test]
    fn decode_qs_raw_mode_returns_qs_unchanged() {
        // Build a raw BlockQ4KRle (flags = 0) with a non-trivial qs pattern.
        let mut qs = [0u8; QK_K / 2];
        for (i, b) in qs.iter_mut().enumerate() { *b = i as u8; }
        let rle = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE], flags: 0, qs,
        };
        assert_eq!(decode_qs(&rle), qs);
    }
    #[test]
    fn decode_qs_rle_expands_two_pair_stream() {
        // Hand-craft an RLE block: [0xAA × 64, 0xBB × 64].
        let mut qs = [0u8; QK_K / 2];
        qs[0] = 0xAA; qs[1] = 64;
        qs[2] = 0xBB; qs[3] = 64;
        let rle = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
            flags: IS_RLE | (2u8 << 1),
            qs,
        };
        let expanded = decode_qs(&rle);
        assert!(expanded[..64].iter().all(|&b| b == 0xAA), "first 64 bytes must be 0xAA");
        assert!(expanded[64..].iter().all(|&b| b == 0xBB), "last 64 bytes must be 0xBB");
    }
    #[test]
    fn decode_qs_rle_single_run_covers_all() {
        let mut qs = [0u8; QK_K / 2];
        qs[0] = 0xCD; qs[1] = 128; // one run of 128 bytes
        let rle = BlockQ4KRle {
            d: 0, dmin: 0, scales: [0; K_SCALE_SIZE],
            flags: IS_RLE | (1u8 << 1),
            qs,
        };
        let expanded = decode_qs(&rle);
        assert!(expanded.iter().all(|&b| b == 0xCD));
    }
    // =========================================================================
    // dequantize_block_q4k_rle
    // =========================================================================
    #[test]
    fn dequant_rle_zero_d_all_outputs_zero() {
        // d = dmin = 0 zeroes both the scale and min terms for every element.
        let src = make_block(0.0, 0.0, 1, 0, 0x77);
        let rle = encode(&src);
        let mut out = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut out);
        assert_all_close(&out, 0.0, 0.0);
    }
    #[test]
    fn dequant_rle_uniform_nibble_one_scale_one() {
        // qs_byte = 0x11 → both nibbles = 1; scale = 1, d = 1.0, min = 0.
        // expected: 1.0 * 1 * 1 - 0.0 = 1.0
        let src = make_block(1.0, 0.0, 1, 0, 0x11);
        let rle = encode(&src);
        let mut out = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut out);
        assert_all_close(&out, 1.0, 1e-5);
    }
    #[test]
    fn dequant_rle_non_zero_min_subtracts() {
        // nibble = 0, scale = 1, d = 1.0, min = 2, dmin = 1.0
        // expected: 1.0 * 1 * 0 - 1.0 * 2 = -2.0
        let src = make_block(1.0, 1.0, 1, 2, 0x00);
        let rle = encode(&src);
        let mut out = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut out);
        assert_all_close(&out, -2.0, 1e-5);
    }
    #[test]
    fn dequant_rle_max_nibble_15() {
        // qs_byte = 0xFF → both nibbles = 15; scale = 1, d = 1.0, min = 0.
        // expected: 1.0 * 1 * 15 - 0.0 = 15.0
        let src = make_block(1.0, 0.0, 1, 0, 0xFF);
        let rle = encode(&src);
        let mut out = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut out);
        assert_all_close(&out, 15.0, 1e-5);
    }
    #[test]
    fn dequant_rle_output_count_is_qk_k() {
        let src = make_block(1.0, 0.0, 1, 0, 0x00);
        let rle = encode(&src);
        let mut out = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut out);
        assert_eq!(out.len(), QK_K);
    }
    #[test]
    fn dequant_rle_larger_scale_multiplies() {
        // nibble = 3, scale = 4, d = 2.0, min = 0
        // expected: 2.0 * 4 * 3 - 0.0 = 24.0
        // qs_byte = 0x33 → both nibbles = 3
        let src = make_block(2.0, 0.0, 4, 0, 0x33);
        let rle = encode(&src);
        let mut out = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut out);
        assert_all_close(&out, 24.0, 1e-4);
    }
    // =========================================================================
    // Roundtrip: encode → dequantize must match original dequantize
    // =========================================================================
    #[test]
    fn roundtrip_rle_mode_matches_original() {
        // Uniform qs → RLE mode selected.
        let src = make_block(2.0, 0.5, 3, 1, 0x37);
        let rle = encode(&src);
        assert!(rle.is_rle());
        let mut got = [0.0f32; QK_K];
        let mut expected = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut got);
        dequantize_block_q4k(&src, &mut expected);
        assert_slices_close(&got, &expected, 1e-5);
    }
    #[test]
    fn roundtrip_raw_mode_matches_original() {
        // Alternating bytes → raw mode selected; output must still be correct.
        let mut qs = [0u8; QK_K / 2];
        for (i, b) in qs.iter_mut().enumerate() {
            *b = if i % 2 == 0 { 0x13 } else { 0x24 };
        }
        let src = make_block_with_qs(1.5, 0.25, 2, 1, qs);
        let rle = encode(&src);
        assert!(!rle.is_rle());
        let mut got = [0.0f32; QK_K];
        let mut expected = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut got);
        dequantize_block_q4k(&src, &mut expected);
        assert_slices_close(&got, &expected, 1e-5);
    }
    #[test]
    fn roundtrip_two_run_block_matches_original() {
        let mut qs = [0u8; QK_K / 2];
        qs[..64].fill(0x59);
        qs[64..].fill(0x8C);
        let src = make_block_with_qs(3.0, 1.0, 5, 2, qs);
        let rle = encode(&src);
        assert!(rle.is_rle());
        let mut got = [0.0f32; QK_K];
        let mut expected = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut got);
        dequantize_block_q4k(&src, &mut expected);
        assert_slices_close(&got, &expected, 1e-5);
    }
    #[test]
    fn roundtrip_many_short_runs_matches_original() {
        // Four distinct runs of varying lengths → still compresses.
        let mut qs = [0u8; QK_K / 2];
        qs[..10].fill(0x11);
        qs[10..30].fill(0x22);
        qs[30..31].fill(0x33);
        qs[31..].fill(0x44);
        let src = make_block_with_qs(1.0, 0.5, 7, 3, qs);
        let rle = encode(&src);
        assert!(rle.is_rle(), "4-run block should compress");
        assert_eq!(rle.rle_len(), 4);
        let mut got = [0.0f32; QK_K];
        let mut expected = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut got);
        dequantize_block_q4k(&src, &mut expected);
        assert_slices_close(&got, &expected, 1e-5);
    }
    #[test]
    fn roundtrip_zero_qs_matches_original() {
        let src = make_block(1.0, 0.5, 2, 1, 0x00);
        let rle = encode(&src);
        let mut got = [0.0f32; QK_K];
        let mut expected = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut got);
        dequantize_block_q4k(&src, &mut expected);
        assert_slices_close(&got, &expected, 1e-5);
    }
    #[test]
    fn roundtrip_nibble_split_low_high_correct() {
        // qs_byte = 0x37: low nibble = 7 (sub-block 0 path), high nibble = 3
        // (sub-block 1 path). Verify both halves are dequantised correctly.
        let src = make_block(1.0, 0.0, 1, 0, 0x37);
        let rle = encode(&src);
        let mut got = [0.0f32; QK_K];
        let mut expected = [0.0f32; QK_K];
        dequantize_block_q4k_rle(&rle, &mut got);
        dequantize_block_q4k(&src, &mut expected);
        assert_slices_close(&got, &expected, 1e-5);
        // First 32 elements of each 64-element group → low nibble = 7.
        assert_close(got[0], 7.0, 1e-5);
        // Next 32 elements → high nibble = 3.
        assert_close(got[32], 3.0, 1e-5);
    }
    // =========================================================================
    // matmul_q4k_rle_fp16
    // =========================================================================
    #[test]
    fn matmul_rle_1x256_times_256x1_all_ones() {
        // A: 1×256, all weights = nibble 1, scale = 1, d = 1.0
        // B: 256×1, all fp16 1.0
        // C = dot([1.0; 256], [1.0; 256]) = 256.0
        let src = make_block(1.0, 0.0, 1, 0, 0x11);
        let a = vec![encode(&src)];
        let b = fp16_uniform(QK_K, 1, 1.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 1, QK_K, 1);
        assert_eq!(c.len(), 1);
        assert_close(c[0], 256.0, 1e-3);
    }
    #[test]
    fn matmul_rle_2x256_times_256x3_all_ones() {
        let src = make_block(1.0, 0.0, 1, 0, 0x11);
        let a = vec![encode(&src), encode(&src)];
        let b = fp16_uniform(QK_K, 3, 1.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 2, QK_K, 3);
        assert_eq!(c.len(), 6);
        assert_all_close(&c, 256.0, 1e-3);
    }
    #[test]
    fn matmul_rle_zero_a_gives_zero_c() {
        let src = make_block(0.0, 0.0, 1, 0, 0xFF);
        let a = vec![encode(&src)];
        let b = fp16_uniform(QK_K, 4, 1.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 1, QK_K, 4);
        assert_all_close(&c, 0.0, 0.0);
    }
    #[test]
    fn matmul_rle_zero_b_gives_zero_c() {
        let src = make_block(1.0, 0.0, 1, 0, 0x11);
        let a = vec![encode(&src)];
        let b = fp16_uniform(QK_K, 2, 0.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 1, QK_K, 2);
        assert_all_close(&c, 0.0, 0.0);
    }
    #[test]
    fn matmul_rle_two_blocks_per_row() {
        // A: 1×512, two blocks, all nibble-1 weights; B: 512×1, all 1.0.
        // Expected: 512.0
        let src = make_block(1.0, 0.0, 1, 0, 0x11);
        let a = vec![encode(&src), encode(&src)];
        let b = fp16_uniform(2 * QK_K, 1, 1.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 1, 2 * QK_K, 1);
        assert_eq!(c.len(), 1);
        assert_close(c[0], 512.0, 1e-3);
    }
    #[test]
    fn matmul_rle_output_shape_m_times_n() {
        // A: 3×512 (6 blocks), B: 512×4 → C: 3×4 = 12 elements.
        let src = make_block(1.0, 0.0, 1, 0, 0x00);
        let a: Vec<BlockQ4KRle> = (0..6).map(|_| encode(&src)).collect();
        let b = fp16_uniform(2 * QK_K, 4, 0.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 3, 2 * QK_K, 4);
        assert_eq!(c.len(), 12);
    }
    #[test]
    fn matmul_rle_scalar_b_scales_output() {
        // Multiplying B by a scalar should scale C by the same factor.
        let src = make_block(1.0, 0.0, 1, 0, 0x22); // nibble 2 → weight 2.0
        let a = vec![encode(&src)];
        let b1 = fp16_uniform(QK_K, 1, 1.0);
        let b2 = fp16_uniform(QK_K, 1, 3.0);
        let c1 = matmul_q4k_rle_fp16(&a, &b1, 1, QK_K, 1);
        let c2 = matmul_q4k_rle_fp16(&a, &b2, 1, QK_K, 1);
        assert_close(c2[0], c1[0] * 3.0, 1e-2);
    }
    #[test]
    fn matmul_rle_matches_original_matmul_mixed_blocks() {
        // Mix: first block uniform (RLE), second block alternating (raw).
        // Both matmul implementations should produce identical results.
        let src_rle = make_block(2.0, 0.5, 3, 1, 0x37);
        let mut qs_raw = [0u8; QK_K / 2];
        for (i, b) in qs_raw.iter_mut().enumerate() {
            *b = if i % 2 == 0 { 0x13 } else { 0x24 };
        }
        let src_raw = make_block_with_qs(1.5, 0.25, 2, 1, qs_raw);
        let a_orig: Vec<BlockQ4K> = vec![src_rle, src_raw];
        let a_rle: Vec<BlockQ4KRle> = a_orig.iter().map(encode).collect();
        // A: 1×512, B: 512×2
        let b = fp16_uniform(2 * QK_K, 2, 1.0);
        let c_orig = matmul_q4k_fp16(&a_orig, &b, 1, 2 * QK_K, 2);
        let c_rle = matmul_q4k_rle_fp16(&a_rle, &b, 1, 2 * QK_K, 2);
        assert_slices_close(&c_rle, &c_orig, 1e-4);
    }
    #[test]
    fn matmul_rle_multiple_rows_multiple_blocks_per_row() {
        // A: 2×512 (4 blocks), B: 512×3, all weights 1 in A, all 1.0 in B.
        // Each row dot product = 512.0; C should be all 512.0.
        let src = make_block(1.0, 0.0, 1, 0, 0x11);
        let a: Vec<BlockQ4KRle> = (0..4).map(|_| encode(&src)).collect();
        let b = fp16_uniform(2 * QK_K, 3, 1.0);
        let c = matmul_q4k_rle_fp16(&a, &b, 2, 2 * QK_K, 3);
        assert_eq!(c.len(), 6);
        assert_all_close(&c, 512.0, 1e-3);
    }
    // =========================================================================
    // Panic / contract checks
    // =========================================================================
    #[test]
    fn matmul_rle_panics_when_k_not_multiple_of_qkk() {
        let src = make_block(1.0, 0.0, 1, 0, 0x00);
        let a = vec![encode(&src)];
        let b = vec![0u16; 512];
        let result = std::panic::catch_unwind(move || {
            matmul_q4k_rle_fp16(&a, &b, 1, 512, 2);
        });
        assert!(result.is_err(), "should panic when k is not a multiple of QK_K");
    }
    #[test]
    fn matmul_rle_panics_on_wrong_a_length() {
        let src = make_block(1.0, 0.0, 1, 0, 0x00);
        // m=2, k=QK_K requires 2 blocks; only 1 is provided.
        let a = vec![encode(&src)];
        let b = fp16_uniform(QK_K, 1, 1.0);
        let result = std::panic::catch_unwind(move || {
            matmul_q4k_rle_fp16(&a, &b, 2, QK_K, 1);
        });
        assert!(result.is_err(), "should panic on wrong A block count");
    }
    #[test]
    fn matmul_rle_panics_on_wrong_b_length() {
        let src = make_block(1.0, 0.0, 1, 0, 0x00);
        let a = vec![encode(&src)];
        // B is too short for k=QK_K, n=3.
        let b = vec![0u16; 10];
        let result = std::panic::catch_unwind(move || {
            matmul_q4k_rle_fp16(&a, &b, 1, QK_K, 3);
        });
        assert!(result.is_err(), "should panic on wrong B element count");
    }
}