diff --git a/src/bin/gguf_matmul.rs b/src/bin/gguf_matmul.rs
index e2da916..0b4ef81 100644
--- a/src/bin/gguf_matmul.rs
+++ b/src/bin/gguf_matmul.rs
@@ -4,274 +4,40 @@
 //! # Usage
 //!
 //! ```text
-//! cargo run --release --bin gguf_matmul -- <model.gguf> [n]
+//! cargo run --release --bin gguf_matmul -- <model.gguf> [n] [trials]
 //! ```
 //!
 //! `n` is the number of activation columns (token count / batch size).
 //! Defaults to 1 (single-token inference).
-//!
-//! # GGUF layout (v2 / v3)
-//!
-//! ```text
-//! ┌─────────────────────────────────────────────────────┐
-//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv u64  │
-//! ├─────────────────────────────────────────────────────┤
-//! │ metadata key-value pairs  (variable length)         │
-//! ├─────────────────────────────────────────────────────┤
-//! │ tensor info records       (variable length)         │
-//! ├─────────────────────────────────────────────────────┤
-//! │ padding to `alignment` boundary (default 32 bytes)  │
-//! ├─────────────────────────────────────────────────────┤
-//! │ tensor data (concatenated, each individually padded)│
-//! └─────────────────────────────────────────────────────┘
-//! ```
-//!
-//! Each Q4_K block is 144 bytes:
-//! `d(2) + dmin(2) + scales(12) + qs(128)` — identical to our `BlockQ4K`.
 
 use std::{
     env,
     error::Error,
     fs::File,
-    io::{self, BufReader, Read, Seek, SeekFrom},
+    io::BufReader,
     time::{Duration, Instant},
 };
 
 use matrix_testing::{
-    matmul_q4k_fp16, BlockQ4K, K_SCALE_SIZE, QK_K,
+    matmul_q4k_fp16,
+    gguf::{parse_header, load_blocks, GGML_TYPE_Q4_K},
     rle::{encode, matmul_q4k_rle_fp16, BlockQ4KRle},
 };
 
-// ---------------------------------------------------------------------------
-// GGUF constants
-// ---------------------------------------------------------------------------
-
-/// File magic: bytes b"GGUF" interpreted as a little-endian u32.
-const GGUF_MAGIC: u32 = 0x4655_4747;
-
-/// Default tensor data alignment when not overridden by `general.alignment`.
-const GGUF_DEFAULT_ALIGNMENT: u64 = 32;
-
-/// GGML tensor type code for Q4_K (matches ggml.h `GGML_TYPE_Q4_K`).
-const GGML_TYPE_Q4_K: u32 = 12;
-
-/// Size in bytes of one Q4_K block: d(2) + dmin(2) + scales(12) + qs(128).
-const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144
-
-// GGUF metadata value type tags (gguf-spec §3.2).
-const GTYPE_U8:   u32 = 0;
-const GTYPE_I8:   u32 = 1;
-const GTYPE_U16:  u32 = 2;
-const GTYPE_I16:  u32 = 3;
-const GTYPE_U32:  u32 = 4;
-const GTYPE_I32:  u32 = 5;
-const GTYPE_F32:  u32 = 6;
-const GTYPE_BOOL: u32 = 7;
-const GTYPE_STR:  u32 = 8;
-const GTYPE_ARR:  u32 = 9;
-const GTYPE_U64:  u32 = 10;
-const GTYPE_I64:  u32 = 11;
-const GTYPE_F64:  u32 = 12;
-
-// ---------------------------------------------------------------------------
-// Primitive binary readers (little-endian, no deps)
-// ---------------------------------------------------------------------------
-
-fn read_u8(r: &mut impl Read) -> io::Result<u8> {
-    let mut b = [0u8; 1];
-    r.read_exact(&mut b)?;
-    Ok(b[0])
-}
-
-fn read_u16(r: &mut impl Read) -> io::Result<u16> {
-    let mut b = [0u8; 2];
-    r.read_exact(&mut b)?;
-    Ok(u16::from_le_bytes(b))
-}
-
-fn read_u32(r: &mut impl Read) -> io::Result<u32> {
-    let mut b = [0u8; 4];
-    r.read_exact(&mut b)?;
-    Ok(u32::from_le_bytes(b))
-}
-
-fn read_u64(r: &mut impl Read) -> io::Result<u64> {
-    let mut b = [0u8; 8];
-    r.read_exact(&mut b)?;
-    Ok(u64::from_le_bytes(b))
-}
-
-/// Read a GGUF length-prefixed UTF-8 string.
-fn read_str(r: &mut impl Read) -> io::Result<String> {
-    let len = read_u64(r)? as usize;
-    let mut buf = vec![0u8; len];
-    r.read_exact(&mut buf)?;
-    Ok(String::from_utf8_lossy(&buf).into_owned())
-}
-
-/// Skip one GGUF metadata value of the given type tag without storing it.
-fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> {
-    match tag {
-        GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL         => { read_u8(r)?; }
-        GTYPE_U16 | GTYPE_I16                     => { read_u16(r)?; }
-        GTYPE_U32 | GTYPE_I32 | GTYPE_F32         => { read_u32(r)?; }
-        GTYPE_U64 | GTYPE_I64 | GTYPE_F64         => { read_u64(r)?; }
-        GTYPE_STR                                 => { read_str(r)?; }
-        GTYPE_ARR => {
-            let elem_tag = read_u32(r)?;
-            let count    = read_u64(r)?;
-            for _ in 0..count {
-                skip_value(r, elem_tag)?;
-            }
-        }
-        t => return Err(io::Error::new(
-            io::ErrorKind::InvalidData,
-            format!("unknown GGUF value type {t}"),
-        )),
-    }
-    Ok(())
-}
-
-// ---------------------------------------------------------------------------
-// Tensor info
-// ---------------------------------------------------------------------------
-
-struct TensorInfo {
-    name:   String,
-    /// Dimensions in GGML order: dims[0] is the innermost (fastest-varying).
-    /// For a 2-D weight matrix: dims[0] = K (in-features), dims[1] = M (out-features).
-    dims:   Vec<u64>,
-    dtype:  u32,
-    /// Byte offset of this tensor's data measured from the start of the data
-    /// section (i.e. add `data_start` to get the absolute file offset).
-    offset: u64,
-}
-
-impl TensorInfo {
-    fn n_elements(&self) -> u64 {
-        self.dims.iter().product()
-    }
-
-    fn data_bytes(&self) -> u64 {
-        debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K);
-        (self.n_elements() / QK_K as u64) * BLOCK_BYTES as u64
-    }
-
-    /// Return (m, k) matrix dimensions.
-    /// dims[0] = K (column / inner dim), dims[1] = M (row / outer dim).
-    fn matrix_dims(&self) -> (usize, usize) {
-        assert_eq!(self.dims.len(), 2, "expected 2-D tensor");
-        let k = self.dims[0] as usize;
-        let m = self.dims[1] as usize;
-        (m, k)
-    }
-}
-
-// ---------------------------------------------------------------------------
-// GGUF header parser
-// ---------------------------------------------------------------------------
-
-/// Parse the GGUF file header and return `(tensor_infos, data_start_offset)`.
-///
-/// `data_start_offset` is the absolute byte position where tensor data begins.
-fn parse_header(path: &str) -> Result<(Vec<TensorInfo>, u64), Box<dyn Error>> {
-    let mut r = BufReader::new(File::open(path)?);
-
-    // Magic + version
-    let magic = read_u32(&mut r)?;
-    if magic != GGUF_MAGIC {
-        return Err(format!(
-            "not a GGUF file (expected magic {GGUF_MAGIC:#010x}, got {magic:#010x})"
-        ).into());
-    }
-    let version = read_u32(&mut r)?;
-    if !(2..=3).contains(&version) {
-        eprintln!("warning: unexpected GGUF version {version} — proceeding anyway");
-    }
-
-    let n_tensors  = read_u64(&mut r)? as usize;
-    let n_metadata = read_u64(&mut r)?;
-
-    // Scan metadata KV pairs; capture `general.alignment` if present.
-    let mut alignment = GGUF_DEFAULT_ALIGNMENT;
-    for _ in 0..n_metadata {
-        let key = read_str(&mut r)?;
-        let tag = read_u32(&mut r)?;
-        if key == "general.alignment" && tag == GTYPE_U32 {
-            alignment = read_u32(&mut r)? as u64;
-        } else {
-            skip_value(&mut r, tag)?;
-        }
-    }
-
-    // Tensor info records.
-    let mut tensors = Vec::with_capacity(n_tensors);
-    for _ in 0..n_tensors {
-        let name   = read_str(&mut r)?;
-        let n_dims = read_u32(&mut r)? as usize;
-        let dims: Vec<u64> = (0..n_dims)
-            .map(|_| read_u64(&mut r))
-            .collect::<io::Result<_>>()?;
-        let dtype  = read_u32(&mut r)?;
-        let offset = read_u64(&mut r)?;
-        tensors.push(TensorInfo { name, dims, dtype, offset });
-    }
-
-    // Data starts at the next `alignment`-byte boundary after the header.
-    let header_end = r.stream_position()?;
-    let data_start = header_end.div_ceil(alignment) * alignment;
-
-    Ok((tensors, data_start))
-}
-
-// ---------------------------------------------------------------------------
-// Block loader
-// ---------------------------------------------------------------------------
-
-/// Seek to the tensor's data and read its Q4_K blocks into a Vec.
-fn load_blocks(
-    r:          &mut (impl Read + Seek),
-    data_start: u64,
-    tensor:     &TensorInfo,
-) -> io::Result<Vec<BlockQ4K>> {
-    r.seek(SeekFrom::Start(data_start + tensor.offset))?;
-
-    let n_blocks = (tensor.n_elements() / QK_K as u64) as usize;
-    let mut blocks = Vec::with_capacity(n_blocks);
-    let mut buf = [0u8; BLOCK_BYTES];
-
-    for _ in 0..n_blocks {
-        r.read_exact(&mut buf)?;
-        // Parse field by field — safe, no transmute.
-        // Layout: d(0..2) dmin(2..4) scales(4..16) qs(16..144)
-        blocks.push(BlockQ4K {
-            d:      u16::from_le_bytes([buf[0], buf[1]]),
-            dmin:   u16::from_le_bytes([buf[2], buf[3]]),
-            scales: buf[4..16].try_into().unwrap(),
-            qs:     buf[16..BLOCK_BYTES].try_into().unwrap(),
-        });
-    }
-
-    Ok(blocks)
-}
-
 // ---------------------------------------------------------------------------
 // Random FP16 activation matrix (no external rand dep)
 // ---------------------------------------------------------------------------
 
-/// Minimal 64-bit LCG (Knuth / PCG constants).
 struct Lcg(u64);
 
 impl Lcg {
     fn new(seed: u64) -> Self { Self(seed) }
 
-    /// Return the next pseudo-random f32 in (−0.05, +0.05).
-    /// This is a plausible scale for normalised transformer activations.
+    /// Returns the next pseudo-random f32 in (−0.05, +0.05).
     fn next_f32(&mut self) -> f32 {
         self.0 = self.0
             .wrapping_mul(6_364_136_223_846_793_005)
             .wrapping_add(1_442_695_040_888_963_407);
-        // Map high 32 bits to [0, 1) then shift to (−0.05, +0.05).
         (self.0 >> 32) as f32 / 4_294_967_296.0 * 0.10 - 0.05
     }
 }
@@ -283,8 +49,8 @@ fn f32_to_fp16(f: f32) -> u16 {
     let sign     = ((bits >> 31) as u16) << 15;
     let exp      = ((bits >> 23) & 0xFF) as i32 - 127 + 15;
     let mantissa = (bits & 0x007F_FFFF) >> 13;
-    if exp <= 0  { return sign; }           // underflow → signed zero
-    if exp >= 31 { return sign | 0x7C00; }  // overflow  → signed infinity
+    if exp <= 0  { return sign; }
+    if exp >= 31 { return sign | 0x7C00; }
     sign | ((exp as u16) << 10) | mantissa as u16
 }
 
@@ -335,23 +101,17 @@ fn main() -> Result<(), Box<dyn Error>> {
     println!("Parsing {path} …");
     let (tensors, data_start) = parse_header(path)?;
 
-    // List all tensor names and types for context.
     let n_q4k = tensors.iter().filter(|t| t.dtype == GGML_TYPE_Q4_K).count();
     println!("  {} tensors total, {} are Q4_K", tensors.len(), n_q4k);
 
     // ── Select the first suitable 2-D Q4_K tensor ───────────────────────────
-    // "Suitable" means 2-D and K divisible by QK_K (required for our matmul).
     let tensor = tensors
         .iter()
-        .find(|t| {
-            t.dtype == GGML_TYPE_Q4_K
-                && t.dims.len() == 2
-                && t.dims[0] % QK_K as u64 == 0
-        })
+        .find(|t| t.is_usable_q4k())
         .ok_or("no suitable 2-D Q4_K tensor found in this GGUF file")?;
 
     let (m, k)   = tensor.matrix_dims();
-    let n_blocks = m * (k / QK_K);
+    let n_blocks = tensor.n_blocks();
     let data_mib = tensor.data_bytes() as f64 / (1u64 << 20) as f64;
 
     println!();
@@ -368,9 +128,9 @@ fn main() -> Result<(), Box<dyn Error>> {
     let t0 = Instant::now();
     let mut file = BufReader::new(File::open(path)?);
     let blocks = load_blocks(&mut file, data_start, tensor)?;
-    println!("{:.3} s  ({} blocks × {} B)", t0.elapsed().as_secs_f64(), n_blocks, BLOCK_BYTES);
+    println!("{:.3} s", t0.elapsed().as_secs_f64());
 
-    // ── Build random activation matrix [K × N] ───────────────────────────────
+    // ── Build random activation matrix [K × N] ──────────────────────────────
     let b_fp16 = random_fp16_matrix(k, n, 0xDEAD_BEEF_CAFE_1234);
 
     // ── Baseline matmul (best of `trials`) ──────────────────────────────────
@@ -388,26 +148,19 @@ fn main() -> Result<(), Box<dyn Error>> {
         blocks.iter().map(encode).collect()
     });
 
-    for block in &rle_blocks {
-        println!("Got value {:?}", block);
-        for pair in block.qs {
-            println!("top {} bottom {}", (pair >> 4), (pair & 0b1111));
-        }
-        break;
-    }
-
-    let n_rle       = rle_blocks.iter().filter(|b| b.is_rle()).count();
-    let n_raw       = n_blocks - n_rle;
-    let avg_pairs   = if n_rle > 0 {
+    let n_rle     = rle_blocks.iter().filter(|b| b.is_rle()).count();
+    let n_raw     = n_blocks - n_rle;
+    let avg_pairs = if n_rle > 0 {
         rle_blocks.iter()
             .filter(|b| b.is_rle())
             .map(|b| b.rle_len() as f64)
             .sum::<f64>() / n_rle as f64
-    } else { 0.0 };
-
+    } else {
+        0.0
+    };
     println!(
-        "Encode  : {:.3} s   RLE {n_rle}/{n_blocks} blocks ({:.1}%),  \
-         raw {n_raw}/{n_blocks} ({:.1}%),  avg {avg_pairs:.1} pairs/RLE block",
+        "Encode  : {:.3} s   RLE {n_rle}/{n_blocks} ({:.1}%)  \
+         raw {n_raw}/{n_blocks} ({:.1}%)  avg {avg_pairs:.1} pairs/RLE block",
         t_enc.as_secs_f64(),
         100.0 * n_rle as f64 / n_blocks as f64,
         100.0 * n_raw as f64 / n_blocks as f64,
@@ -438,11 +191,15 @@ fn main() -> Result<(), Box<dyn Error>> {
 
     // ── Summary ──────────────────────────────────────────────────────────────
     println!();
-    println!("Speedup (matmul only):          {:.2}×", t_base.as_secs_f64() / t_rle.as_secs_f64());
-    println!("Speedup (matmul + encode once): {:.2}×",
-        t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64());
+    println!(
+        "Speedup (matmul only):          {:.2}×",
+        t_base.as_secs_f64() / t_rle.as_secs_f64()
+    );
+    println!(
+        "Speedup (matmul + encode once): {:.2}×",
+        t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64()
+    );
 
-    // Show a small slice of the output so it's clear something real happened.
     let show = n.min(4);
     print!("First {show} output(s) of row 0: ");
     for j in 0..show {
diff --git a/src/bin/gguf_scan.rs b/src/bin/gguf_scan.rs
new file mode 100644
index 0000000..d930496
--- /dev/null
+++ b/src/bin/gguf_scan.rs
@@ -0,0 +1,197 @@
+//! Scan every Q4_K tensor in a GGUF file and report per-tensor RLE compression
+//! statistics, without running a full matrix multiply.
+//!
+//! Each block is streamed from disk (one 144-byte buffer on the stack) and
+//! immediately encoded; no Vec of blocks is retained between tensors.
+//!
+//! # Usage
+//!
+//! ```text
+//! cargo run --release --bin gguf_scan -- <model.gguf>
+//! ```
+
+use std::{
+    env,
+    error::Error,
+    fs::File,
+    io::{self, BufReader, Write},
+    time::Instant,
+};
+
+use matrix_testing::{
+    gguf::{for_each_block, parse_header},
+    rle::encode,
+};
+
+// ---------------------------------------------------------------------------
+// Per-tensor RLE statistics
+// ---------------------------------------------------------------------------
+
+#[derive(Default)]
+struct TensorStats { // running counters for one tensor, updated by `observe`
+    n_blocks:   usize, // every block passed to `observe`
+    n_rle:      usize, // blocks observed with `is_rle == true`
+    total_pairs: usize, // sum of rle_len() for RLE blocks only
+    min_pairs:  usize, // smallest `pairs` among RLE blocks; `new()` seeds it with usize::MAX
+    max_pairs:  usize, // largest `pairs` among RLE blocks; stays 0 until one is seen
+}
+
+impl TensorStats {
+    /// Empty accumulator; `min_pairs` starts at `usize::MAX` so any RLE block lowers it.
+    fn new() -> Self {
+        Self { n_blocks: 0, n_rle: 0, total_pairs: 0, min_pairs: usize::MAX, max_pairs: 0 }
+    }
+
+    /// Record one block; the pair statistics only take RLE blocks into account.
+    fn observe(&mut self, is_rle: bool, pairs: usize) {
+        self.n_blocks += 1;
+        if !is_rle { return; }
+        self.n_rle       += 1;
+        self.total_pairs += pairs;
+        self.min_pairs    = pairs.min(self.min_pairs);
+        self.max_pairs    = pairs.max(self.max_pairs);
+    }
+
+    /// Percentage of observed blocks that were RLE (0.0 when nothing was observed).
+    fn rle_pct(&self) -> f64 {
+        let Self { n_blocks, n_rle, .. } = *self;
+        if n_blocks == 0 { 0.0 } else { 100.0 * n_rle as f64 / n_blocks as f64 }
+    }
+
+    /// Mean pair count per RLE block, or `None` when no RLE block was seen.
+    fn avg_pairs(&self) -> Option<f64> {
+        (self.n_rle > 0).then(|| self.total_pairs as f64 / self.n_rle as f64)
+    }
+
+    /// Smallest pair count among RLE blocks, or `None` when there were none.
+    fn min_pairs(&self) -> Option<usize> { (self.n_rle > 0).then_some(self.min_pairs) }
+
+    /// Largest pair count among RLE blocks, or `None` when there were none.
+    fn max_pairs(&self) -> Option<usize> { (self.n_rle > 0).then_some(self.max_pairs) }
+}
+
+// ---------------------------------------------------------------------------
+// Formatting helpers
+// ---------------------------------------------------------------------------
+
+fn fmt_count(n: usize) -> String {
+    // Walk the decimal digits left to right and emit '_' before every digit
+    // that has a multiple of three digits (itself included) still to print.
+    let digits = n.to_string();
+    let mut grouped = String::with_capacity(digits.len() + digits.len() / 3);
+    for (idx, digit) in digits.chars().enumerate() {
+        if idx > 0 && (digits.len() - idx) % 3 == 0 { grouped.push('_'); }
+        grouped.push(digit);
+    }
+    grouped
+}
+
+/// Truncate or pad `s` to exactly `width` characters (Unicode scalar values).
+///
+/// Works on `char`s rather than bytes, so a multi-byte name is never sliced
+/// in the middle of a code point (byte slicing there would panic).
+fn fixed(s: &str, width: usize) -> String {
+    let clipped: String = s.chars().take(width).collect();
+    format!("{clipped:<width$}")
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+fn main() -> Result<(), Box<dyn Error>> {
+    let args: Vec<String> = env::args().collect();
+    if args.len() < 2 {
+        eprintln!("usage: {} <model.gguf>", args[0]);
+        std::process::exit(1);
+    }
+    let path = &args[1];
+
+    // ── Parse header ─────────────────────────────────────────────────────────
+    eprintln!("Parsing {path} …");
+    let (tensors, data_start) = parse_header(path)?;
+
+    let q4k_tensors: Vec<_> = tensors
+        .iter()
+        .filter(|t| t.is_usable_q4k())
+        .collect();
+
+    let other_count = tensors.len() - q4k_tensors.len();
+    eprintln!(
+        "  {} tensors total: {} Q4_K (will scan), {} other (skipped)",
+        tensors.len(),
+        q4k_tensors.len(),
+        other_count,
+    );
+    eprintln!();
+
+    // ── Header row ───────────────────────────────────────────────────────────
+    // Columns: Name(40) | Rows(9) | Cols(9) | Blocks(9) | RLE%(7) | Pairs — same widths as the rows.
+    println!(
+        "{:<40}  {:>9} {:>9}  {:>9}  {:>7}  {}",
+        "Tensor", "Rows", "Cols", "Blocks", "RLE%", "Pairs (avg / min–max)"
+    );
+    println!("{}", "─".repeat(100));
+
+    // ── Scan each tensor ─────────────────────────────────────────────────────
+    let mut file   = BufReader::new(File::open(path)?);
+    let scan_start = Instant::now();
+
+    let mut global_blocks = 0usize;
+    let mut global_rle    = 0usize;
+    let mut any_rle       = false;
+
+    for tensor in &q4k_tensors {
+        let (m, k) = tensor.matrix_dims();
+        let mut stats = TensorStats::new();
+
+        for_each_block(&mut file, data_start, tensor, |block| {
+            let rle_block = encode(block);
+            stats.observe(rle_block.is_rle(), rle_block.rle_len());
+        })?;
+
+        global_blocks += stats.n_blocks;
+        global_rle    += stats.n_rle;
+        if stats.n_rle > 0 { any_rle = true; }
+
+        // Format the pairs column — a dash when no RLE blocks were found.
+        let pairs_col = match (stats.avg_pairs(), stats.min_pairs(), stats.max_pairs()) {
+            (Some(avg), Some(lo), Some(hi)) => format!("{avg:.1}  ({lo}–{hi})"),
+            _                               => "—".to_string(),
+        };
+
+        println!(
+            "{}  {:>9} {:>9}  {:>9}  {:>6.1}%  {}",
+            fixed(&tensor.name, 40),
+            fmt_count(m),
+            fmt_count(k),
+            fmt_count(stats.n_blocks),
+            stats.rle_pct(),
+            pairs_col,
+        );
+
+        // Flush so the user sees progress on slow storage.
+        let _ = io::stdout().flush();
+    }
+
+    // ── Summary ──────────────────────────────────────────────────────────────
+    let elapsed = scan_start.elapsed();
+    println!("{}", "─".repeat(100));
+    println!(
+        "Tensors : {}   Blocks scanned: {}   RLE blocks: {} ({:.2}%)   Time: {:.1} s",
+        fmt_count(q4k_tensors.len()),
+        fmt_count(global_blocks),
+        fmt_count(global_rle),
+        100.0 * global_rle as f64 / global_blocks.max(1) as f64,
+        elapsed.as_secs_f64(),
+    );
+
+    if !any_rle {
+        println!();
+        println!("No blocks compressed with RLE — all weights are effectively random at");
+        println!("the byte level, which is typical for trained Q4_K quantised weights.");
+        println!("RLE compression only helps for structured weight matrices (binary,");
+        println!("ternary, heavily pruned, or synthetic).");
+    }
+
+    Ok(())
+}
diff --git a/src/gguf.rs b/src/gguf.rs
new file mode 100644
index 0000000..0815564
--- /dev/null
+++ b/src/gguf.rs
@@ -0,0 +1,329 @@
+//! Minimal GGUF (v2 / v3) file parser.
+//!
+//! Provides just enough to locate Q4_K tensor data inside a GGUF file and
+//! stream its raw blocks without loading the whole file into memory.
+//!
+//! # GGUF file layout
+//!
+//! ```text
+//! ┌────────────────────────────────────────────────────────────┐
+//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv_pairs u64  │
+//! ├────────────────────────────────────────────────────────────┤
+//! │ metadata key-value pairs  (variable length)                │
+//! ├────────────────────────────────────────────────────────────┤
+//! │ tensor info records       (fixed structure, variable count)│
+//! ├────────────────────────────────────────────────────────────┤
+//! │ padding to `alignment` boundary (default 32 bytes)         │
+//! ├────────────────────────────────────────────────────────────┤
+//! │ tensor data — each tensor at its declared offset           │
+//! └────────────────────────────────────────────────────────────┘
+//! ```
+//!
+//! Each Q4_K block on disk is 144 bytes and is binary-compatible with
+//! [`crate::BlockQ4K`]:
+//!
+//! ```text
+//! d(2) + dmin(2) + scales(12) + qs(128) = 144 bytes
+//! ```
+
+use std::{
+    error::Error,
+    fs::File,
+    io::{self, BufReader, Read, Seek, SeekFrom},
+};
+
+use crate::{BlockQ4K, K_SCALE_SIZE, QK_K};
+
+// ---------------------------------------------------------------------------
+// Public constants
+// ---------------------------------------------------------------------------
+
+/// File magic: the bytes `b"GGUF"` read as a little-endian `u32`.
+pub const GGUF_MAGIC: u32 = 0x4655_4747;
+
+/// Tensor data alignment used when `general.alignment` is absent.
+pub const GGUF_DEFAULT_ALIGNMENT: u64 = 32;
+
+/// GGML tensor type code for Q4_K (ggml.h `GGML_TYPE_Q4_K`).
+pub const GGML_TYPE_Q4_K: u32 = 12;
+
+/// Raw byte size of one Q4_K block: `d(2) + dmin(2) + scales(12) + qs(128)`.
+pub const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144
+
+// ---------------------------------------------------------------------------
+// GGUF metadata value type tags (gguf-spec §3.2)
+// ---------------------------------------------------------------------------
+
+const GTYPE_U8:   u32 = 0;
+const GTYPE_I8:   u32 = 1;
+const GTYPE_U16:  u32 = 2;
+const GTYPE_I16:  u32 = 3;
+const GTYPE_U32:  u32 = 4;
+const GTYPE_I32:  u32 = 5;
+const GTYPE_F32:  u32 = 6;
+const GTYPE_BOOL: u32 = 7;
+const GTYPE_STR:  u32 = 8;
+const GTYPE_ARR:  u32 = 9;
+const GTYPE_U64:  u32 = 10;
+const GTYPE_I64:  u32 = 11;
+const GTYPE_F64:  u32 = 12;
+
+// ---------------------------------------------------------------------------
+// Private byte-level readers (all little-endian)
+// ---------------------------------------------------------------------------
+
+fn read_u8(r: &mut impl Read) -> io::Result<u8> {
+    let mut byte = 0u8;
+    r.read_exact(std::slice::from_mut(&mut byte))?;
+    Ok(byte)
+}
+
+fn read_u16(r: &mut impl Read) -> io::Result<u16> {
+    // Fill a two-byte scratch array, then decode it as little-endian.
+    let mut raw = [0u8; std::mem::size_of::<u16>()];
+    r.read_exact(&mut raw).map(|()| u16::from_le_bytes(raw))
+}
+
+fn read_u32(r: &mut impl Read) -> io::Result<u32> {
+    // Four little-endian bytes → `u32`; a failed read is returned unchanged.
+    let mut word = [0u8; std::mem::size_of::<u32>()];
+    r.read_exact(&mut word).map(|()| u32::from_le_bytes(word))
+}
+
+fn read_u64(r: &mut impl Read) -> io::Result<u64> {
+    // Eight little-endian bytes → `u64`; a failed read is returned unchanged.
+    let mut quad = [0u8; std::mem::size_of::<u64>()];
+    r.read_exact(&mut quad).map(|()| u64::from_le_bytes(quad))
+}
+
+/// Read a GGUF length-prefixed string (`u64` byte count, then raw bytes, no NUL;
+/// decoded lossily).  The buffer grows with the data, so a bogus length cannot over-allocate.
+fn read_str(r: &mut impl Read) -> io::Result<String> {
+    let (len, mut buf) = (read_u64(r)?, Vec::new());
+    let got = r.by_ref().take(len).read_to_end(&mut buf)? as u64;
+    if got != len { return Err(io::ErrorKind::UnexpectedEof.into()); }
+    Ok(String::from_utf8_lossy(&buf).into_owned())
+}
+
+/// Consume and discard one GGUF metadata value of type `tag`.
+///
+/// Scalars and strings are read with the matching `read_*` helper and the
+/// result is dropped.  Arrays are skipped element by element, recursing once
+/// per element.  A tag outside the known set yields an `InvalidData` error.
+fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> {
+    match tag {
+        GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL  => read_u8(r).map(drop),
+        GTYPE_U16 | GTYPE_I16             => read_u16(r).map(drop),
+        GTYPE_U32 | GTYPE_I32 | GTYPE_F32 => read_u32(r).map(drop),
+        GTYPE_U64 | GTYPE_I64 | GTYPE_F64 => read_u64(r).map(drop),
+        GTYPE_STR                         => read_str(r).map(drop),
+        GTYPE_ARR => {
+            // Element type tag, then element count, then the elements themselves.
+            let elem_tag = read_u32(r)?;
+            let count    = read_u64(r)?;
+            (0..count).try_for_each(|_| skip_value(r, elem_tag))
+        }
+        t => Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("unknown GGUF metadata value type {t}"),
+        )),
+    }
+}
+
+// ---------------------------------------------------------------------------
+// TensorInfo
+// ---------------------------------------------------------------------------
+
+/// Metadata for one tensor as recorded in the GGUF header (see [`parse_header`]).
+#[derive(Debug, Clone)]
+pub struct TensorInfo {
+    /// Tensor name from the file (e.g. `"blk.0.attn_q.weight"`), decoded lossily as UTF-8.
+    pub name: String,
+
+    /// Shape in GGML dimension order: `dims[0]` is the innermost
+    /// (fastest-varying) axis.
+    ///
+    /// For a 2-D weight matrix:
+    /// - `dims[0]` = K (input features / column count)
+    /// - `dims[1]` = M (output features / row count)
+    pub dims: Vec<u64>,
+
+    /// GGML type code exactly as read from the file (e.g. [`GGML_TYPE_Q4_K`] = 12).
+    pub dtype: u32,
+
+    /// Byte offset of this tensor's data measured **from the start of the
+    /// data section** (add `data_start` from [`parse_header`] to get the
+    /// absolute file offset).
+    pub offset: u64,
+}
+
+impl TensorInfo {
+    /// Product of all dimension sizes, i.e. the tensor's element count.
+    pub fn n_elements(&self) -> u64 {
+        self.dims.iter().copied().product()
+    }
+
+    /// Count of whole `QK_K`-element blocks (only meaningful for Q4_K tensors).
+    pub fn n_blocks(&self) -> usize {
+        let blocks = self.n_elements() / QK_K as u64;
+        blocks as usize
+    }
+
+    /// On-disk size of this tensor's data in bytes: `n_blocks() * BLOCK_BYTES`.
+    pub fn data_bytes(&self) -> u64 {
+        debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K);
+        BLOCK_BYTES as u64 * self.n_blocks() as u64
+    }
+
+    /// Matrix shape `(m, k)`: `m` = `dims[1]` (rows), `k` = `dims[0]` (columns).
+    ///
+    /// # Panics
+    /// Panics unless the tensor has exactly two dimensions.
+    pub fn matrix_dims(&self) -> (usize, usize) {
+        assert_eq!(self.dims.len(), 2, "expected a 2-D tensor");
+        let (k, m) = (self.dims[0], self.dims[1]);
+        (m as usize, k as usize)
+    }
+
+    /// Whether this is a 2-D Q4_K tensor whose `dims[0]` is a multiple of
+    /// `QK_K` (the shape our matmul requires).
+    pub fn is_usable_q4k(&self) -> bool {
+        self.dtype == GGML_TYPE_Q4_K
+            && matches!(self.dims.as_slice(), [inner, _] if inner % QK_K as u64 == 0)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Header parser
+// ---------------------------------------------------------------------------
+
+/// Open the GGUF file at `path`, parse its header, and return
+/// `(tensor_infos, data_start_offset)`.
+///
+/// `data_start_offset` is the **absolute** byte position in the file where
+/// the first tensor's data begins (after header + alignment padding).
+///
+/// # Errors
+///
+/// Returns an error if the magic is wrong, if I/O fails, if an unknown metadata
+/// value type is encountered, or if `general.alignment` is zero or overflows.
+pub fn parse_header(path: &str) -> Result<(Vec<TensorInfo>, u64), Box<dyn Error>> {
+    let mut r = BufReader::new(File::open(path)?);
+
+    // Magic number
+    let magic = read_u32(&mut r)?;
+    if magic != GGUF_MAGIC {
+        return Err(format!(
+            "not a GGUF file (expected {GGUF_MAGIC:#010x}, got {magic:#010x})"
+        ).into());
+    }
+
+    // Version — we support 2 and 3; warn on anything else and try anyway.
+    let version = read_u32(&mut r)?;
+    if !(2..=3).contains(&version) {
+        eprintln!("warning: unexpected GGUF version {version} — proceeding anyway");
+    }
+
+    let n_tensors  = read_u64(&mut r)? as usize;
+    let n_metadata = read_u64(&mut r)?;
+
+    // Scan metadata KV pairs; pull out `general.alignment` if present.
+    let mut alignment = GGUF_DEFAULT_ALIGNMENT;
+    for _ in 0..n_metadata {
+        let key = read_str(&mut r)?;
+        let tag = read_u32(&mut r)?;
+        if key == "general.alignment" && tag == GTYPE_U32 {
+            alignment = read_u32(&mut r)? as u64;
+        } else {
+            skip_value(&mut r, tag)?;
+        }
+    }
+
+    // Tensor info records; the count comes from the file, so cap the up-front reservation.
+    let mut tensors = Vec::with_capacity(n_tensors.min(4096));
+    for _ in 0..n_tensors {
+        let name   = read_str(&mut r)?;
+        let n_dims = read_u32(&mut r)? as usize;
+        let dims = (0..n_dims).map(|_| read_u64(&mut r)).collect::<io::Result<Vec<u64>>>()?;
+        let dtype  = read_u32(&mut r)?;
+        let offset = read_u64(&mut r)?;
+        tensors.push(TensorInfo { name, dims, dtype, offset });
+    }
+
+    // Data starts at the next multiple of `alignment`; zero/overflow is an error, not a panic.
+    let header_end = r.stream_position()?;
+    let data_start = header_end
+        .checked_next_multiple_of(alignment)
+        .ok_or("invalid GGUF `general.alignment`: zero or too large")?;
+
+    Ok((tensors, data_start))
+}
+
+// ---------------------------------------------------------------------------
+// Block readers
+// ---------------------------------------------------------------------------
+
+/// Decode one [`BlockQ4K`] from its 144-byte on-disk (GGUF / GGML) image.
+///
+/// Field layout, little-endian: `d` = bytes 0–1, `dmin` = bytes 2–3,
+/// `scales` = bytes 4–15, `qs` = bytes 16–143.
+#[inline]
+pub fn parse_block(buf: &[u8; BLOCK_BYTES]) -> BlockQ4K {
+    let (header, qs) = buf.split_at(16);
+    let (halves, scales) = header.split_at(4);
+    let d    = u16::from_le_bytes([halves[0], halves[1]]);
+    let dmin = u16::from_le_bytes([halves[2], halves[3]]);
+    // `scales` / `qs` are the same byte ranges as `buf[4..16]` / `buf[16..BLOCK_BYTES]`.
+    BlockQ4K { d, dmin, scales: scales.try_into().unwrap(), qs: qs.try_into().unwrap() }
+}
+
+/// Load all Q4_K blocks in `tensor` into a freshly allocated `Vec`.
+///
+/// The reader is seeked to `data_start + tensor.offset` first (`data_start`
+/// comes from [`parse_header`]); an overflowing sum is reported as `InvalidData`.
+pub fn load_blocks(
+    r:          &mut (impl Read + Seek),
+    data_start: u64,
+    tensor:     &TensorInfo,
+) -> io::Result<Vec<BlockQ4K>> {
+    let start = data_start.checked_add(tensor.offset).ok_or_else(|| {
+        io::Error::new(io::ErrorKind::InvalidData, "tensor data offset overflows u64")
+    })?;
+    r.seek(SeekFrom::Start(start))?;
+    let n = tensor.n_blocks();
+    let mut blocks = Vec::with_capacity(n);
+    let mut buf = [0u8; BLOCK_BYTES];
+    for _ in 0..n {
+        r.read_exact(&mut buf)?;
+        blocks.push(parse_block(&buf));
+    }
+    Ok(blocks)
+}
+
+/// Iterate over every Q4_K block in `tensor`, calling `f` with a reference
+/// to each parsed [`BlockQ4K`] in turn.
+///
+/// Only one block-sized buffer (144 bytes) lives on the stack and no `Vec` is
+/// built, which suits streaming scans that do not need to retain blocks.
+///
+/// The reader is seeked to `data_start + tensor.offset` first (`data_start`
+/// comes from [`parse_header`]); an overflowing sum is reported as `InvalidData`.
+pub fn for_each_block<F>(
+    r:          &mut (impl Read + Seek),
+    data_start: u64,
+    tensor:     &TensorInfo,
+    mut f:      F,
+) -> io::Result<()>
+where
+    F: FnMut(&BlockQ4K),
+{
+    let start = data_start.checked_add(tensor.offset).ok_or_else(|| {
+        io::Error::new(io::ErrorKind::InvalidData, "tensor data offset overflows u64")
+    })?;
+    r.seek(SeekFrom::Start(start))?;
+    let mut buf = [0u8; BLOCK_BYTES];
+    for _ in 0..tensor.n_blocks() {
+        r.read_exact(&mut buf)?;
+        f(&parse_block(&buf));
+    }
+    Ok(())
+}
diff --git a/src/lib.rs b/src/lib.rs
index ded81b3..9063444 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,6 +19,7 @@
 //! f32, accumulate dot-products.  No SIMD, no tiling, no tricks.
 
 pub mod rle;
+pub mod gguf;
 
 // ---------------------------------------------------------------------------
 // Constants matching GGML's ggml-common.h