diff --git a/src/bin/gguf_matmul.rs b/src/bin/gguf_matmul.rs index e2da916..0b4ef81 100644 --- a/src/bin/gguf_matmul.rs +++ b/src/bin/gguf_matmul.rs @@ -4,274 +4,40 @@ //! # Usage //! //! ```text -//! cargo run --release --bin gguf_matmul -- [n] +//! cargo run --release --bin gguf_matmul -- [n] [trials] //! ``` //! //! `n` is the number of activation columns (token count / batch size). //! Defaults to 1 (single-token inference). -//! -//! # GGUF layout (v2 / v3) -//! -//! ```text -//! ┌─────────────────────────────────────────────────────┐ -//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv u64 │ -//! ├─────────────────────────────────────────────────────┤ -//! │ metadata key-value pairs (variable length) │ -//! ├─────────────────────────────────────────────────────┤ -//! │ tensor info records (variable length) │ -//! ├─────────────────────────────────────────────────────┤ -//! │ padding to `alignment` boundary (default 32 bytes) │ -//! ├─────────────────────────────────────────────────────┤ -//! │ tensor data (concatenated, each individually padded)│ -//! └─────────────────────────────────────────────────────┘ -//! ``` -//! -//! Each Q4_K block is 144 bytes: -//! `d(2) + dmin(2) + scales(12) + qs(128)` — identical to our `BlockQ4K`. use std::{ env, error::Error, fs::File, - io::{self, BufReader, Read, Seek, SeekFrom}, + io::BufReader, time::{Duration, Instant}, }; use matrix_testing::{ - matmul_q4k_fp16, BlockQ4K, K_SCALE_SIZE, QK_K, + matmul_q4k_fp16, + gguf::{parse_header, load_blocks, GGML_TYPE_Q4_K}, rle::{encode, matmul_q4k_rle_fp16, BlockQ4KRle}, }; -// --------------------------------------------------------------------------- -// GGUF constants -// --------------------------------------------------------------------------- - -/// File magic: bytes b"GGUF" interpreted as a little-endian u32. -const GGUF_MAGIC: u32 = 0x4655_4747; - -/// Default tensor data alignment when not overridden by `general.alignment`. -const GGUF_DEFAULT_ALIGNMENT: u64 = 32; - -/// GGML tensor type code for Q4_K (matches ggml.h `GGML_TYPE_Q4_K`). -const GGML_TYPE_Q4_K: u32 = 12; - -/// Size in bytes of one Q4_K block: d(2) + dmin(2) + scales(12) + qs(128). -const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144 - -// GGUF metadata value type tags (gguf-spec §3.2). -const GTYPE_U8: u32 = 0; -const GTYPE_I8: u32 = 1; -const GTYPE_U16: u32 = 2; -const GTYPE_I16: u32 = 3; -const GTYPE_U32: u32 = 4; -const GTYPE_I32: u32 = 5; -const GTYPE_F32: u32 = 6; -const GTYPE_BOOL: u32 = 7; -const GTYPE_STR: u32 = 8; -const GTYPE_ARR: u32 = 9; -const GTYPE_U64: u32 = 10; -const GTYPE_I64: u32 = 11; -const GTYPE_F64: u32 = 12; - -// --------------------------------------------------------------------------- -// Primitive binary readers (little-endian, no deps) -// --------------------------------------------------------------------------- - -fn read_u8(r: &mut impl Read) -> io::Result { - let mut b = [0u8; 1]; - r.read_exact(&mut b)?; - Ok(b[0]) -} - -fn read_u16(r: &mut impl Read) -> io::Result { - let mut b = [0u8; 2]; - r.read_exact(&mut b)?; - Ok(u16::from_le_bytes(b)) -} - -fn read_u32(r: &mut impl Read) -> io::Result { - let mut b = [0u8; 4]; - r.read_exact(&mut b)?; - Ok(u32::from_le_bytes(b)) -} - -fn read_u64(r: &mut impl Read) -> io::Result { - let mut b = [0u8; 8]; - r.read_exact(&mut b)?; - Ok(u64::from_le_bytes(b)) -} - -/// Read a GGUF length-prefixed UTF-8 string. -fn read_str(r: &mut impl Read) -> io::Result { - let len = read_u64(r)? as usize; - let mut buf = vec![0u8; len]; - r.read_exact(&mut buf)?; - Ok(String::from_utf8_lossy(&buf).into_owned()) -} - -/// Skip one GGUF metadata value of the given type tag without storing it. -fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> { - match tag { - GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL => { read_u8(r)?; } - GTYPE_U16 | GTYPE_I16 => { read_u16(r)?; } - GTYPE_U32 | GTYPE_I32 | GTYPE_F32 => { read_u32(r)?; } - GTYPE_U64 | GTYPE_I64 | GTYPE_F64 => { read_u64(r)?; } - GTYPE_STR => { read_str(r)?; } - GTYPE_ARR => { - let elem_tag = read_u32(r)?; - let count = read_u64(r)?; - for _ in 0..count { - skip_value(r, elem_tag)?; - } - } - t => return Err(io::Error::new( - io::ErrorKind::InvalidData, - format!("unknown GGUF value type {t}"), - )), - } - Ok(()) -} - -// --------------------------------------------------------------------------- -// Tensor info -// --------------------------------------------------------------------------- - -struct TensorInfo { - name: String, - /// Dimensions in GGML order: dims[0] is the innermost (fastest-varying). - /// For a 2-D weight matrix: dims[0] = K (in-features), dims[1] = M (out-features). - dims: Vec, - dtype: u32, - /// Byte offset of this tensor's data measured from the start of the data - /// section (i.e. add `data_start` to get the absolute file offset). - offset: u64, -} - -impl TensorInfo { - fn n_elements(&self) -> u64 { - self.dims.iter().product() - } - - fn data_bytes(&self) -> u64 { - debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K); - (self.n_elements() / QK_K as u64) * BLOCK_BYTES as u64 - } - - /// Return (m, k) matrix dimensions. - /// dims[0] = K (column / inner dim), dims[1] = M (row / outer dim). - fn matrix_dims(&self) -> (usize, usize) { - assert_eq!(self.dims.len(), 2, "expected 2-D tensor"); - let k = self.dims[0] as usize; - let m = self.dims[1] as usize; - (m, k) - } -} - -// --------------------------------------------------------------------------- -// GGUF header parser -// --------------------------------------------------------------------------- - -/// Parse the GGUF file header and return `(tensor_infos, data_start_offset)`. -/// -/// `data_start_offset` is the absolute byte position where tensor data begins. -fn parse_header(path: &str) -> Result<(Vec, u64), Box> { - let mut r = BufReader::new(File::open(path)?); - - // Magic + version - let magic = read_u32(&mut r)?; - if magic != GGUF_MAGIC { - return Err(format!( - "not a GGUF file (expected magic {GGUF_MAGIC:#010x}, got {magic:#010x})" - ).into()); - } - let version = read_u32(&mut r)?; - if !(2..=3).contains(&version) { - eprintln!("warning: unexpected GGUF version {version} — proceeding anyway"); - } - - let n_tensors = read_u64(&mut r)? as usize; - let n_metadata = read_u64(&mut r)?; - - // Scan metadata KV pairs; capture `general.alignment` if present. - let mut alignment = GGUF_DEFAULT_ALIGNMENT; - for _ in 0..n_metadata { - let key = read_str(&mut r)?; - let tag = read_u32(&mut r)?; - if key == "general.alignment" && tag == GTYPE_U32 { - alignment = read_u32(&mut r)? as u64; - } else { - skip_value(&mut r, tag)?; - } - } - - // Tensor info records. - let mut tensors = Vec::with_capacity(n_tensors); - for _ in 0..n_tensors { - let name = read_str(&mut r)?; - let n_dims = read_u32(&mut r)? as usize; - let dims: Vec = (0..n_dims) - .map(|_| read_u64(&mut r)) - .collect::>()?; - let dtype = read_u32(&mut r)?; - let offset = read_u64(&mut r)?; - tensors.push(TensorInfo { name, dims, dtype, offset }); - } - - // Data starts at the next `alignment`-byte boundary after the header. - let header_end = r.stream_position()?; - let data_start = header_end.div_ceil(alignment) * alignment; - - Ok((tensors, data_start)) -} - -// --------------------------------------------------------------------------- -// Block loader -// --------------------------------------------------------------------------- - -/// Seek to the tensor's data and read its Q4_K blocks into a Vec. -fn load_blocks( - r: &mut (impl Read + Seek), - data_start: u64, - tensor: &TensorInfo, -) -> io::Result> { - r.seek(SeekFrom::Start(data_start + tensor.offset))?; - - let n_blocks = (tensor.n_elements() / QK_K as u64) as usize; - let mut blocks = Vec::with_capacity(n_blocks); - let mut buf = [0u8; BLOCK_BYTES]; - - for _ in 0..n_blocks { - r.read_exact(&mut buf)?; - // Parse field by field — safe, no transmute. - // Layout: d(0..2) dmin(2..4) scales(4..16) qs(16..144) - blocks.push(BlockQ4K { - d: u16::from_le_bytes([buf[0], buf[1]]), - dmin: u16::from_le_bytes([buf[2], buf[3]]), - scales: buf[4..16].try_into().unwrap(), - qs: buf[16..BLOCK_BYTES].try_into().unwrap(), - }); - } - - Ok(blocks) -} - // --------------------------------------------------------------------------- // Random FP16 activation matrix (no external rand dep) // --------------------------------------------------------------------------- -/// Minimal 64-bit LCG (Knuth / PCG constants). struct Lcg(u64); impl Lcg { fn new(seed: u64) -> Self { Self(seed) } - /// Return the next pseudo-random f32 in (−0.05, +0.05). - /// This is a plausible scale for normalised transformer activations. + /// Returns the next pseudo-random f32 in (−0.05, +0.05). fn next_f32(&mut self) -> f32 { self.0 = self.0 .wrapping_mul(6_364_136_223_846_793_005) .wrapping_add(1_442_695_040_888_963_407); - // Map high 32 bits to [0, 1) then shift to (−0.05, +0.05). (self.0 >> 32) as f32 / 4_294_967_296.0 * 0.10 - 0.05 } } @@ -283,8 +49,8 @@ fn f32_to_fp16(f: f32) -> u16 { let sign = ((bits >> 31) as u16) << 15; let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15; let mantissa = (bits & 0x007F_FFFF) >> 13; - if exp <= 0 { return sign; } // underflow → signed zero - if exp >= 31 { return sign | 0x7C00; } // overflow → signed infinity + if exp <= 0 { return sign; } + if exp >= 31 { return sign | 0x7C00; } sign | ((exp as u16) << 10) | mantissa as u16 } @@ -335,23 +101,17 @@ fn main() -> Result<(), Box> { println!("Parsing {path} …"); let (tensors, data_start) = parse_header(path)?; - // List all tensor names and types for context. let n_q4k = tensors.iter().filter(|t| t.dtype == GGML_TYPE_Q4_K).count(); println!(" {} tensors total, {} are Q4_K", tensors.len(), n_q4k); // ── Select the first suitable 2-D Q4_K tensor ─────────────────────────── - // "Suitable" means 2-D and K divisible by QK_K (required for our matmul). let tensor = tensors .iter() - .find(|t| { - t.dtype == GGML_TYPE_Q4_K - && t.dims.len() == 2 - && t.dims[0] % QK_K as u64 == 0 - }) + .find(|t| t.is_usable_q4k()) .ok_or("no suitable 2-D Q4_K tensor found in this GGUF file")?; let (m, k) = tensor.matrix_dims(); - let n_blocks = m * (k / QK_K); + let n_blocks = tensor.n_blocks(); let data_mib = tensor.data_bytes() as f64 / (1u64 << 20) as f64; println!(); @@ -368,9 +128,9 @@ fn main() -> Result<(), Box> { let t0 = Instant::now(); let mut file = BufReader::new(File::open(path)?); let blocks = load_blocks(&mut file, data_start, tensor)?; - println!("{:.3} s ({} blocks × {} B)", t0.elapsed().as_secs_f64(), n_blocks, BLOCK_BYTES); + println!("{:.3} s", t0.elapsed().as_secs_f64()); - // ── Build random activation matrix [K × N] ─────────────────────────────── + // ── Build random activation matrix [K × N] ────────────────────────────── let b_fp16 = random_fp16_matrix(k, n, 0xDEAD_BEEF_CAFE_1234); // ── Baseline matmul (best of `trials`) ────────────────────────────────── @@ -388,26 +148,19 @@ fn main() -> Result<(), Box> { blocks.iter().map(encode).collect() }); - for block in &rle_blocks { - println!("Got value {:?}", block); - for pair in block.qs { - println!("top {} bottom {}", (pair >> 4), (pair & 0b1111)); - } - break; - } - - let n_rle = rle_blocks.iter().filter(|b| b.is_rle()).count(); - let n_raw = n_blocks - n_rle; - let avg_pairs = if n_rle > 0 { + let n_rle = rle_blocks.iter().filter(|b| b.is_rle()).count(); + let n_raw = n_blocks - n_rle; + let avg_pairs = if n_rle > 0 { rle_blocks.iter() .filter(|b| b.is_rle()) .map(|b| b.rle_len() as f64) .sum::() / n_rle as f64 - } else { 0.0 }; - + } else { + 0.0 + }; println!( - "Encode : {:.3} s RLE {n_rle}/{n_blocks} blocks ({:.1}%), \ - raw {n_raw}/{n_blocks} ({:.1}%), avg {avg_pairs:.1} pairs/RLE block", + "Encode : {:.3} s RLE {n_rle}/{n_blocks} ({:.1}%) \ + raw {n_raw}/{n_blocks} ({:.1}%) avg {avg_pairs:.1} pairs/RLE block", t_enc.as_secs_f64(), 100.0 * n_rle as f64 / n_blocks as f64, 100.0 * n_raw as f64 / n_blocks as f64, @@ -438,11 +191,15 @@ fn main() -> Result<(), Box> { // ── Summary ────────────────────────────────────────────────────────────── println!(); - println!("Speedup (matmul only): {:.2}×", t_base.as_secs_f64() / t_rle.as_secs_f64()); - println!("Speedup (matmul + encode once): {:.2}×", - t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64()); + println!( + "Speedup (matmul only): {:.2}×", + t_base.as_secs_f64() / t_rle.as_secs_f64() + ); + println!( + "Speedup (matmul + encode once): {:.2}×", + t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64() + ); - // Show a small slice of the output so it's clear something real happened. let show = n.min(4); print!("First {show} output(s) of row 0: "); for j in 0..show { diff --git a/src/bin/gguf_scan.rs b/src/bin/gguf_scan.rs new file mode 100644 index 0000000..d930496 --- /dev/null +++ b/src/bin/gguf_scan.rs @@ -0,0 +1,197 @@ +//! Scan every Q4_K tensor in a GGUF file and report per-tensor RLE compression +//! statistics, without running a full matrix multiply. +//! +//! Each block is streamed from disk (one 144-byte buffer on the stack) and +//! immediately encoded; no Vec of blocks is retained between tensors. +//! +//! # Usage +//! +//! ```text +//! cargo run --release --bin gguf_scan -- +//! ``` + +use std::{ + env, + error::Error, + fs::File, + io::{self, BufReader, Write}, + time::Instant, +}; + +use matrix_testing::{ + gguf::{for_each_block, parse_header}, + rle::encode, +}; + +// --------------------------------------------------------------------------- +// Per-tensor RLE statistics +// --------------------------------------------------------------------------- + +#[derive(Default)] +struct TensorStats { + n_blocks: usize, + n_rle: usize, + total_pairs: usize, // sum of rle_len() for RLE blocks only + min_pairs: usize, + max_pairs: usize, +} + +impl TensorStats { + fn new() -> Self { + Self { min_pairs: usize::MAX, ..Default::default() } + } + + fn observe(&mut self, is_rle: bool, pairs: usize) { + self.n_blocks += 1; + if is_rle { + self.n_rle += 1; + self.total_pairs += pairs; + self.min_pairs = self.min_pairs.min(pairs); + self.max_pairs = self.max_pairs.max(pairs); + } + } + + fn rle_pct(&self) -> f64 { + if self.n_blocks == 0 { return 0.0; } + 100.0 * self.n_rle as f64 / self.n_blocks as f64 + } + + fn avg_pairs(&self) -> Option { + if self.n_rle == 0 { return None; } + Some(self.total_pairs as f64 / self.n_rle as f64) + } + + fn min_pairs(&self) -> Option { + if self.n_rle == 0 { None } else { Some(self.min_pairs) } + } + + fn max_pairs(&self) -> Option { + if self.n_rle == 0 { None } else { Some(self.max_pairs) } + } +} + +// --------------------------------------------------------------------------- +// Formatting helpers +// --------------------------------------------------------------------------- + +fn fmt_count(n: usize) -> String { + // Insert thousands separators. + let s = n.to_string(); + let mut out = String::new(); + for (i, ch) in s.chars().rev().enumerate() { + if i > 0 && i % 3 == 0 { out.push('_'); } + out.push(ch); + } + out.chars().rev().collect() +} + +/// Truncate or pad `s` to exactly `width` characters. +fn fixed(s: &str, width: usize) -> String { + if s.len() >= width { + format!("{:.width$}", &s[..width]) + } else { + format!("{s: Result<(), Box> { + let args: Vec = env::args().collect(); + if args.len() < 2 { + eprintln!("usage: {} ", args[0]); + std::process::exit(1); + } + let path = &args[1]; + + // ── Parse header ───────────────────────────────────────────────────────── + eprintln!("Parsing {path} …"); + let (tensors, data_start) = parse_header(path)?; + + let q4k_tensors: Vec<_> = tensors + .iter() + .filter(|t| t.is_usable_q4k()) + .collect(); + + let other_count = tensors.len() - q4k_tensors.len(); + eprintln!( + " {} tensors total: {} Q4_K (will scan), {} other (skipped)", + tensors.len(), + q4k_tensors.len(), + other_count, + ); + eprintln!(); + + // ── Header row ─────────────────────────────────────────────────────────── + // Columns: Name(40) | Shape(18) | Blocks(9) | RLE%(7) | AvgPairs(9) | Range(14) + println!( + "{:<40} {:>9} {:>9} {:>6} {:>9} {}", + "Tensor", "Rows", "Cols", "Blocks", "RLE%", "Pairs (avg / min–max)" + ); + println!("{}", "─".repeat(100)); + + // ── Scan each tensor ───────────────────────────────────────────────────── + let mut file = BufReader::new(File::open(path)?); + let scan_start = Instant::now(); + + let mut global_blocks = 0usize; + let mut global_rle = 0usize; + let mut any_rle = false; + + for tensor in &q4k_tensors { + let (m, k) = tensor.matrix_dims(); + let mut stats = TensorStats::new(); + + for_each_block(&mut file, data_start, tensor, |block| { + let rle_block = encode(block); + stats.observe(rle_block.is_rle(), rle_block.rle_len()); + })?; + + global_blocks += stats.n_blocks; + global_rle += stats.n_rle; + if stats.n_rle > 0 { any_rle = true; } + + // Format the pairs column — blank when no RLE blocks found. + let pairs_col = match (stats.avg_pairs(), stats.min_pairs(), stats.max_pairs()) { + (Some(avg), Some(lo), Some(hi)) => format!("{avg:.1} ({lo}–{hi})"), + _ => "—".to_string(), + }; + + println!( + "{} {:>9} {:>9} {:>6} {:>6.1}% {}", + fixed(&tensor.name, 40), + fmt_count(m), + fmt_count(k), + fmt_count(stats.n_blocks), + stats.rle_pct(), + pairs_col, + ); + + // Flush so the user sees progress on slow storage. + let _ = io::stdout().flush(); + } + + // ── Summary ────────────────────────────────────────────────────────────── + let elapsed = scan_start.elapsed(); + println!("{}", "─".repeat(100)); + println!( + "Tensors : {} Blocks scanned: {} RLE blocks: {} ({:.2}%) Time: {:.1} s", + fmt_count(q4k_tensors.len()), + fmt_count(global_blocks), + fmt_count(global_rle), + 100.0 * global_rle as f64 / global_blocks.max(1) as f64, + elapsed.as_secs_f64(), + ); + + if !any_rle { + println!(); + println!("No blocks compressed with RLE — all weights are effectively random at"); + println!("the byte level, which is typical for trained Q4_K quantised weights."); + println!("RLE compression only helps for structured weight matrices (binary,"); + println!("ternary, heavily pruned, or synthetic)."); + } + + Ok(()) +} diff --git a/src/gguf.rs b/src/gguf.rs new file mode 100644 index 0000000..0815564 --- /dev/null +++ b/src/gguf.rs @@ -0,0 +1,329 @@ +//! Minimal GGUF (v2 / v3) file parser. +//! +//! Provides just enough to locate Q4_K tensor data inside a GGUF file and +//! stream its raw blocks without loading the whole file into memory. +//! +//! # GGUF file layout +//! +//! ```text +//! ┌────────────────────────────────────────────────────────────┐ +//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv_pairs u64 │ +//! ├────────────────────────────────────────────────────────────┤ +//! │ metadata key-value pairs (variable length) │ +//! ├────────────────────────────────────────────────────────────┤ +//! │ tensor info records (fixed structure, variable count)│ +//! ├────────────────────────────────────────────────────────────┤ +//! │ padding to `alignment` boundary (default 32 bytes) │ +//! ├────────────────────────────────────────────────────────────┤ +//! │ tensor data — each tensor at its declared offset │ +//! └────────────────────────────────────────────────────────────┘ +//! ``` +//! +//! Each Q4_K block on disk is 144 bytes and is binary-compatible with +//! [`crate::BlockQ4K`]: +//! +//! ```text +//! d(2) + dmin(2) + scales(12) + qs(128) = 144 bytes +//! ``` + +use std::{ + error::Error, + fs::File, + io::{self, BufReader, Read, Seek, SeekFrom}, +}; + +use crate::{BlockQ4K, K_SCALE_SIZE, QK_K}; + +// --------------------------------------------------------------------------- +// Public constants +// --------------------------------------------------------------------------- + +/// File magic: the bytes `b"GGUF"` read as a little-endian `u32`. +pub const GGUF_MAGIC: u32 = 0x4655_4747; + +/// Tensor data alignment used when `general.alignment` is absent. +pub const GGUF_DEFAULT_ALIGNMENT: u64 = 32; + +/// GGML tensor type code for Q4_K (ggml.h `GGML_TYPE_Q4_K`). +pub const GGML_TYPE_Q4_K: u32 = 12; + +/// Raw byte size of one Q4_K block: `d(2) + dmin(2) + scales(12) + qs(128)`. +pub const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144 + +// --------------------------------------------------------------------------- +// GGUF metadata value type tags (gguf-spec §3.2) +// --------------------------------------------------------------------------- + +const GTYPE_U8: u32 = 0; +const GTYPE_I8: u32 = 1; +const GTYPE_U16: u32 = 2; +const GTYPE_I16: u32 = 3; +const GTYPE_U32: u32 = 4; +const GTYPE_I32: u32 = 5; +const GTYPE_F32: u32 = 6; +const GTYPE_BOOL: u32 = 7; +const GTYPE_STR: u32 = 8; +const GTYPE_ARR: u32 = 9; +const GTYPE_U64: u32 = 10; +const GTYPE_I64: u32 = 11; +const GTYPE_F64: u32 = 12; + +// --------------------------------------------------------------------------- +// Private byte-level readers (all little-endian) +// --------------------------------------------------------------------------- + +fn read_u8(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 1]; + r.read_exact(&mut b)?; + Ok(b[0]) +} + +fn read_u16(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 2]; + r.read_exact(&mut b)?; + Ok(u16::from_le_bytes(b)) +} + +fn read_u32(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 4]; + r.read_exact(&mut b)?; + Ok(u32::from_le_bytes(b)) +} + +fn read_u64(r: &mut impl Read) -> io::Result { + let mut b = [0u8; 8]; + r.read_exact(&mut b)?; + Ok(u64::from_le_bytes(b)) +} + +/// Read a GGUF length-prefixed UTF-8 string (`u64` byte count followed by +/// raw bytes; not NUL-terminated). +fn read_str(r: &mut impl Read) -> io::Result { + let len = read_u64(r)? as usize; + let mut buf = vec![0u8; len]; + r.read_exact(&mut buf)?; + Ok(String::from_utf8_lossy(&buf).into_owned()) +} + +/// Consume and discard one GGUF metadata value of type `tag`. +/// +/// Recurses for array elements; does not allocate for scalar types. +fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> { + match tag { + GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL => { read_u8(r)?; } + GTYPE_U16 | GTYPE_I16 => { read_u16(r)?; } + GTYPE_U32 | GTYPE_I32 | GTYPE_F32 => { read_u32(r)?; } + GTYPE_U64 | GTYPE_I64 | GTYPE_F64 => { read_u64(r)?; } + GTYPE_STR => { read_str(r)?; } + GTYPE_ARR => { + let elem_tag = read_u32(r)?; + let count = read_u64(r)?; + for _ in 0..count { + skip_value(r, elem_tag)?; + } + } + t => return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unknown GGUF metadata value type {t}"), + )), + } + Ok(()) +} + +// --------------------------------------------------------------------------- +// TensorInfo +// --------------------------------------------------------------------------- + +/// Metadata for one tensor as recorded in the GGUF header. +#[derive(Debug, Clone)] +pub struct TensorInfo { + /// Tensor name as stored in the file (e.g. `"blk.0.attn_q.weight"`). + pub name: String, + + /// Shape in GGML dimension order: `dims[0]` is the innermost + /// (fastest-varying) axis. + /// + /// For a 2-D weight matrix: + /// - `dims[0]` = K (input features / column count) + /// - `dims[1]` = M (output features / row count) + pub dims: Vec, + + /// GGML type code (e.g. [`GGML_TYPE_Q4_K`] = 12). + pub dtype: u32, + + /// Byte offset of this tensor's data measured **from the start of the + /// data section** (add `data_start` from [`parse_header`] to get the + /// absolute file offset). + pub offset: u64, +} + +impl TensorInfo { + /// Total number of elements across all dimensions. + pub fn n_elements(&self) -> u64 { + self.dims.iter().product() + } + + /// Number of Q4_K blocks (only meaningful when `dtype == GGML_TYPE_Q4_K`). + pub fn n_blocks(&self) -> usize { + (self.n_elements() / QK_K as u64) as usize + } + + /// Raw byte count occupied by this tensor's data on disk. + pub fn data_bytes(&self) -> u64 { + debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K); + self.n_blocks() as u64 * BLOCK_BYTES as u64 + } + + /// Return `(m, k)` matrix dimensions for a 2-D tensor. + /// + /// - `m` = `dims[1]` — number of rows (output features) + /// - `k` = `dims[0]` — number of columns (input features) + pub fn matrix_dims(&self) -> (usize, usize) { + assert_eq!(self.dims.len(), 2, "expected a 2-D tensor"); + (self.dims[1] as usize, self.dims[0] as usize) + } + + /// Returns `true` if this tensor is a 2-D Q4_K matrix whose inner + /// dimension is a multiple of `QK_K` (required by our matmul). + pub fn is_usable_q4k(&self) -> bool { + self.dtype == GGML_TYPE_Q4_K + && self.dims.len() == 2 + && self.dims[0] % QK_K as u64 == 0 + } +} + +// --------------------------------------------------------------------------- +// Header parser +// --------------------------------------------------------------------------- + +/// Open the GGUF file at `path`, parse its header, and return +/// `(tensor_infos, data_start_offset)`. +/// +/// `data_start_offset` is the **absolute** byte position in the file where +/// the first tensor's data begins (after header + alignment padding). +/// +/// # Errors +/// +/// Returns an error if the file is not a valid GGUF file, if I/O fails, or +/// if an unknown metadata value type is encountered. +pub fn parse_header(path: &str) -> Result<(Vec, u64), Box> { + let mut r = BufReader::new(File::open(path)?); + + // Magic number + let magic = read_u32(&mut r)?; + if magic != GGUF_MAGIC { + return Err(format!( + "not a GGUF file (expected {GGUF_MAGIC:#010x}, got {magic:#010x})" + ).into()); + } + + // Version — we support 2 and 3; warn on anything else and try anyway. + let version = read_u32(&mut r)?; + if !(2..=3).contains(&version) { + eprintln!("warning: unexpected GGUF version {version} — proceeding anyway"); + } + + let n_tensors = read_u64(&mut r)? as usize; + let n_metadata = read_u64(&mut r)?; + + // Scan metadata KV pairs; pull out `general.alignment` if present. + let mut alignment = GGUF_DEFAULT_ALIGNMENT; + for _ in 0..n_metadata { + let key = read_str(&mut r)?; + let tag = read_u32(&mut r)?; + if key == "general.alignment" && tag == GTYPE_U32 { + alignment = read_u32(&mut r)? as u64; + } else { + skip_value(&mut r, tag)?; + } + } + + // Tensor info records. + let mut tensors = Vec::with_capacity(n_tensors); + for _ in 0..n_tensors { + let name = read_str(&mut r)?; + let n_dims = read_u32(&mut r)? as usize; + let dims: Vec = (0..n_dims) + .map(|_| read_u64(&mut r)) + .collect::>()?; + let dtype = read_u32(&mut r)?; + let offset = read_u64(&mut r)?; + tensors.push(TensorInfo { name, dims, dtype, offset }); + } + + // Data section starts at the next multiple of `alignment` after the header. + let header_end = r.stream_position()?; + let data_start = header_end.div_ceil(alignment) * alignment; + + Ok((tensors, data_start)) +} + +// --------------------------------------------------------------------------- +// Block readers +// --------------------------------------------------------------------------- + +/// Parse one [`BlockQ4K`] from a 144-byte buffer. +/// +/// The buffer must be in GGUF / GGML on-disk layout: +/// bytes 0–1 = `d`, 2–3 = `dmin`, 4–15 = `scales`, 16–143 = `qs`. +#[inline] +pub fn parse_block(buf: &[u8; BLOCK_BYTES]) -> BlockQ4K { + BlockQ4K { + d: u16::from_le_bytes([buf[0], buf[1]]), + dmin: u16::from_le_bytes([buf[2], buf[3]]), + scales: buf[4..16].try_into().unwrap(), + qs: buf[16..BLOCK_BYTES].try_into().unwrap(), + } +} + +/// Load all Q4_K blocks in `tensor` into a freshly allocated `Vec`. +/// +/// The reader is seeked to the correct file position automatically. +/// `data_start` must be the value returned by [`parse_header`]. +pub fn load_blocks( + r: &mut (impl Read + Seek), + data_start: u64, + tensor: &TensorInfo, +) -> io::Result> { + r.seek(SeekFrom::Start(data_start + tensor.offset))?; + + let n = tensor.n_blocks(); + let mut blocks = Vec::with_capacity(n); + let mut buf = [0u8; BLOCK_BYTES]; + + for _ in 0..n { + r.read_exact(&mut buf)?; + blocks.push(parse_block(&buf)); + } + + Ok(blocks) +} + +/// Iterate over every Q4_K block in `tensor`, calling `f` with a reference +/// to each parsed [`BlockQ4K`] in turn. +/// +/// Only one block-sized buffer (144 bytes) is allocated on the stack; no +/// `Vec` is built. Use this for streaming scans that do not need to retain +/// blocks (e.g. computing statistics or RLE compression ratios). +/// +/// The reader is seeked to the correct file position automatically. +/// `data_start` must be the value returned by [`parse_header`]. +pub fn for_each_block( + r: &mut (impl Read + Seek), + data_start: u64, + tensor: &TensorInfo, + mut f: F, +) -> io::Result<()> +where + F: FnMut(&BlockQ4K), +{ + r.seek(SeekFrom::Start(data_start + tensor.offset))?; + + let mut buf = [0u8; BLOCK_BYTES]; + for _ in 0..tensor.n_blocks() { + r.read_exact(&mut buf)?; + f(&parse_block(&buf)); + } + + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index ded81b3..9063444 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,6 +19,7 @@ //! f32, accumulate dot-products. No SIMD, no tiling, no tricks. pub mod rle; +pub mod gguf; // --------------------------------------------------------------------------- // Constants matching GGML's ggml-common.h