Limited impact; there does not seem to be many RLE cases in real data
This commit is contained in:
@@ -4,274 +4,40 @@
|
|||||||
//! # Usage
|
//! # Usage
|
||||||
//!
|
//!
|
||||||
//! ```text
|
//! ```text
|
||||||
//! cargo run --release --bin gguf_matmul -- <model.gguf> [n]
|
//! cargo run --release --bin gguf_matmul -- <model.gguf> [n] [trials]
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! `n` is the number of activation columns (token count / batch size).
|
//! `n` is the number of activation columns (token count / batch size).
|
||||||
//! Defaults to 1 (single-token inference).
|
//! Defaults to 1 (single-token inference).
|
||||||
//!
|
|
||||||
//! # GGUF layout (v2 / v3)
|
|
||||||
//!
|
|
||||||
//! ```text
|
|
||||||
//! ┌─────────────────────────────────────────────────────┐
|
|
||||||
//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv u64 │
|
|
||||||
//! ├─────────────────────────────────────────────────────┤
|
|
||||||
//! │ metadata key-value pairs (variable length) │
|
|
||||||
//! ├─────────────────────────────────────────────────────┤
|
|
||||||
//! │ tensor info records (variable length) │
|
|
||||||
//! ├─────────────────────────────────────────────────────┤
|
|
||||||
//! │ padding to `alignment` boundary (default 32 bytes) │
|
|
||||||
//! ├─────────────────────────────────────────────────────┤
|
|
||||||
//! │ tensor data (concatenated, each individually padded)│
|
|
||||||
//! └─────────────────────────────────────────────────────┘
|
|
||||||
//! ```
|
|
||||||
//!
|
|
||||||
//! Each Q4_K block is 144 bytes:
|
|
||||||
//! `d(2) + dmin(2) + scales(12) + qs(128)` — identical to our `BlockQ4K`.
|
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
env,
|
env,
|
||||||
error::Error,
|
error::Error,
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{self, BufReader, Read, Seek, SeekFrom},
|
io::BufReader,
|
||||||
time::{Duration, Instant},
|
time::{Duration, Instant},
|
||||||
};
|
};
|
||||||
|
|
||||||
use matrix_testing::{
|
use matrix_testing::{
|
||||||
matmul_q4k_fp16, BlockQ4K, K_SCALE_SIZE, QK_K,
|
matmul_q4k_fp16,
|
||||||
|
gguf::{parse_header, load_blocks, GGML_TYPE_Q4_K},
|
||||||
rle::{encode, matmul_q4k_rle_fp16, BlockQ4KRle},
|
rle::{encode, matmul_q4k_rle_fp16, BlockQ4KRle},
|
||||||
};
|
};
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// GGUF constants
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/// File magic: bytes b"GGUF" interpreted as a little-endian u32.
|
|
||||||
const GGUF_MAGIC: u32 = 0x4655_4747;
|
|
||||||
|
|
||||||
/// Default tensor data alignment when not overridden by `general.alignment`.
|
|
||||||
const GGUF_DEFAULT_ALIGNMENT: u64 = 32;
|
|
||||||
|
|
||||||
/// GGML tensor type code for Q4_K (matches ggml.h `GGML_TYPE_Q4_K`).
|
|
||||||
const GGML_TYPE_Q4_K: u32 = 12;
|
|
||||||
|
|
||||||
/// Size in bytes of one Q4_K block: d(2) + dmin(2) + scales(12) + qs(128).
|
|
||||||
const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144
|
|
||||||
|
|
||||||
// GGUF metadata value type tags (gguf-spec §3.2).
|
|
||||||
const GTYPE_U8: u32 = 0;
|
|
||||||
const GTYPE_I8: u32 = 1;
|
|
||||||
const GTYPE_U16: u32 = 2;
|
|
||||||
const GTYPE_I16: u32 = 3;
|
|
||||||
const GTYPE_U32: u32 = 4;
|
|
||||||
const GTYPE_I32: u32 = 5;
|
|
||||||
const GTYPE_F32: u32 = 6;
|
|
||||||
const GTYPE_BOOL: u32 = 7;
|
|
||||||
const GTYPE_STR: u32 = 8;
|
|
||||||
const GTYPE_ARR: u32 = 9;
|
|
||||||
const GTYPE_U64: u32 = 10;
|
|
||||||
const GTYPE_I64: u32 = 11;
|
|
||||||
const GTYPE_F64: u32 = 12;
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Primitive binary readers (little-endian, no deps)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
fn read_u8(r: &mut impl Read) -> io::Result<u8> {
|
|
||||||
let mut b = [0u8; 1];
|
|
||||||
r.read_exact(&mut b)?;
|
|
||||||
Ok(b[0])
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_u16(r: &mut impl Read) -> io::Result<u16> {
|
|
||||||
let mut b = [0u8; 2];
|
|
||||||
r.read_exact(&mut b)?;
|
|
||||||
Ok(u16::from_le_bytes(b))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_u32(r: &mut impl Read) -> io::Result<u32> {
|
|
||||||
let mut b = [0u8; 4];
|
|
||||||
r.read_exact(&mut b)?;
|
|
||||||
Ok(u32::from_le_bytes(b))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn read_u64(r: &mut impl Read) -> io::Result<u64> {
|
|
||||||
let mut b = [0u8; 8];
|
|
||||||
r.read_exact(&mut b)?;
|
|
||||||
Ok(u64::from_le_bytes(b))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Read a GGUF length-prefixed UTF-8 string.
|
|
||||||
fn read_str(r: &mut impl Read) -> io::Result<String> {
|
|
||||||
let len = read_u64(r)? as usize;
|
|
||||||
let mut buf = vec![0u8; len];
|
|
||||||
r.read_exact(&mut buf)?;
|
|
||||||
Ok(String::from_utf8_lossy(&buf).into_owned())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Skip one GGUF metadata value of the given type tag without storing it.
|
|
||||||
fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> {
|
|
||||||
match tag {
|
|
||||||
GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL => { read_u8(r)?; }
|
|
||||||
GTYPE_U16 | GTYPE_I16 => { read_u16(r)?; }
|
|
||||||
GTYPE_U32 | GTYPE_I32 | GTYPE_F32 => { read_u32(r)?; }
|
|
||||||
GTYPE_U64 | GTYPE_I64 | GTYPE_F64 => { read_u64(r)?; }
|
|
||||||
GTYPE_STR => { read_str(r)?; }
|
|
||||||
GTYPE_ARR => {
|
|
||||||
let elem_tag = read_u32(r)?;
|
|
||||||
let count = read_u64(r)?;
|
|
||||||
for _ in 0..count {
|
|
||||||
skip_value(r, elem_tag)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
t => return Err(io::Error::new(
|
|
||||||
io::ErrorKind::InvalidData,
|
|
||||||
format!("unknown GGUF value type {t}"),
|
|
||||||
)),
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Tensor info
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
struct TensorInfo {
|
|
||||||
name: String,
|
|
||||||
/// Dimensions in GGML order: dims[0] is the innermost (fastest-varying).
|
|
||||||
/// For a 2-D weight matrix: dims[0] = K (in-features), dims[1] = M (out-features).
|
|
||||||
dims: Vec<u64>,
|
|
||||||
dtype: u32,
|
|
||||||
/// Byte offset of this tensor's data measured from the start of the data
|
|
||||||
/// section (i.e. add `data_start` to get the absolute file offset).
|
|
||||||
offset: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TensorInfo {
|
|
||||||
fn n_elements(&self) -> u64 {
|
|
||||||
self.dims.iter().product()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn data_bytes(&self) -> u64 {
|
|
||||||
debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K);
|
|
||||||
(self.n_elements() / QK_K as u64) * BLOCK_BYTES as u64
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return (m, k) matrix dimensions.
|
|
||||||
/// dims[0] = K (column / inner dim), dims[1] = M (row / outer dim).
|
|
||||||
fn matrix_dims(&self) -> (usize, usize) {
|
|
||||||
assert_eq!(self.dims.len(), 2, "expected 2-D tensor");
|
|
||||||
let k = self.dims[0] as usize;
|
|
||||||
let m = self.dims[1] as usize;
|
|
||||||
(m, k)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// GGUF header parser
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/// Parse the GGUF file header and return `(tensor_infos, data_start_offset)`.
|
|
||||||
///
|
|
||||||
/// `data_start_offset` is the absolute byte position where tensor data begins.
|
|
||||||
fn parse_header(path: &str) -> Result<(Vec<TensorInfo>, u64), Box<dyn Error>> {
|
|
||||||
let mut r = BufReader::new(File::open(path)?);
|
|
||||||
|
|
||||||
// Magic + version
|
|
||||||
let magic = read_u32(&mut r)?;
|
|
||||||
if magic != GGUF_MAGIC {
|
|
||||||
return Err(format!(
|
|
||||||
"not a GGUF file (expected magic {GGUF_MAGIC:#010x}, got {magic:#010x})"
|
|
||||||
).into());
|
|
||||||
}
|
|
||||||
let version = read_u32(&mut r)?;
|
|
||||||
if !(2..=3).contains(&version) {
|
|
||||||
eprintln!("warning: unexpected GGUF version {version} — proceeding anyway");
|
|
||||||
}
|
|
||||||
|
|
||||||
let n_tensors = read_u64(&mut r)? as usize;
|
|
||||||
let n_metadata = read_u64(&mut r)?;
|
|
||||||
|
|
||||||
// Scan metadata KV pairs; capture `general.alignment` if present.
|
|
||||||
let mut alignment = GGUF_DEFAULT_ALIGNMENT;
|
|
||||||
for _ in 0..n_metadata {
|
|
||||||
let key = read_str(&mut r)?;
|
|
||||||
let tag = read_u32(&mut r)?;
|
|
||||||
if key == "general.alignment" && tag == GTYPE_U32 {
|
|
||||||
alignment = read_u32(&mut r)? as u64;
|
|
||||||
} else {
|
|
||||||
skip_value(&mut r, tag)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tensor info records.
|
|
||||||
let mut tensors = Vec::with_capacity(n_tensors);
|
|
||||||
for _ in 0..n_tensors {
|
|
||||||
let name = read_str(&mut r)?;
|
|
||||||
let n_dims = read_u32(&mut r)? as usize;
|
|
||||||
let dims: Vec<u64> = (0..n_dims)
|
|
||||||
.map(|_| read_u64(&mut r))
|
|
||||||
.collect::<io::Result<_>>()?;
|
|
||||||
let dtype = read_u32(&mut r)?;
|
|
||||||
let offset = read_u64(&mut r)?;
|
|
||||||
tensors.push(TensorInfo { name, dims, dtype, offset });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Data starts at the next `alignment`-byte boundary after the header.
|
|
||||||
let header_end = r.stream_position()?;
|
|
||||||
let data_start = header_end.div_ceil(alignment) * alignment;
|
|
||||||
|
|
||||||
Ok((tensors, data_start))
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Block loader
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
/// Seek to the tensor's data and read its Q4_K blocks into a Vec.
|
|
||||||
fn load_blocks(
|
|
||||||
r: &mut (impl Read + Seek),
|
|
||||||
data_start: u64,
|
|
||||||
tensor: &TensorInfo,
|
|
||||||
) -> io::Result<Vec<BlockQ4K>> {
|
|
||||||
r.seek(SeekFrom::Start(data_start + tensor.offset))?;
|
|
||||||
|
|
||||||
let n_blocks = (tensor.n_elements() / QK_K as u64) as usize;
|
|
||||||
let mut blocks = Vec::with_capacity(n_blocks);
|
|
||||||
let mut buf = [0u8; BLOCK_BYTES];
|
|
||||||
|
|
||||||
for _ in 0..n_blocks {
|
|
||||||
r.read_exact(&mut buf)?;
|
|
||||||
// Parse field by field — safe, no transmute.
|
|
||||||
// Layout: d(0..2) dmin(2..4) scales(4..16) qs(16..144)
|
|
||||||
blocks.push(BlockQ4K {
|
|
||||||
d: u16::from_le_bytes([buf[0], buf[1]]),
|
|
||||||
dmin: u16::from_le_bytes([buf[2], buf[3]]),
|
|
||||||
scales: buf[4..16].try_into().unwrap(),
|
|
||||||
qs: buf[16..BLOCK_BYTES].try_into().unwrap(),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(blocks)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Random FP16 activation matrix (no external rand dep)
|
// Random FP16 activation matrix (no external rand dep)
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
/// Minimal 64-bit LCG (Knuth / PCG constants).
|
|
||||||
struct Lcg(u64);
|
struct Lcg(u64);
|
||||||
|
|
||||||
impl Lcg {
|
impl Lcg {
|
||||||
fn new(seed: u64) -> Self { Self(seed) }
|
fn new(seed: u64) -> Self { Self(seed) }
|
||||||
|
|
||||||
/// Return the next pseudo-random f32 in (−0.05, +0.05).
|
/// Returns the next pseudo-random f32 in (−0.05, +0.05).
|
||||||
/// This is a plausible scale for normalised transformer activations.
|
|
||||||
fn next_f32(&mut self) -> f32 {
|
fn next_f32(&mut self) -> f32 {
|
||||||
self.0 = self.0
|
self.0 = self.0
|
||||||
.wrapping_mul(6_364_136_223_846_793_005)
|
.wrapping_mul(6_364_136_223_846_793_005)
|
||||||
.wrapping_add(1_442_695_040_888_963_407);
|
.wrapping_add(1_442_695_040_888_963_407);
|
||||||
// Map high 32 bits to [0, 1) then shift to (−0.05, +0.05).
|
|
||||||
(self.0 >> 32) as f32 / 4_294_967_296.0 * 0.10 - 0.05
|
(self.0 >> 32) as f32 / 4_294_967_296.0 * 0.10 - 0.05
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -283,8 +49,8 @@ fn f32_to_fp16(f: f32) -> u16 {
|
|||||||
let sign = ((bits >> 31) as u16) << 15;
|
let sign = ((bits >> 31) as u16) << 15;
|
||||||
let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15;
|
let exp = ((bits >> 23) & 0xFF) as i32 - 127 + 15;
|
||||||
let mantissa = (bits & 0x007F_FFFF) >> 13;
|
let mantissa = (bits & 0x007F_FFFF) >> 13;
|
||||||
if exp <= 0 { return sign; } // underflow → signed zero
|
if exp <= 0 { return sign; }
|
||||||
if exp >= 31 { return sign | 0x7C00; } // overflow → signed infinity
|
if exp >= 31 { return sign | 0x7C00; }
|
||||||
sign | ((exp as u16) << 10) | mantissa as u16
|
sign | ((exp as u16) << 10) | mantissa as u16
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,23 +101,17 @@ fn main() -> Result<(), Box<dyn Error>> {
|
|||||||
println!("Parsing {path} …");
|
println!("Parsing {path} …");
|
||||||
let (tensors, data_start) = parse_header(path)?;
|
let (tensors, data_start) = parse_header(path)?;
|
||||||
|
|
||||||
// List all tensor names and types for context.
|
|
||||||
let n_q4k = tensors.iter().filter(|t| t.dtype == GGML_TYPE_Q4_K).count();
|
let n_q4k = tensors.iter().filter(|t| t.dtype == GGML_TYPE_Q4_K).count();
|
||||||
println!(" {} tensors total, {} are Q4_K", tensors.len(), n_q4k);
|
println!(" {} tensors total, {} are Q4_K", tensors.len(), n_q4k);
|
||||||
|
|
||||||
// ── Select the first suitable 2-D Q4_K tensor ───────────────────────────
|
// ── Select the first suitable 2-D Q4_K tensor ───────────────────────────
|
||||||
// "Suitable" means 2-D and K divisible by QK_K (required for our matmul).
|
|
||||||
let tensor = tensors
|
let tensor = tensors
|
||||||
.iter()
|
.iter()
|
||||||
.find(|t| {
|
.find(|t| t.is_usable_q4k())
|
||||||
t.dtype == GGML_TYPE_Q4_K
|
|
||||||
&& t.dims.len() == 2
|
|
||||||
&& t.dims[0] % QK_K as u64 == 0
|
|
||||||
})
|
|
||||||
.ok_or("no suitable 2-D Q4_K tensor found in this GGUF file")?;
|
.ok_or("no suitable 2-D Q4_K tensor found in this GGUF file")?;
|
||||||
|
|
||||||
let (m, k) = tensor.matrix_dims();
|
let (m, k) = tensor.matrix_dims();
|
||||||
let n_blocks = m * (k / QK_K);
|
let n_blocks = tensor.n_blocks();
|
||||||
let data_mib = tensor.data_bytes() as f64 / (1u64 << 20) as f64;
|
let data_mib = tensor.data_bytes() as f64 / (1u64 << 20) as f64;
|
||||||
|
|
||||||
println!();
|
println!();
|
||||||
@@ -368,9 +128,9 @@ fn main() -> Result<(), Box<dyn Error>> {
|
|||||||
let t0 = Instant::now();
|
let t0 = Instant::now();
|
||||||
let mut file = BufReader::new(File::open(path)?);
|
let mut file = BufReader::new(File::open(path)?);
|
||||||
let blocks = load_blocks(&mut file, data_start, tensor)?;
|
let blocks = load_blocks(&mut file, data_start, tensor)?;
|
||||||
println!("{:.3} s ({} blocks × {} B)", t0.elapsed().as_secs_f64(), n_blocks, BLOCK_BYTES);
|
println!("{:.3} s", t0.elapsed().as_secs_f64());
|
||||||
|
|
||||||
// ── Build random activation matrix [K × N] ───────────────────────────────
|
// ── Build random activation matrix [K × N] ──────────────────────────────
|
||||||
let b_fp16 = random_fp16_matrix(k, n, 0xDEAD_BEEF_CAFE_1234);
|
let b_fp16 = random_fp16_matrix(k, n, 0xDEAD_BEEF_CAFE_1234);
|
||||||
|
|
||||||
// ── Baseline matmul (best of `trials`) ──────────────────────────────────
|
// ── Baseline matmul (best of `trials`) ──────────────────────────────────
|
||||||
@@ -388,14 +148,6 @@ fn main() -> Result<(), Box<dyn Error>> {
|
|||||||
blocks.iter().map(encode).collect()
|
blocks.iter().map(encode).collect()
|
||||||
});
|
});
|
||||||
|
|
||||||
for block in &rle_blocks {
|
|
||||||
println!("Got value {:?}", block);
|
|
||||||
for pair in block.qs {
|
|
||||||
println!("top {} bottom {}", (pair >> 4), (pair & 0b1111));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
let n_rle = rle_blocks.iter().filter(|b| b.is_rle()).count();
|
let n_rle = rle_blocks.iter().filter(|b| b.is_rle()).count();
|
||||||
let n_raw = n_blocks - n_rle;
|
let n_raw = n_blocks - n_rle;
|
||||||
let avg_pairs = if n_rle > 0 {
|
let avg_pairs = if n_rle > 0 {
|
||||||
@@ -403,11 +155,12 @@ fn main() -> Result<(), Box<dyn Error>> {
|
|||||||
.filter(|b| b.is_rle())
|
.filter(|b| b.is_rle())
|
||||||
.map(|b| b.rle_len() as f64)
|
.map(|b| b.rle_len() as f64)
|
||||||
.sum::<f64>() / n_rle as f64
|
.sum::<f64>() / n_rle as f64
|
||||||
} else { 0.0 };
|
} else {
|
||||||
|
0.0
|
||||||
|
};
|
||||||
println!(
|
println!(
|
||||||
"Encode : {:.3} s RLE {n_rle}/{n_blocks} blocks ({:.1}%), \
|
"Encode : {:.3} s RLE {n_rle}/{n_blocks} ({:.1}%) \
|
||||||
raw {n_raw}/{n_blocks} ({:.1}%), avg {avg_pairs:.1} pairs/RLE block",
|
raw {n_raw}/{n_blocks} ({:.1}%) avg {avg_pairs:.1} pairs/RLE block",
|
||||||
t_enc.as_secs_f64(),
|
t_enc.as_secs_f64(),
|
||||||
100.0 * n_rle as f64 / n_blocks as f64,
|
100.0 * n_rle as f64 / n_blocks as f64,
|
||||||
100.0 * n_raw as f64 / n_blocks as f64,
|
100.0 * n_raw as f64 / n_blocks as f64,
|
||||||
@@ -438,11 +191,15 @@ fn main() -> Result<(), Box<dyn Error>> {
|
|||||||
|
|
||||||
// ── Summary ──────────────────────────────────────────────────────────────
|
// ── Summary ──────────────────────────────────────────────────────────────
|
||||||
println!();
|
println!();
|
||||||
println!("Speedup (matmul only): {:.2}×", t_base.as_secs_f64() / t_rle.as_secs_f64());
|
println!(
|
||||||
println!("Speedup (matmul + encode once): {:.2}×",
|
"Speedup (matmul only): {:.2}×",
|
||||||
t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64());
|
t_base.as_secs_f64() / t_rle.as_secs_f64()
|
||||||
|
);
|
||||||
|
println!(
|
||||||
|
"Speedup (matmul + encode once): {:.2}×",
|
||||||
|
t_base.as_secs_f64() / (t_rle + t_enc).as_secs_f64()
|
||||||
|
);
|
||||||
|
|
||||||
// Show a small slice of the output so it's clear something real happened.
|
|
||||||
let show = n.min(4);
|
let show = n.min(4);
|
||||||
print!("First {show} output(s) of row 0: ");
|
print!("First {show} output(s) of row 0: ");
|
||||||
for j in 0..show {
|
for j in 0..show {
|
||||||
|
|||||||
197
src/bin/gguf_scan.rs
Normal file
197
src/bin/gguf_scan.rs
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
//! Scan every Q4_K tensor in a GGUF file and report per-tensor RLE compression
|
||||||
|
//! statistics, without running a full matrix multiply.
|
||||||
|
//!
|
||||||
|
//! Each block is streamed from disk (one 144-byte buffer on the stack) and
|
||||||
|
//! immediately encoded; no Vec of blocks is retained between tensors.
|
||||||
|
//!
|
||||||
|
//! # Usage
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! cargo run --release --bin gguf_scan -- <model.gguf>
|
||||||
|
//! ```
|
||||||
|
|
||||||
|
use std::{
|
||||||
|
env,
|
||||||
|
error::Error,
|
||||||
|
fs::File,
|
||||||
|
io::{self, BufReader, Write},
|
||||||
|
time::Instant,
|
||||||
|
};
|
||||||
|
|
||||||
|
use matrix_testing::{
|
||||||
|
gguf::{for_each_block, parse_header},
|
||||||
|
rle::encode,
|
||||||
|
};
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Per-tensor RLE statistics
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
struct TensorStats {
|
||||||
|
n_blocks: usize,
|
||||||
|
n_rle: usize,
|
||||||
|
total_pairs: usize, // sum of rle_len() for RLE blocks only
|
||||||
|
min_pairs: usize,
|
||||||
|
max_pairs: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TensorStats {
|
||||||
|
fn new() -> Self {
|
||||||
|
Self { min_pairs: usize::MAX, ..Default::default() }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn observe(&mut self, is_rle: bool, pairs: usize) {
|
||||||
|
self.n_blocks += 1;
|
||||||
|
if is_rle {
|
||||||
|
self.n_rle += 1;
|
||||||
|
self.total_pairs += pairs;
|
||||||
|
self.min_pairs = self.min_pairs.min(pairs);
|
||||||
|
self.max_pairs = self.max_pairs.max(pairs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn rle_pct(&self) -> f64 {
|
||||||
|
if self.n_blocks == 0 { return 0.0; }
|
||||||
|
100.0 * self.n_rle as f64 / self.n_blocks as f64
|
||||||
|
}
|
||||||
|
|
||||||
|
fn avg_pairs(&self) -> Option<f64> {
|
||||||
|
if self.n_rle == 0 { return None; }
|
||||||
|
Some(self.total_pairs as f64 / self.n_rle as f64)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn min_pairs(&self) -> Option<usize> {
|
||||||
|
if self.n_rle == 0 { None } else { Some(self.min_pairs) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn max_pairs(&self) -> Option<usize> {
|
||||||
|
if self.n_rle == 0 { None } else { Some(self.max_pairs) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Formatting helpers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn fmt_count(n: usize) -> String {
|
||||||
|
// Insert thousands separators.
|
||||||
|
let s = n.to_string();
|
||||||
|
let mut out = String::new();
|
||||||
|
for (i, ch) in s.chars().rev().enumerate() {
|
||||||
|
if i > 0 && i % 3 == 0 { out.push('_'); }
|
||||||
|
out.push(ch);
|
||||||
|
}
|
||||||
|
out.chars().rev().collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Truncate or pad `s` to exactly `width` characters.
|
||||||
|
fn fixed(s: &str, width: usize) -> String {
|
||||||
|
if s.len() >= width {
|
||||||
|
format!("{:.width$}", &s[..width])
|
||||||
|
} else {
|
||||||
|
format!("{s:<width$}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Main
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn main() -> Result<(), Box<dyn Error>> {
|
||||||
|
let args: Vec<String> = env::args().collect();
|
||||||
|
if args.len() < 2 {
|
||||||
|
eprintln!("usage: {} <model.gguf>", args[0]);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
let path = &args[1];
|
||||||
|
|
||||||
|
// ── Parse header ─────────────────────────────────────────────────────────
|
||||||
|
eprintln!("Parsing {path} …");
|
||||||
|
let (tensors, data_start) = parse_header(path)?;
|
||||||
|
|
||||||
|
let q4k_tensors: Vec<_> = tensors
|
||||||
|
.iter()
|
||||||
|
.filter(|t| t.is_usable_q4k())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let other_count = tensors.len() - q4k_tensors.len();
|
||||||
|
eprintln!(
|
||||||
|
" {} tensors total: {} Q4_K (will scan), {} other (skipped)",
|
||||||
|
tensors.len(),
|
||||||
|
q4k_tensors.len(),
|
||||||
|
other_count,
|
||||||
|
);
|
||||||
|
eprintln!();
|
||||||
|
|
||||||
|
// ── Header row ───────────────────────────────────────────────────────────
|
||||||
|
// Columns: Name(40) | Shape(18) | Blocks(9) | RLE%(7) | AvgPairs(9) | Range(14)
|
||||||
|
println!(
|
||||||
|
"{:<40} {:>9} {:>9} {:>6} {:>9} {}",
|
||||||
|
"Tensor", "Rows", "Cols", "Blocks", "RLE%", "Pairs (avg / min–max)"
|
||||||
|
);
|
||||||
|
println!("{}", "─".repeat(100));
|
||||||
|
|
||||||
|
// ── Scan each tensor ─────────────────────────────────────────────────────
|
||||||
|
let mut file = BufReader::new(File::open(path)?);
|
||||||
|
let scan_start = Instant::now();
|
||||||
|
|
||||||
|
let mut global_blocks = 0usize;
|
||||||
|
let mut global_rle = 0usize;
|
||||||
|
let mut any_rle = false;
|
||||||
|
|
||||||
|
for tensor in &q4k_tensors {
|
||||||
|
let (m, k) = tensor.matrix_dims();
|
||||||
|
let mut stats = TensorStats::new();
|
||||||
|
|
||||||
|
for_each_block(&mut file, data_start, tensor, |block| {
|
||||||
|
let rle_block = encode(block);
|
||||||
|
stats.observe(rle_block.is_rle(), rle_block.rle_len());
|
||||||
|
})?;
|
||||||
|
|
||||||
|
global_blocks += stats.n_blocks;
|
||||||
|
global_rle += stats.n_rle;
|
||||||
|
if stats.n_rle > 0 { any_rle = true; }
|
||||||
|
|
||||||
|
// Format the pairs column — blank when no RLE blocks found.
|
||||||
|
let pairs_col = match (stats.avg_pairs(), stats.min_pairs(), stats.max_pairs()) {
|
||||||
|
(Some(avg), Some(lo), Some(hi)) => format!("{avg:.1} ({lo}–{hi})"),
|
||||||
|
_ => "—".to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
println!(
|
||||||
|
"{} {:>9} {:>9} {:>6} {:>6.1}% {}",
|
||||||
|
fixed(&tensor.name, 40),
|
||||||
|
fmt_count(m),
|
||||||
|
fmt_count(k),
|
||||||
|
fmt_count(stats.n_blocks),
|
||||||
|
stats.rle_pct(),
|
||||||
|
pairs_col,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Flush so the user sees progress on slow storage.
|
||||||
|
let _ = io::stdout().flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Summary ──────────────────────────────────────────────────────────────
|
||||||
|
let elapsed = scan_start.elapsed();
|
||||||
|
println!("{}", "─".repeat(100));
|
||||||
|
println!(
|
||||||
|
"Tensors : {} Blocks scanned: {} RLE blocks: {} ({:.2}%) Time: {:.1} s",
|
||||||
|
fmt_count(q4k_tensors.len()),
|
||||||
|
fmt_count(global_blocks),
|
||||||
|
fmt_count(global_rle),
|
||||||
|
100.0 * global_rle as f64 / global_blocks.max(1) as f64,
|
||||||
|
elapsed.as_secs_f64(),
|
||||||
|
);
|
||||||
|
|
||||||
|
if !any_rle {
|
||||||
|
println!();
|
||||||
|
println!("No blocks compressed with RLE — all weights are effectively random at");
|
||||||
|
println!("the byte level, which is typical for trained Q4_K quantised weights.");
|
||||||
|
println!("RLE compression only helps for structured weight matrices (binary,");
|
||||||
|
println!("ternary, heavily pruned, or synthetic).");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
329
src/gguf.rs
Normal file
329
src/gguf.rs
Normal file
@@ -0,0 +1,329 @@
|
|||||||
|
//! Minimal GGUF (v2 / v3) file parser.
|
||||||
|
//!
|
||||||
|
//! Provides just enough to locate Q4_K tensor data inside a GGUF file and
|
||||||
|
//! stream its raw blocks without loading the whole file into memory.
|
||||||
|
//!
|
||||||
|
//! # GGUF file layout
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! ┌────────────────────────────────────────────────────────────┐
|
||||||
|
//! │ magic u32 │ version u32 │ n_tensors u64 │ n_kv_pairs u64 │
|
||||||
|
//! ├────────────────────────────────────────────────────────────┤
|
||||||
|
//! │ metadata key-value pairs (variable length) │
|
||||||
|
//! ├────────────────────────────────────────────────────────────┤
|
||||||
|
//! │ tensor info records (fixed structure, variable count)│
|
||||||
|
//! ├────────────────────────────────────────────────────────────┤
|
||||||
|
//! │ padding to `alignment` boundary (default 32 bytes) │
|
||||||
|
//! ├────────────────────────────────────────────────────────────┤
|
||||||
|
//! │ tensor data — each tensor at its declared offset │
|
||||||
|
//! └────────────────────────────────────────────────────────────┘
|
||||||
|
//! ```
|
||||||
|
//!
|
||||||
|
//! Each Q4_K block on disk is 144 bytes and is binary-compatible with
|
||||||
|
//! [`crate::BlockQ4K`]:
|
||||||
|
//!
|
||||||
|
//! ```text
|
||||||
|
//! d(2) + dmin(2) + scales(12) + qs(128) = 144 bytes
|
||||||
|
//! ```
|
||||||
|
|
||||||
|
use std::{
|
||||||
|
error::Error,
|
||||||
|
fs::File,
|
||||||
|
io::{self, BufReader, Read, Seek, SeekFrom},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{BlockQ4K, K_SCALE_SIZE, QK_K};
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Public constants
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// File magic: the bytes `b"GGUF"` read as a little-endian `u32`.
|
||||||
|
pub const GGUF_MAGIC: u32 = 0x4655_4747;
|
||||||
|
|
||||||
|
/// Tensor data alignment used when `general.alignment` is absent.
|
||||||
|
pub const GGUF_DEFAULT_ALIGNMENT: u64 = 32;
|
||||||
|
|
||||||
|
/// GGML tensor type code for Q4_K (ggml.h `GGML_TYPE_Q4_K`).
|
||||||
|
pub const GGML_TYPE_Q4_K: u32 = 12;
|
||||||
|
|
||||||
|
/// Raw byte size of one Q4_K block: `d(2) + dmin(2) + scales(12) + qs(128)`.
|
||||||
|
pub const BLOCK_BYTES: usize = 2 + 2 + K_SCALE_SIZE + QK_K / 2; // 144
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// GGUF metadata value type tags (gguf-spec §3.2)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
const GTYPE_U8: u32 = 0;
|
||||||
|
const GTYPE_I8: u32 = 1;
|
||||||
|
const GTYPE_U16: u32 = 2;
|
||||||
|
const GTYPE_I16: u32 = 3;
|
||||||
|
const GTYPE_U32: u32 = 4;
|
||||||
|
const GTYPE_I32: u32 = 5;
|
||||||
|
const GTYPE_F32: u32 = 6;
|
||||||
|
const GTYPE_BOOL: u32 = 7;
|
||||||
|
const GTYPE_STR: u32 = 8;
|
||||||
|
const GTYPE_ARR: u32 = 9;
|
||||||
|
const GTYPE_U64: u32 = 10;
|
||||||
|
const GTYPE_I64: u32 = 11;
|
||||||
|
const GTYPE_F64: u32 = 12;
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Private byte-level readers (all little-endian)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
fn read_u8(r: &mut impl Read) -> io::Result<u8> {
|
||||||
|
let mut b = [0u8; 1];
|
||||||
|
r.read_exact(&mut b)?;
|
||||||
|
Ok(b[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_u16(r: &mut impl Read) -> io::Result<u16> {
|
||||||
|
let mut b = [0u8; 2];
|
||||||
|
r.read_exact(&mut b)?;
|
||||||
|
Ok(u16::from_le_bytes(b))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_u32(r: &mut impl Read) -> io::Result<u32> {
|
||||||
|
let mut b = [0u8; 4];
|
||||||
|
r.read_exact(&mut b)?;
|
||||||
|
Ok(u32::from_le_bytes(b))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_u64(r: &mut impl Read) -> io::Result<u64> {
|
||||||
|
let mut b = [0u8; 8];
|
||||||
|
r.read_exact(&mut b)?;
|
||||||
|
Ok(u64::from_le_bytes(b))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a GGUF length-prefixed UTF-8 string (`u64` byte count followed by
|
||||||
|
/// raw bytes; not NUL-terminated).
|
||||||
|
fn read_str(r: &mut impl Read) -> io::Result<String> {
|
||||||
|
let len = read_u64(r)? as usize;
|
||||||
|
let mut buf = vec![0u8; len];
|
||||||
|
r.read_exact(&mut buf)?;
|
||||||
|
Ok(String::from_utf8_lossy(&buf).into_owned())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume and discard one GGUF metadata value of type `tag`.
|
||||||
|
///
|
||||||
|
/// Recurses for array elements; does not allocate for scalar types.
|
||||||
|
fn skip_value(r: &mut impl Read, tag: u32) -> io::Result<()> {
|
||||||
|
match tag {
|
||||||
|
GTYPE_U8 | GTYPE_I8 | GTYPE_BOOL => { read_u8(r)?; }
|
||||||
|
GTYPE_U16 | GTYPE_I16 => { read_u16(r)?; }
|
||||||
|
GTYPE_U32 | GTYPE_I32 | GTYPE_F32 => { read_u32(r)?; }
|
||||||
|
GTYPE_U64 | GTYPE_I64 | GTYPE_F64 => { read_u64(r)?; }
|
||||||
|
GTYPE_STR => { read_str(r)?; }
|
||||||
|
GTYPE_ARR => {
|
||||||
|
let elem_tag = read_u32(r)?;
|
||||||
|
let count = read_u64(r)?;
|
||||||
|
for _ in 0..count {
|
||||||
|
skip_value(r, elem_tag)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
t => return Err(io::Error::new(
|
||||||
|
io::ErrorKind::InvalidData,
|
||||||
|
format!("unknown GGUF metadata value type {t}"),
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// TensorInfo
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Metadata for one tensor as recorded in the GGUF header.
///
/// All fields are copied verbatim from the header by [`parse_header`];
/// no validation is applied at construction time.
#[derive(Debug, Clone)]
pub struct TensorInfo {
    /// Tensor name as stored in the file (e.g. `"blk.0.attn_q.weight"`).
    pub name: String,

    /// Shape in GGML dimension order: `dims[0]` is the innermost
    /// (fastest-varying) axis.
    ///
    /// For a 2-D weight matrix:
    /// - `dims[0]` = K (input features / column count)
    /// - `dims[1]` = M (output features / row count)
    pub dims: Vec<u64>,

    /// GGML type code (e.g. [`GGML_TYPE_Q4_K`] = 12). Stored as read from
    /// the header; unknown codes are not rejected here.
    pub dtype: u32,

    /// Byte offset of this tensor's data measured **from the start of the
    /// data section** (add `data_start` from [`parse_header`] to get the
    /// absolute file offset).
    pub offset: u64,
}
|
||||||
|
|
||||||
|
impl TensorInfo {
|
||||||
|
/// Total number of elements across all dimensions.
|
||||||
|
pub fn n_elements(&self) -> u64 {
|
||||||
|
self.dims.iter().product()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Number of Q4_K blocks (only meaningful when `dtype == GGML_TYPE_Q4_K`).
|
||||||
|
pub fn n_blocks(&self) -> usize {
|
||||||
|
(self.n_elements() / QK_K as u64) as usize
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Raw byte count occupied by this tensor's data on disk.
|
||||||
|
pub fn data_bytes(&self) -> u64 {
|
||||||
|
debug_assert_eq!(self.dtype, GGML_TYPE_Q4_K);
|
||||||
|
self.n_blocks() as u64 * BLOCK_BYTES as u64
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return `(m, k)` matrix dimensions for a 2-D tensor.
|
||||||
|
///
|
||||||
|
/// - `m` = `dims[1]` — number of rows (output features)
|
||||||
|
/// - `k` = `dims[0]` — number of columns (input features)
|
||||||
|
pub fn matrix_dims(&self) -> (usize, usize) {
|
||||||
|
assert_eq!(self.dims.len(), 2, "expected a 2-D tensor");
|
||||||
|
(self.dims[1] as usize, self.dims[0] as usize)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if this tensor is a 2-D Q4_K matrix whose inner
|
||||||
|
/// dimension is a multiple of `QK_K` (required by our matmul).
|
||||||
|
pub fn is_usable_q4k(&self) -> bool {
|
||||||
|
self.dtype == GGML_TYPE_Q4_K
|
||||||
|
&& self.dims.len() == 2
|
||||||
|
&& self.dims[0] % QK_K as u64 == 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Header parser
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Open the GGUF file at `path`, parse its header, and return
|
||||||
|
/// `(tensor_infos, data_start_offset)`.
|
||||||
|
///
|
||||||
|
/// `data_start_offset` is the **absolute** byte position in the file where
|
||||||
|
/// the first tensor's data begins (after header + alignment padding).
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
///
|
||||||
|
/// Returns an error if the file is not a valid GGUF file, if I/O fails, or
|
||||||
|
/// if an unknown metadata value type is encountered.
|
||||||
|
pub fn parse_header(path: &str) -> Result<(Vec<TensorInfo>, u64), Box<dyn Error>> {
|
||||||
|
let mut r = BufReader::new(File::open(path)?);
|
||||||
|
|
||||||
|
// Magic number
|
||||||
|
let magic = read_u32(&mut r)?;
|
||||||
|
if magic != GGUF_MAGIC {
|
||||||
|
return Err(format!(
|
||||||
|
"not a GGUF file (expected {GGUF_MAGIC:#010x}, got {magic:#010x})"
|
||||||
|
).into());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Version — we support 2 and 3; warn on anything else and try anyway.
|
||||||
|
let version = read_u32(&mut r)?;
|
||||||
|
if !(2..=3).contains(&version) {
|
||||||
|
eprintln!("warning: unexpected GGUF version {version} — proceeding anyway");
|
||||||
|
}
|
||||||
|
|
||||||
|
let n_tensors = read_u64(&mut r)? as usize;
|
||||||
|
let n_metadata = read_u64(&mut r)?;
|
||||||
|
|
||||||
|
// Scan metadata KV pairs; pull out `general.alignment` if present.
|
||||||
|
let mut alignment = GGUF_DEFAULT_ALIGNMENT;
|
||||||
|
for _ in 0..n_metadata {
|
||||||
|
let key = read_str(&mut r)?;
|
||||||
|
let tag = read_u32(&mut r)?;
|
||||||
|
if key == "general.alignment" && tag == GTYPE_U32 {
|
||||||
|
alignment = read_u32(&mut r)? as u64;
|
||||||
|
} else {
|
||||||
|
skip_value(&mut r, tag)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tensor info records.
|
||||||
|
let mut tensors = Vec::with_capacity(n_tensors);
|
||||||
|
for _ in 0..n_tensors {
|
||||||
|
let name = read_str(&mut r)?;
|
||||||
|
let n_dims = read_u32(&mut r)? as usize;
|
||||||
|
let dims: Vec<u64> = (0..n_dims)
|
||||||
|
.map(|_| read_u64(&mut r))
|
||||||
|
.collect::<io::Result<_>>()?;
|
||||||
|
let dtype = read_u32(&mut r)?;
|
||||||
|
let offset = read_u64(&mut r)?;
|
||||||
|
tensors.push(TensorInfo { name, dims, dtype, offset });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Data section starts at the next multiple of `alignment` after the header.
|
||||||
|
let header_end = r.stream_position()?;
|
||||||
|
let data_start = header_end.div_ceil(alignment) * alignment;
|
||||||
|
|
||||||
|
Ok((tensors, data_start))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Block readers
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Parse one [`BlockQ4K`] from a 144-byte buffer.
|
||||||
|
///
|
||||||
|
/// The buffer must be in GGUF / GGML on-disk layout:
|
||||||
|
/// bytes 0–1 = `d`, 2–3 = `dmin`, 4–15 = `scales`, 16–143 = `qs`.
|
||||||
|
#[inline]
|
||||||
|
pub fn parse_block(buf: &[u8; BLOCK_BYTES]) -> BlockQ4K {
|
||||||
|
BlockQ4K {
|
||||||
|
d: u16::from_le_bytes([buf[0], buf[1]]),
|
||||||
|
dmin: u16::from_le_bytes([buf[2], buf[3]]),
|
||||||
|
scales: buf[4..16].try_into().unwrap(),
|
||||||
|
qs: buf[16..BLOCK_BYTES].try_into().unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load all Q4_K blocks in `tensor` into a freshly allocated `Vec`.
|
||||||
|
///
|
||||||
|
/// The reader is seeked to the correct file position automatically.
|
||||||
|
/// `data_start` must be the value returned by [`parse_header`].
|
||||||
|
pub fn load_blocks(
|
||||||
|
r: &mut (impl Read + Seek),
|
||||||
|
data_start: u64,
|
||||||
|
tensor: &TensorInfo,
|
||||||
|
) -> io::Result<Vec<BlockQ4K>> {
|
||||||
|
r.seek(SeekFrom::Start(data_start + tensor.offset))?;
|
||||||
|
|
||||||
|
let n = tensor.n_blocks();
|
||||||
|
let mut blocks = Vec::with_capacity(n);
|
||||||
|
let mut buf = [0u8; BLOCK_BYTES];
|
||||||
|
|
||||||
|
for _ in 0..n {
|
||||||
|
r.read_exact(&mut buf)?;
|
||||||
|
blocks.push(parse_block(&buf));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(blocks)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over every Q4_K block in `tensor`, calling `f` with a reference
|
||||||
|
/// to each parsed [`BlockQ4K`] in turn.
|
||||||
|
///
|
||||||
|
/// Only one block-sized buffer (144 bytes) is allocated on the stack; no
|
||||||
|
/// `Vec` is built. Use this for streaming scans that do not need to retain
|
||||||
|
/// blocks (e.g. computing statistics or RLE compression ratios).
|
||||||
|
///
|
||||||
|
/// The reader is seeked to the correct file position automatically.
|
||||||
|
/// `data_start` must be the value returned by [`parse_header`].
|
||||||
|
pub fn for_each_block<F>(
|
||||||
|
r: &mut (impl Read + Seek),
|
||||||
|
data_start: u64,
|
||||||
|
tensor: &TensorInfo,
|
||||||
|
mut f: F,
|
||||||
|
) -> io::Result<()>
|
||||||
|
where
|
||||||
|
F: FnMut(&BlockQ4K),
|
||||||
|
{
|
||||||
|
r.seek(SeekFrom::Start(data_start + tensor.offset))?;
|
||||||
|
|
||||||
|
let mut buf = [0u8; BLOCK_BYTES];
|
||||||
|
for _ in 0..tensor.n_blocks() {
|
||||||
|
r.read_exact(&mut buf)?;
|
||||||
|
f(&parse_block(&buf));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@@ -19,6 +19,7 @@
|
|||||||
//! f32, accumulate dot-products. No SIMD, no tiling, no tricks.
|
//! f32, accumulate dot-products. No SIMD, no tiling, no tricks.
|
||||||
|
|
||||||
pub mod rle;
|
pub mod rle;
|
||||||
|
pub mod gguf;
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// Constants matching GGML's ggml-common.h
|
// Constants matching GGML's ggml-common.h
|
||||||
|
|||||||
Reference in New Issue
Block a user