Step 01 — Data Block Writer

Goal

Implement the smallest unit of an SSTable: a data block builder that accumulates entries and emits the on-disk block bytes.

Block format (recap)

[count: u32 LE]
repeat count times (keys ascending within the block):
  [klen: u32 LE][vlen: u32 LE][type: u8][key][value]

The block does not carry its own size — the index entry that points to it does.

Encoded entry size

entry_size(klen, vlen) = 9 + klen + vlen   # 4 + 4 + 1 + key + value

A block that holds N entries occupies 4 + Σ entry_size.

Flush rule

Track current_size = 4 (the block header) and the buffer separately. For each candidate entry (k, v):

sz = entry_size(len(k), len(v))
if buffer_non_empty AND current_size + sz > BLOCK_TARGET:
    flush()                # emit block, capture index entry, reset
push entry
current_size += sz

This rule allows the block to grow up to and including BLOCK_TARGET bytes but never beyond. A single oversized entry is emitted alone in its own block (block size grows past the target only when one entry already exceeds it).

Side-by-side: Rust / Go / C++

Rust

#![allow(unused)]
fn main() {
const HEADER: usize = 4;
fn entry_size(k: usize, v: usize) -> usize { 9 + k + v }

struct BlockBuilder {
    buf: Vec<u8>,
    count: u32,
    first_key: Option<Vec<u8>>,
}

impl BlockBuilder {
    fn new() -> Self {
        let mut buf = Vec::with_capacity(BLOCK_TARGET);
        buf.extend_from_slice(&0u32.to_le_bytes()); // placeholder for count
        Self { buf, count: 0, first_key: None }
    }
    fn current_size(&self) -> usize { self.buf.len() }
    fn add(&mut self, key: &[u8], ty: u8, value: &[u8]) {
        if self.count == 0 { self.first_key = Some(key.to_vec()); }
        self.buf.extend_from_slice(&(key.len() as u32).to_le_bytes());
        self.buf.extend_from_slice(&(value.len() as u32).to_le_bytes());
        self.buf.push(ty);
        self.buf.extend_from_slice(key);
        self.buf.extend_from_slice(value);
        self.count += 1;
    }
    fn finish(mut self) -> (Vec<u8>, Vec<u8>) {
        self.buf[0..4].copy_from_slice(&self.count.to_le_bytes());
        (self.buf, self.first_key.unwrap_or_default())
    }
}
}

Go

type blockBuilder struct {
    buf      []byte
    count    uint32
    firstKey []byte
}

func newBlock() *blockBuilder {
    b := &blockBuilder{buf: make([]byte, 0, blockTarget)}
    b.buf = binary.LittleEndian.AppendUint32(b.buf, 0) // placeholder
    return b
}
func (b *blockBuilder) currentSize() int { return len(b.buf) }
func (b *blockBuilder) add(key []byte, ty byte, value []byte) {
    if b.count == 0 { b.firstKey = append([]byte(nil), key...) }
    b.buf = binary.LittleEndian.AppendUint32(b.buf, uint32(len(key)))
    b.buf = binary.LittleEndian.AppendUint32(b.buf, uint32(len(value)))
    b.buf = append(b.buf, ty)
    b.buf = append(b.buf, key...)
    b.buf = append(b.buf, value...)
    b.count++
}
func (b *blockBuilder) finish() (block, firstKey []byte) {
    binary.LittleEndian.PutUint32(b.buf[0:4], b.count)
    return b.buf, b.firstKey
}

C++

struct BlockBuilder {
    std::vector<uint8_t> buf;
    uint32_t count = 0;
    std::vector<uint8_t> first_key;

    BlockBuilder() {
        buf.reserve(kBlockTarget);
        put_u32_le(buf, 0);                // placeholder
    }
    size_t current_size() const { return buf.size(); }
    void add(const uint8_t* k, size_t klen,
             uint8_t ty,
             const uint8_t* v, size_t vlen) {
        if (count == 0) first_key.assign(k, k + klen);
        put_u32_le(buf, uint32_t(klen));
        put_u32_le(buf, uint32_t(vlen));
        buf.push_back(ty);
        buf.insert(buf.end(), k, k + klen);
        buf.insert(buf.end(), v, v + vlen);
        ++count;
    }
    std::pair<std::vector<uint8_t>, std::vector<uint8_t>> finish() {
        std::memcpy(buf.data(), &count, 4); // LE on the platforms we target
        return {std::move(buf), std::move(first_key)};
    }
};

(For portability, the C++ version uses put_u32_le to patch the count header too in the real implementation; the memcpy shortcut works on little-endian hosts but the lab uses the helper.)

Self-check

  • Empty finish returns (b"\x00\x00\x00\x00", b"").
  • After three adds the buffer length equals 4 + Σ entry_size.
  • first_key is captured on the first add and never overwritten.