Step 01 — Data Block Writer
Goal
Implement the smallest unit of an SSTable: a data block builder that accumulates entries and emits the on-disk block bytes.
Block format (recap)
[count: u32 LE]
repeat count times (keys ascending within the block):
[klen: u32 LE][vlen: u32 LE][type: u8][key][value]
The block does not carry its own size — the index entry that points to it does.
Encoded entry size
entry_size(klen, vlen) = 9 + klen + vlen # 4 + 4 + 1 + key + value
A block that holds N entries occupies 4 + Σ entry_size.
Flush rule
Track current_size = 4 (the block header) and the buffer separately.
For each candidate entry (k, v):
sz = entry_size(len(k), len(v))
if buffer_non_empty AND current_size + sz > BLOCK_TARGET:
flush() # emit block, capture index entry, reset
push entry
current_size += sz
This rule allows the block to grow up to and including BLOCK_TARGET
bytes but never beyond. A single oversized entry is emitted alone in
its own block (block size grows past the target only when one entry
already exceeds it).
Side-by-side: Rust / Go / C++
Rust
#![allow(unused)] fn main() { const HEADER: usize = 4; fn entry_size(k: usize, v: usize) -> usize { 9 + k + v } struct BlockBuilder { buf: Vec<u8>, count: u32, first_key: Option<Vec<u8>>, } impl BlockBuilder { fn new() -> Self { let mut buf = Vec::with_capacity(BLOCK_TARGET); buf.extend_from_slice(&0u32.to_le_bytes()); // placeholder for count Self { buf, count: 0, first_key: None } } fn current_size(&self) -> usize { self.buf.len() } fn add(&mut self, key: &[u8], ty: u8, value: &[u8]) { if self.count == 0 { self.first_key = Some(key.to_vec()); } self.buf.extend_from_slice(&(key.len() as u32).to_le_bytes()); self.buf.extend_from_slice(&(value.len() as u32).to_le_bytes()); self.buf.push(ty); self.buf.extend_from_slice(key); self.buf.extend_from_slice(value); self.count += 1; } fn finish(mut self) -> (Vec<u8>, Vec<u8>) { self.buf[0..4].copy_from_slice(&self.count.to_le_bytes()); (self.buf, self.first_key.unwrap_or_default()) } } }
Go
type blockBuilder struct {
buf []byte
count uint32
firstKey []byte
}
func newBlock() *blockBuilder {
b := &blockBuilder{buf: make([]byte, 0, blockTarget)}
b.buf = binary.LittleEndian.AppendUint32(b.buf, 0) // placeholder
return b
}
func (b *blockBuilder) currentSize() int { return len(b.buf) }
func (b *blockBuilder) add(key []byte, ty byte, value []byte) {
if b.count == 0 { b.firstKey = append([]byte(nil), key...) }
b.buf = binary.LittleEndian.AppendUint32(b.buf, uint32(len(key)))
b.buf = binary.LittleEndian.AppendUint32(b.buf, uint32(len(value)))
b.buf = append(b.buf, ty)
b.buf = append(b.buf, key...)
b.buf = append(b.buf, value...)
b.count++
}
func (b *blockBuilder) finish() (block, firstKey []byte) {
binary.LittleEndian.PutUint32(b.buf[0:4], b.count)
return b.buf, b.firstKey
}
C++
struct BlockBuilder {
std::vector<uint8_t> buf;
uint32_t count = 0;
std::vector<uint8_t> first_key;
BlockBuilder() {
buf.reserve(kBlockTarget);
put_u32_le(buf, 0); // placeholder
}
size_t current_size() const { return buf.size(); }
void add(const uint8_t* k, size_t klen,
uint8_t ty,
const uint8_t* v, size_t vlen) {
if (count == 0) first_key.assign(k, k + klen);
put_u32_le(buf, uint32_t(klen));
put_u32_le(buf, uint32_t(vlen));
buf.push_back(ty);
buf.insert(buf.end(), k, k + klen);
buf.insert(buf.end(), v, v + vlen);
++count;
}
std::pair<std::vector<uint8_t>, std::vector<uint8_t>> finish() {
std::memcpy(buf.data(), &count, 4); // LE on the platforms we target
return {std::move(buf), std::move(first_key)};
}
};
(For portability, the C++ version uses put_u32_le to patch the count
header too in the real implementation; the memcpy shortcut works on
little-endian hosts but the lab uses the helper.)
Self-check
- Empty
finishreturns(b"\x00\x00\x00\x00", b""). - After three adds the buffer length equals
4 + Σ entry_size. first_keyis captured on the first add and never overwritten.