Step 02 — Encode / Decode the dump

Serialize the MemTable to a byte-identical on-disk layout and parse it back.

Layout (recap from analysis.md)

  magic   "MMT1"            (4 bytes)
  count   u32 LE            (4 bytes)
  repeat count times, in ascending key order:
      klen   u32 LE
      vlen   u32 LE         (0 for tombstone)
      type   u8             (0 = Value, 1 = Tombstone)
      key    klen bytes
      value  vlen bytes

Rust

#![allow(unused)]
fn main() {
pub fn encode(&self) -> Vec<u8> {
    let mut out = Vec::with_capacity(self.size_bytes());
    out.extend_from_slice(b"MMT1");
    out.extend_from_slice(&(self.map.len() as u32).to_le_bytes());
    for (k, e) in &self.map {
        let (vlen, t, v) = match e {
            Entry::Value(v) => (v.len() as u32, 0u8, v.as_slice()),
            Entry::Tombstone => (0, 1, &[][..]),
        };
        out.extend_from_slice(&(k.len() as u32).to_le_bytes());
        out.extend_from_slice(&vlen.to_le_bytes());
        out.push(t);
        out.extend_from_slice(k);
        out.extend_from_slice(v);
    }
    out
}

pub fn decode(bytes: &[u8]) -> Result<Self, Error> {
    if bytes.len() < 8 { return Err(Error::Short); }
    if &bytes[..4] != b"MMT1" { return Err(Error::BadMagic); }
    let count = u32::from_le_bytes(bytes[4..8].try_into().unwrap()) as usize;
    let mut p = 8usize;
    let mut t = MemTable::new();
    let mut prev: Option<Vec<u8>> = None;
    for _ in 0..count {
        if p + 9 > bytes.len() { return Err(Error::Short); }
        let klen = u32::from_le_bytes(bytes[p..p+4].try_into().unwrap()) as usize;
        let vlen = u32::from_le_bytes(bytes[p+4..p+8].try_into().unwrap()) as usize;
        let ty = bytes[p+8];
        p += 9;
        if p + klen + vlen > bytes.len() { return Err(Error::Short); }
        let key = bytes[p..p+klen].to_vec();
        p += klen;
        let val = bytes[p..p+vlen].to_vec();
        p += vlen;
        if let Some(ref pk) = prev {
            if key.as_slice() <= pk.as_slice() { return Err(Error::Unsorted); }
        }
        let entry = match ty {
            0 => { Entry::Value(val) }
            1 => { if vlen != 0 { return Err(Error::BadTombstone); } Entry::Tombstone }
            _ => return Err(Error::BadType),
        };
        prev = Some(key.clone());
        t.insert_raw(key, entry);
    }
    if p != bytes.len() { return Err(Error::Trailing); }
    Ok(t)
}
}

Go

func (t *MemTable) Encode() []byte {
    out := make([]byte, 0, t.SizeBytes())
    out = append(out, 'M', 'M', 'T', '1')
    out = binary.LittleEndian.AppendUint32(out, uint32(len(t.m)))
    for _, k := range t.sortedKeys() {
        e := t.m[k]
        out = binary.LittleEndian.AppendUint32(out, uint32(len(k)))
        out = binary.LittleEndian.AppendUint32(out, uint32(len(e.Value)))
        out = append(out, byte(e.Type))
        out = append(out, k...)
        out = append(out, e.Value...)
    }
    return out
}

Decode mirrors the Rust shape: read header, then loop reading klen, vlen, type, key, value, validating ascending key order and rejecting trailing bytes.

C++

std::vector<std::uint8_t> MemTable::Encode() const {
    std::vector<std::uint8_t> out;
    out.reserve(SizeBytes());
    static constexpr std::uint8_t magic[4] = {'M','M','T','1'};
    out.insert(out.end(), magic, magic + 4);
    PutU32LE(out, static_cast<std::uint32_t>(map_.size()));
    for (auto const& [k, e] : map_) {
        PutU32LE(out, static_cast<std::uint32_t>(k.size()));
        PutU32LE(out, static_cast<std::uint32_t>(e.value.size()));
        out.push_back(static_cast<std::uint8_t>(e.type));
        out.insert(out.end(), k.begin(), k.end());
        out.insert(out.end(), e.value.begin(), e.value.end());
    }
    return out;
}

What the decoder must reject

InputWhy it must fail
< 8 bytesheader truncated
magic XXXXbad format
count says 5 but only 3 entries fittruncated body
type byte 2unknown type
tombstone with vlen != 0malformed
keys not strictly ascendingviolates order invariant
trailing bytes after last entrycorruption

How to spot-check

After encoding a 2-entry MemTable (alpha=first, beta tombstoned):

xxd /tmp/m.bin | head
00000000: 4d4d 5431 0200 0000 0500 0000 0500 0000  MMT1............
00000010: 0061 6c70 6861 6669 7273 7404 0000 0000  .alphafirst.....
00000020: 0000 0001 6265 7461                      ....beta

Three things to verify by eye:

  1. 4d4d5431 = MMT1.
  2. 0200 0000 = count of 2 (LE).
  3. The tombstone's vlen is 0000 0000 and the byte before its key is 01.

Test V6 — round-trip 50 entries

Mix puts and deletes:

for i in 0..50:
    if i % 5 == 0: t.delete(format!("key{i}").as_bytes())
    else:          t.put(format!("key{i}").as_bytes(), format!("val{i}").as_bytes())
encoded = t.encode()
roundtrip = MemTable::decode(&encoded).unwrap()
assert_eq!(roundtrip.len(), t.len())
for (k, e) in t.iter() {
    assert_eq!(roundtrip.get(k), Some(e))
}
assert_eq!(roundtrip.encode(), encoded)

The last line is the idempotence check — decoding and re-encoding produces the same bytes. If it doesn't, we have non-determinism in iteration order, which will also break cross-language interop.