// SPDX-FileCopyrightText: edef
// SPDX-License-Identifier: OSL-3.0

//! A content-addressed store: blobs are split into fixed-size chunks and
//! stored in a sled database under their BLAKE3 digest, and directories are
//! protobuf messages stored as blobs themselves.

use {
    byteorder::{BigEndian, ByteOrder},
    prost::Message,
    std::{
        collections::BTreeMap,
        fs,
        io::{self, BufRead, Read},
        os::unix::fs::PermissionsExt,
        path::Path,
    },
};

/// Generated protobuf types for the on-disk directory format.
pub mod store {
    include!(concat!(env!("OUT_DIR"), "/fossil.store.rs"));
}

/// Blobs are stored in chunks of BLAKE3's native chunk size (1 KiB).
const CHUNK_BYTES: usize = blake3::CHUNK_LEN;
/// Length of a BLAKE3 digest (32 bytes).
const DIGEST_BYTES: usize = blake3::OUT_LEN;
/// Length of the big-endian chunk index appended to the digest in chunk keys.
const OFFSET_BYTES: usize = 4;

/// A content-addressed store backed by a sled database.
pub struct Store {
    db: sled::Db,
}

impl Store {
    /// Opens (or creates) the store at `path`.
    pub fn open(path: impl AsRef<Path>) -> io::Result<Store> {
        let db = sled::open(path)?;
        Ok(Store { db })
    }

    /// Recursively imports `path` into the store, returning a [`Node`]
    /// referencing the written blobs.
    ///
    /// Panics on I/O errors, on file names or symlink targets that aren't
    /// valid UTF-8, and on file types other than directories, regular files,
    /// and symlinks.
    pub fn add_path(&self, path: impl AsRef<Path>) -> Node {
        let path = path.as_ref();
        let meta = fs::symlink_metadata(path).unwrap();

        match meta.file_type() {
            ty if ty.is_dir() => {
                let mut d = Directory::new();
                let mut size: u32 = 0;

                for entry in path.read_dir().unwrap() {
                    let entry = entry.unwrap();
                    let name = entry.file_name().into_string().unwrap();
                    let child = self.add_path(entry.path());
                    size = size.checked_add(child.size()).expect("overflow");
                    d.children.insert(name, child);
                }

                // A directory is stored as its protobuf encoding.
                let blob = d.into_pb().encode_to_vec();

                Node::Directory(DirectoryRef {
                    ident: self.write_blob(&blob),
                    size,
                })
            }
            ty if ty.is_file() => {
                // Consider the file executable if the owner-execute bit is set.
                let executable = (meta.permissions().mode() & 0o100) != 0;
                let blob = fs::read(path).unwrap();

                Node::File(FileRef {
                    executable,
                    ident: self.write_blob(&blob),
                    size: blob.len().try_into().expect("overflow"),
                })
            }
            ty if ty.is_symlink() => {
                let target = path
                    .read_link()
                    .unwrap()
                    .to_str()
                    .expect("symlink target is invalid UTF-8")
                    .to_owned();

                Node::Link { target }
            }
            _ => panic!("not a symlink or a regular file"),
        }
    }

    /// Writes `data` as a blob, returning its BLAKE3 digest.
    ///
    /// The blob is split into [`CHUNK_BYTES`]-sized chunks, each stored under
    /// its `(digest, chunk index)` key in a single transaction.
    fn write_blob(&self, data: &[u8]) -> Digest {
        let ident = {
            let mut h = blake3::Hasher::new();
            h.update_with_join::<blake3::join::RayonJoin>(data);
            h.finalize()
        };

        // TODO(edef): maybe don't use the default tree?
        // we should probably have a "blob" tree,
        // and reserve the default tree for DB metadata
        self.db
            .transaction::<_, _, sled::Error>(|db| {
                for (n, chunk) in data.chunks(CHUNK_BYTES).enumerate() {
                    db.insert(chunk_key(&ident, n as u32).as_slice(), chunk)?;
                }
                Ok(())
            })
            .unwrap();

        ident.into()
    }

    /// Reads back the blob identified by `ident`, verifying its digest.
    ///
    /// Panics if the blob is absent or fails verification.
    pub fn read_blob(&self, ident: Digest) -> Vec<u8> {
        let mut buffer = Vec::new();
        self.raw_blob(ident).read_to_end(&mut buffer).unwrap();

        let computed_ident = {
            let mut h = blake3::Hasher::new();
            h.update_with_join::<blake3::join::RayonJoin>(&buffer);
            h.finalize()
        };

        if computed_ident != ident {
            // A missing blob reads back as zero chunks, which is only valid
            // if `ident` is the digest of the empty blob.
            if buffer.is_empty() {
                panic!("blob not found");
            }
            panic!("hash mismatch");
        }

        buffer
    }

    /// Returns a reader over the raw chunks of the blob identified by
    /// `ident`, without digest verification.
    fn raw_blob(&self, ident: Digest) -> RawBlob<'_> {
        RawBlob {
            store: self,
            ident,
            buf: None,
            off: 0,
        }
    }
}

/// Builds the sled key for chunk number `chunk` of the blob `ident`:
/// the 32-byte digest followed by the chunk index as a big-endian u32.
fn chunk_key(ident: &Digest, chunk: u32) -> [u8; DIGEST_BYTES + OFFSET_BYTES] {
    let mut key = [0u8; DIGEST_BYTES + OFFSET_BYTES];
    key[..DIGEST_BYTES].copy_from_slice(ident.as_bytes());
    BigEndian::write_u32(&mut key[DIGEST_BYTES..], chunk);
    key
}

/// Returns the chunk number containing byte `offset` of a blob.
fn chunk_id(offset: u64) -> u32 {
    (offset / CHUNK_BYTES as u64).try_into().unwrap()
}

/// A reader over the stored chunks of a blob.
struct RawBlob<'a> {
    store: &'a Store,
    ident: Digest,
    /// current chunk
    buf: Option<sled::IVec>,
    /// reader offset
    /// LSBs are intra-chunk, MSBs are chunk number
    off: u64,
}

impl io::BufRead for RawBlob<'_> {
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        // Fetch the current chunk lazily; a missing chunk means end of blob.
        let buf = match self.buf {
            Some(ref buf) => buf,
            None => {
                let chunk = chunk_id(self.off);
                match self.store.db.get(chunk_key(&self.ident, chunk))? {
                    None => return Ok(&[]),
                    Some(contents) => self.buf.insert(contents),
                }
            }
        };

        let off = (self.off % CHUNK_BYTES as u64) as usize;
        Ok(buf.get(off..).unwrap_or_default())
    }

    fn consume(&mut self, amt: usize) {
        let prev_offset = self.off;
        let next_offset = self.off.saturating_add(amt as u64);
        // Drop the cached chunk when the offset crosses a chunk boundary.
        if chunk_id(next_offset) != chunk_id(prev_offset) {
            self.buf.take();
        }
        self.off = next_offset;
    }
}

impl io::Read for RawBlob<'_> {
    fn read(&mut self, dst: &mut [u8]) -> io::Result<usize> {
        let src = self.fill_buf()?;
        let len = Ord::min(src.len(), dst.len());
        dst[..len].copy_from_slice(&src[..len]);
        self.consume(len);
        Ok(len)
    }
}

pub type Digest = blake3::Hash;

/// An in-memory directory, mapping names to child nodes.
pub struct Directory {
    pub children: BTreeMap<String, Node>,
}

/// A reference to an entry in the store: a directory, file, or symlink.
#[derive(Clone)]
pub enum Node {
    Directory(DirectoryRef),
    File(FileRef),
    Link { target: String },
}

#[derive(Clone)]
pub struct DirectoryRef {
    pub ident: Digest,
    /// Total number of nodes beneath this directory.
    pub size: u32,
}

#[derive(Debug, Clone)]
pub struct FileRef {
    pub ident: Digest,
    pub executable: bool,
    /// File size in bytes.
    pub size: u32,
}

impl Node {
    /// The number of nodes in the subtree rooted at this node,
    /// including the node itself.
    fn size(&self) -> u32 {
        match self {
            &Node::Directory(DirectoryRef { size, .. }) => size.checked_add(1).expect("overflow"),
            _ => 1,
        }
    }
}

impl Directory {
    pub fn new() -> Directory {
        Directory {
            children: BTreeMap::new(),
        }
    }

    /// Converts this directory into its protobuf representation.
    pub fn into_pb(self) -> store::Directory {
        let mut d = store::Directory::default();

        for (name, node) in self.children.into_iter() {
            match node {
                Node::Directory(DirectoryRef { ident, size }) => {
                    d.directories.push(store::DirectoryNode {
                        name,
                        size,
                        r#ref: ident.as_bytes().to_vec(),
                    })
                }
                Node::File(FileRef {
                    ident,
                    executable,
                    size,
                }) => d.files.push(store::FileNode {
                    name,
                    r#ref: ident.as_bytes().to_vec(),
                    executable,
                    size,
                }),
                Node::Link { target } => d.links.push(store::LinkNode { name, target }),
            }
        }

        d
    }

    /// Decodes a directory from its protobuf representation.
    pub fn from_pb(pb: store::Directory) -> Directory {
        let mut children = BTreeMap::new();

        for child in pb.directories {
            children.insert(
                child.name,
                Node::Directory(DirectoryRef {
                    ident: digest_from_bytes(&child.r#ref),
                    size: child.size,
                }),
            );
        }

        for child in pb.files {
            children.insert(
                child.name,
                Node::File(FileRef {
                    ident: digest_from_bytes(&child.r#ref),
                    executable: child.executable,
                    size: child.size,
                }),
            );
        }

        for child in pb.links {
            children.insert(
                child.name,
                Node::Link {
                    target: child.target,
                },
            );
        }

        Directory { children }
    }
}

/// Converts a byte slice into a [`Digest`], panicking (at the caller's
/// location) if it isn't exactly [`DIGEST_BYTES`] long.
#[track_caller]
pub fn digest_from_bytes(bytes: &[u8]) -> Digest {
    if bytes.len() != DIGEST_BYTES {
        panic!(
            "digest is {} bytes, expecting {} bytes",
            bytes.len(),
            DIGEST_BYTES
        );
    }

    let mut buffer = [0; DIGEST_BYTES];
    buffer.copy_from_slice(bytes);
    buffer.into()
}

/// Write a blob, and read it back.
#[test]
fn read_write() {
    let data = {
        let mut h = blake3::Hasher::new();
        h.update(b"test vector");

        let mut buf = Vec::new();
        h.finalize_xof().take(2468).read_to_end(&mut buf).unwrap();
        buf
    };

    // TODO(edef): use a temporary file
    let store = Store::open("fossil.db").unwrap();
    let ident = store.write_blob(&data);
    assert_eq!(data, store.read_blob(ident));
}
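
// A round-trip check for the protobuf directory encoding: a minimal sketch,
// assuming the prost-generated `store::Directory` derives `Clone` and
// `PartialEq` (prost emits both by default). The names and digests below are
// arbitrary test data, not blobs present in any store.
#[test]
fn directory_pb_round_trip() {
    let mut d = Directory::new();
    d.children.insert(
        "hello.txt".to_owned(),
        Node::File(FileRef {
            ident: blake3::hash(b"hello"),
            executable: false,
            size: 5,
        }),
    );
    d.children.insert(
        "link".to_owned(),
        Node::Link {
            target: "hello.txt".to_owned(),
        },
    );

    // `Directory` itself has no `PartialEq`, so encode, decode, and re-encode:
    // `children` is a BTreeMap, so field order is deterministic and the two
    // protobuf messages must match exactly.
    let pb = d.into_pb();
    let round_tripped = Directory::from_pb(pb.clone()).into_pb();
    assert_eq!(pb, round_tripped);
}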