// SPDX-FileCopyrightText: edef
// SPDX-License-Identifier: OSL-3.0

use {
    anyhow::{Context, Result},
    byteorder::{BigEndian, ByteOrder},
    prost::Message,
    sled::{transaction::ConflictableTransactionError, Transactional},
    std::{
        cell::{Cell, RefCell},
        collections::BTreeMap,
        fs,
        io::{self, Read, Write},
        os::unix::{fs::PermissionsExt, prelude::FileExt},
        path::Path,
    },
};

pub mod store {
    include!(concat!(env!("OUT_DIR"), "/fossil.store.rs"));
}

const CHUNK_BYTES: usize = 0x400;
const DIGEST_BYTES: usize = blake3::OUT_LEN;

/// A content-addressed blob store: blob data lives in an append-only
/// `chunks` file, and sled maps each blake3 digest to its slice of it.
pub struct Store {
    db: sled::Db,
    meta: sled::Tree,
    chunks: RefCell<fs::File>,
    chunks_tail: Cell<u64>,
}

impl Store {
    pub fn open(path: impl AsRef<Path>) -> Result<Store> {
        let path = path.as_ref();
        let db = sled::open(path)?;
        let meta = db.open_tree("meta")?;

        let chunks = fs::OpenOptions::new()
            .read(true)
            .append(true)
            .create(true)
            .open(path.join("chunks"))?;

        let chunks_tail = meta
            .get("chunks_tail")?
            .map(|v| BigEndian::read_u64(&v))
            .unwrap_or_default();

        // truncate to the last committed tail, discarding any data from
        // writes that never made it into the database
        chunks.set_len(chunks_tail)?;

        Ok(Store {
            db,
            meta,
            chunks: RefCell::new(chunks),
            chunks_tail: Cell::new(chunks_tail),
        })
    }

    pub fn add_path(&self, path: impl AsRef<Path>) -> Node {
        let path = path.as_ref();
        let meta = fs::symlink_metadata(path).unwrap();

        match meta.file_type() {
            ty if ty.is_dir() => {
                let mut d = Directory::new();
                let mut size: u32 = 0;

                for entry in path.read_dir().unwrap() {
                    let entry = entry.unwrap();
                    let name = entry.file_name().into_string().unwrap();
                    let child = self.add_path(entry.path());
                    size = size.checked_add(child.size()).expect("overflow");
                    d.children.insert(name, child);
                }

                let blob = d.into_pb().encode_to_vec();

                Node::Directory(DirectoryRef {
                    ident: self.write_blob(&blob),
                    size,
                })
            }
            ty if ty.is_file() => {
                let executable = (meta.permissions().mode() & 0o100) != 0;
                let blob = fs::read(path).unwrap();

                Node::File(FileRef {
                    executable,
                    ident: self.write_blob(&blob),
                    size: blob.len().try_into().expect("overflow"),
                })
            }
            ty if ty.is_symlink() => {
                let target = path
                    .read_link()
                    .unwrap()
                    .to_str()
                    .expect("symlink target is invalid UTF-8")
                    .to_owned();

                Node::Link { target }
            }
            _ => panic!("not a directory, regular file, or symlink"),
        }
    }

    fn write_blob(&self, data: &[u8]) -> Digest {
        let ident = {
            let mut h = blake3::Hasher::new();
            h.update_rayon(data);
            h.finalize()
        };

        if self.db.contains_key(&*ident.as_bytes()).unwrap() {
            // key already exists
            return ident;
        }

        let mut chunks_file = self.chunks.borrow_mut();
        let offset = self.chunks_tail.get();
        chunks_file.write_all(data).unwrap();
        let chunks_tail = offset + data.len() as u64;

        // TODO(edef): maybe don't use the default tree?
        // we should probably have a "blob" tree,
        // and reserve the default tree for DB metadata
        let slice = Slice {
            offset,
            length: data.len() as u64,
        };

        let slice_buf = {
            let mut buf = [0u8; 16];
            BigEndian::write_u64_into(&[slice.offset, slice.length], &mut buf);
            buf
        };

        let chunks_tail_buf = {
            let mut buf = [0u8; 8];
            BigEndian::write_u64(&mut buf, chunks_tail);
            buf
        };

        // TODO(edef): figure out fsync for durability
        (&*self.db, &self.meta)
            .transaction(|(db, meta)| {
                db.insert(&*ident.as_bytes(), &slice_buf)?;
                meta.insert("chunks_tail", &chunks_tail_buf)?;
                Ok::<_, ConflictableTransactionError>(())
            })
            .unwrap();

        self.chunks_tail.set(chunks_tail);
        ident.into()
    }

    pub fn read_blob(&self, ident: Digest) -> Vec<u8> {
        let mut buffer = Vec::new();
        self.raw_blob(ident).read_to_end(&mut buffer).unwrap();

        // recompute the blob's identity, and verify it before returning
        let mut outboard = Vec::new();
        let computed_ident: blake3::Hash = {
            let mut encoder = bao::encode::Encoder::new_outboard(io::Cursor::new(&mut outboard));
            encoder.write_all(&buffer).unwrap();
            encoder.finalize().unwrap()
        };

        if computed_ident != ident {
            panic!("hash mismatch");
        }

        buffer
    }

    fn raw_blob(&self, ident: Digest) -> RawBlob<'_> {
        let slice_buf = self
            .db
            .get(&*ident.as_bytes())
            .unwrap()
            .expect("blob not found");

        let slice = Slice {
            offset: BigEndian::read_u64(&slice_buf[0..]),
            length: BigEndian::read_u64(&slice_buf[8..]),
        };

        RawBlob {
            store: self,
            slice,
            position: 0,
        }
    }
}

/// a slice in the chunks file
#[derive(Debug)]
struct Slice {
    offset: u64,
    length: u64,
}

struct RawBlob<'a> {
    store: &'a Store,
    slice: Slice,
    position: u64,
}

impl io::Read for RawBlob<'_> {
    fn read(&mut self, dst: &mut [u8]) -> io::Result<usize> {
        let prev_pos = self.position;
        let next_pos = Ord::min(
            self.position.saturating_add(dst.len() as u64),
            self.slice.length,
        );

        let len = (next_pos - prev_pos) as usize;
        let dst = &mut dst[..len];
        let offset = self.slice.offset + prev_pos;

        self.store
            .chunks
            .borrow()
            .read_exact_at(dst, offset)
            .context("Couldn't read blob data")
            .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

        self.position = next_pos;
        Ok(len)
    }
}

// TODO(edef): use checked_add_signed when mixed_integer_ops stabilises
fn checked_add_signed(lhs: u64, rhs: i64) -> Option<u64> {
    if rhs >= 0 {
        lhs.checked_add(rhs as u64)
    } else {
        lhs.checked_sub(rhs.unsigned_abs())
    }
}

impl io::Seek for RawBlob<'_> {
    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
        let pos = match pos {
            io::SeekFrom::Start(n) => Some(n),
            io::SeekFrom::End(n) => checked_add_signed(self.slice.length, n),
            io::SeekFrom::Current(n) => checked_add_signed(self.position, n),
        };

        match pos {
            Some(n) if n <= self.slice.length => {
                self.position = n;
                Ok(self.position)
            }
            _ => Err(io::Error::new(
                io::ErrorKind::InvalidInput,
                "seek out of range",
            )),
        }
    }

    fn rewind(&mut self) -> io::Result<()> {
        self.position = 0;
        Ok(())
    }

    fn stream_position(&mut self) -> io::Result<u64> {
        Ok(self.position)
    }
}

pub type Digest = blake3::Hash;

pub struct Directory {
    pub children: BTreeMap<String, Node>,
}

#[derive(Clone)]
pub enum Node {
    Directory(DirectoryRef),
    File(FileRef),
    Link { target: String },
}

#[derive(Clone)]
pub struct DirectoryRef {
    pub ident: Digest,
    pub size: u32,
}

#[derive(Debug, Clone)]
pub struct FileRef {
    pub ident: Digest,
    pub executable: bool,
    pub size: u32,
}

impl Node {
    /// number of nodes in this subtree, counting this node itself
    fn size(&self) -> u32 {
        match self {
            &Node::Directory(DirectoryRef { size, .. }) => size.checked_add(1).expect("overflow"),
            _ => 1,
        }
    }
}

impl Directory {
    pub fn new() -> Directory {
        Directory {
            children: BTreeMap::new(),
        }
    }

    pub fn into_pb(self) -> store::Directory {
        let mut d = store::Directory::default();

        for (name, node) in self.children.into_iter() {
            match node {
                Node::Directory(DirectoryRef { ident, size }) => {
                    d.directories.push(store::DirectoryNode {
                        name,
                        size,
                        r#ref: ident.as_bytes().to_vec(),
                    })
                }
                Node::File(FileRef {
                    ident,
                    executable,
                    size,
                }) => d.files.push(store::FileNode {
                    name,
                    r#ref: ident.as_bytes().to_vec(),
                    executable,
                    size,
                }),
                Node::Link { target } => d.links.push(store::LinkNode { name, target }),
            }
        }

        d
    }

    pub fn from_pb(pb: store::Directory) -> Directory {
        let mut children = BTreeMap::new();

        for child in pb.directories {
            children.insert(
                child.name,
                Node::Directory(DirectoryRef {
                    ident: digest_from_bytes(&child.r#ref),
                    size: child.size,
                }),
            );
        }

        for child in pb.files {
            children.insert(
                child.name,
                Node::File(FileRef {
                    ident: digest_from_bytes(&child.r#ref),
                    executable: child.executable,
                    size: child.size,
                }),
            );
        }

        for child in pb.links {
            children.insert(
                child.name,
                Node::Link {
                    target: child.target,
                },
            );
        }

        Directory { children }
    }
}

#[track_caller]
pub fn digest_from_bytes(bytes: &[u8]) -> Digest {
    if bytes.len() != DIGEST_BYTES {
        panic!(
            "digest is {} bytes, expecting {} bytes",
            bytes.len(),
            DIGEST_BYTES
        );
    }

    let mut buffer = [0; DIGEST_BYTES];
    buffer.copy_from_slice(bytes);
    buffer.into()
}

#[test]
/// Write a blob, and read it back.
fn read_write() {
    let data = {
        let mut h = blake3::Hasher::new();
        h.update(b"test vector");

        let mut buf = Vec::new();
        h.finalize_xof().take(2468).read_to_end(&mut buf).unwrap();
        buf
    };

    // TODO(edef): use a temporary file
    let store = Store::open("fossil.db").unwrap();
    let ident = store.write_blob(&data);
    assert_eq!(data, store.read_blob(ident));
}
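
// A second round-trip test, added as an illustrative sketch: it walks a small
// on-disk tree through `add_path`, decodes the resulting root directory blob
// with `Directory::from_pb`, and seeks within the file blob it references.
// The paths "fossil-test.db" and "fossil-test-src" are arbitrary scratch
// locations, chosen distinct from `read_write`'s store so parallel test runs
// don't contend for the sled lock; like `read_write`, this should eventually
// use a proper temporary directory.
#[test]
fn add_path_roundtrip() {
    use std::io::{Seek, SeekFrom};

    let src = Path::new("fossil-test-src");
    fs::create_dir_all(src).unwrap();
    fs::write(src.join("hello.txt"), b"hello world").unwrap();

    let store = Store::open("fossil-test.db").unwrap();
    let root = match store.add_path(src) {
        Node::Directory(d) => d,
        _ => panic!("expected a directory node"),
    };

    // directory blobs are protobuf-encoded store::Directory messages
    let blob = store.read_blob(root.ident);
    let dir = Directory::from_pb(store::Directory::decode(&*blob).unwrap());

    let file = match &dir.children["hello.txt"] {
        Node::File(f) => f.clone(),
        _ => panic!("expected a file node"),
    };
    assert_eq!(file.size, 11);

    // RawBlob reads are bounded by the blob's slice of the chunks file,
    // so seeking to offset 6 and reading to the end yields just the tail
    let mut raw = store.raw_blob(file.ident);
    raw.seek(SeekFrom::Start(6)).unwrap();
    let mut tail = Vec::new();
    raw.read_to_end(&mut tail).unwrap();
    assert_eq!(tail, b"world");
}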