From db7c54f92f386a94db8af7a12626d2657b4dd640 Mon Sep 17 00:00:00 2001
From: edef
Date: Sat, 14 Aug 2021 21:28:14 +0000
Subject: ripple/fossil: a basic content-addressable store

Fossil stores content-addressed blobs of file contents and
Protobuf-encoded directory listings, backed by Sled.

Change-Id: I8b49de6342218ca00755cec980b1d0cfb18878a7
---
 ripple/fossil/.gitignore         |   4 +
 ripple/fossil/Cargo.toml         |  17 ++++
 ripple/fossil/build.rs           |   9 ++
 ripple/fossil/src/bin/add.rs     |  31 ++++++
 ripple/fossil/src/bin/extract.rs |  57 +++++++++++
 ripple/fossil/src/lib.rs         | 206 +++++++++++++++++++++++++++++++++++++++
 ripple/fossil/src/store.proto    |  28 ++++++
 7 files changed, 352 insertions(+)
 create mode 100644 ripple/fossil/.gitignore
 create mode 100644 ripple/fossil/Cargo.toml
 create mode 100644 ripple/fossil/build.rs
 create mode 100644 ripple/fossil/src/bin/add.rs
 create mode 100644 ripple/fossil/src/bin/extract.rs
 create mode 100644 ripple/fossil/src/lib.rs
 create mode 100644 ripple/fossil/src/store.proto

(limited to 'ripple/fossil')

diff --git a/ripple/fossil/.gitignore b/ripple/fossil/.gitignore
new file mode 100644
index 0000000..be75022
--- /dev/null
+++ b/ripple/fossil/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: edef
+# SPDX-License-Identifier: OSL-3.0
+
+/target
diff --git a/ripple/fossil/Cargo.toml b/ripple/fossil/Cargo.toml
new file mode 100644
index 0000000..a88a5f8
--- /dev/null
+++ b/ripple/fossil/Cargo.toml
@@ -0,0 +1,17 @@
+# SPDX-FileCopyrightText: edef
+# SPDX-License-Identifier: OSL-3.0
+
+[package]
+name = "fossil"
+version = "0.1.0"
+edition = "2018"
+
+[dependencies]
+prost = "0.8.0"
+bytes = "1.0.1"
+blake3 = { version = "0.3.8", features = ["rayon"] }
+sled = "0.34.6"
+byteorder = "1.4.3"
+
+[build-dependencies]
+prost-build = "0.8.0"
diff --git a/ripple/fossil/build.rs b/ripple/fossil/build.rs
new file mode 100644
index 0000000..412c2d2
--- /dev/null
+++ b/ripple/fossil/build.rs
@@ -0,0 +1,9 @@
+// SPDX-FileCopyrightText: edef
+// SPDX-License-Identifier: OSL-3.0
+
+use std::io::Result;
+
+fn main() -> Result<()> {
+    prost_build::compile_protos(&["src/store.proto"], &["src/"])?;
+    Ok(())
+}
diff --git a/ripple/fossil/src/bin/add.rs b/ripple/fossil/src/bin/add.rs
new file mode 100644
index 0000000..114f893
--- /dev/null
+++ b/ripple/fossil/src/bin/add.rs
@@ -0,0 +1,31 @@
+// SPDX-FileCopyrightText: edef
+// SPDX-License-Identifier: OSL-3.0
+
+use {
+    fossil::Directory,
+    prost::Message,
+    std::{
+        env,
+        io::{self, Write},
+        path::Path,
+    },
+};
+
+fn main() {
+    let store = fossil::Store::open("fossil.db").unwrap();
+    let mut root = Directory::new();
+
+    for name in env::args().skip(1) {
+        let path = Path::new(&name);
+        let name = path
+            .file_name()
+            .and_then(|s| s.to_str())
+            .expect("invalid path")
+            .to_owned();
+
+        root.children.insert(name, store.add_path(path));
+    }
+
+    let mut stdout = io::stdout();
+    stdout.write_all(&root.into_pb().encode_to_vec()).unwrap();
+}
diff --git a/ripple/fossil/src/bin/extract.rs b/ripple/fossil/src/bin/extract.rs
new file mode 100644
index 0000000..f83ce0e
--- /dev/null
+++ b/ripple/fossil/src/bin/extract.rs
@@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: edef
+// SPDX-License-Identifier: OSL-3.0
+
+use {
+    fossil::{store, Directory},
+    prost::Message,
+    std::{
+        fs,
+        io::{self, Read, Write},
+        os::unix::{fs::symlink, prelude::OpenOptionsExt},
+        path::Path,
+    },
+};
+
+fn main() {
+    let store = fossil::Store::open("fossil.db").unwrap();
+    let root = {
+        let mut stdin = io::stdin();
+
+        let mut bytes = Vec::new();
+        stdin.read_to_end(&mut bytes).unwrap();
+
+        let pb = store::Directory::decode(&*bytes).unwrap();
+        Directory::from_pb(pb)
+    };
+
+    let root_path = Path::new(".");
+    extract(&store, root_path, &root);
+}
+
+fn extract(store: &fossil::Store, path: &Path, dir: &Directory) {
+    for (name, node) in &dir.children {
+        let path = path.join(name);
+        match node.clone() {
+            fossil::Node::Directory { r#ref } => {
+                let blob = store.read_blob(r#ref);
+                let pb = store::Directory::decode(&*blob).unwrap();
+                fs::create_dir(&path).unwrap();
+                extract(store, &path, &Directory::from_pb(pb));
+            }
+            fossil::Node::File { r#ref, executable } => {
+                let mode = if executable { 0o755 } else { 0o644 };
+                let mut f = fs::OpenOptions::new()
+                    .write(true)
+                    .create_new(true)
+                    .mode(mode)
+                    .open(path)
+                    .unwrap();
+                let blob = store.read_blob(r#ref);
+                f.write_all(&blob).unwrap();
+            }
+            fossil::Node::Link { target } => {
+                symlink(target, path).unwrap();
+            }
+        }
+    }
+}
diff --git a/ripple/fossil/src/lib.rs b/ripple/fossil/src/lib.rs
new file mode 100644
index 0000000..6fb5269
--- /dev/null
+++ b/ripple/fossil/src/lib.rs
@@ -0,0 +1,206 @@
+// SPDX-FileCopyrightText: edef
+// SPDX-License-Identifier: OSL-3.0
+
+use {
+    byteorder::{BigEndian, ByteOrder},
+    prost::Message,
+    std::{collections::BTreeMap, fs, io, os::unix::fs::PermissionsExt, path::Path},
+};
+
+pub mod store {
+    include!(concat!(env!("OUT_DIR"), "/fossil.store.rs"));
+}
+
+const DIGEST_BYTES: usize = blake3::OUT_LEN;
+const OFFSET_BYTES: usize = 4;
+
+pub struct Store {
+    db: sled::Db,
+}
+
+impl Store {
+    pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Store> {
+        let db = sled::open(path)?;
+        Ok(Store { db })
+    }
+
+    pub fn add_path<P: AsRef<Path>>(&self, path: P) -> Node {
+        let path = path.as_ref();
+        let meta = fs::symlink_metadata(path).unwrap();
+
+        match meta.file_type() {
+            ty if ty.is_dir() => {
+                let mut d = Directory::new();
+
+                for entry in path.read_dir().unwrap() {
+                    let entry = entry.unwrap();
+                    let name = entry.file_name().into_string().unwrap();
+                    d.children.insert(name, self.add_path(entry.path()));
+                }
+
+                let blob = d.into_pb().encode_to_vec();
+
+                Node::Directory {
+                    r#ref: self.write_blob(&blob),
+                }
+            }
+            ty if ty.is_file() => {
+                let executable = (meta.permissions().mode() & 0o100) != 0;
+
+                let blob = fs::read(path).unwrap();
+                Node::File {
+                    executable,
+                    r#ref: self.write_blob(&blob),
+                }
+            }
+            ty if ty.is_symlink() => {
+                let target = path
+                    .read_link()
+                    .unwrap()
+                    .to_str()
+                    .expect("symlink target is invalid UTF-8")
+                    .to_owned();
+
+                Node::Link { target }
+            }
+            _ => panic!("not a symlink or a regular file"),
+        }
+    }
+
+    fn write_blob(&self, data: &[u8]) -> Digest {
+        let digest = {
+            let mut h = blake3::Hasher::new();
+            h.update_with_join::<blake3::join::RayonJoin>(&data);
+            *h.finalize().as_bytes()
+        };
+
+        // TODO(edef): maybe don't use the default tree?
+        // we should probably have a "blob" tree,
+        // and reserve the default tree for DB metadata
+
+        self.db
+            .transaction::<_, _, sled::Error>(|db| {
+                for (n, chunk) in data.chunks(4096).enumerate() {
+                    let mut key = [0u8; DIGEST_BYTES + OFFSET_BYTES];
+                    key[..DIGEST_BYTES].copy_from_slice(&digest);
+                    BigEndian::write_u32(&mut key[DIGEST_BYTES..], n as u32);
+                    db.insert(&key[..], chunk)?;
+                }
+                Ok(())
+            })
+            .unwrap();
+
+        digest.into()
+    }
+
+    pub fn read_blob(&self, r#ref: Digest) -> Vec<u8> {
+        let mut buffer = Vec::new();
+        let mut h = blake3::Hasher::new();
+        for element in self.db.scan_prefix(r#ref.as_bytes()) {
+            let (_, chunk) = element.unwrap();
+            h.update(&chunk);
+            buffer.extend_from_slice(&chunk);
+        }
+
+        if buffer.len() == 0 {
+            panic!("blob not found");
+        }
+
+        if h.finalize() != r#ref {
+            panic!("hash mismatch");
+        }
+
+        buffer
+    }
+}
+
+pub type Digest = blake3::Hash;
+
+pub struct Directory {
+    pub children: BTreeMap<String, Node>,
+}
+
+#[derive(Clone)]
+pub enum Node {
+    Directory { r#ref: Digest },
+    File { r#ref: Digest, executable: bool },
+    Link { target: String },
+}
+
+impl Directory {
+    pub fn new() -> Directory {
+        Directory {
+            children: BTreeMap::new(),
+        }
+    }
+
+    pub fn into_pb(self) -> store::Directory {
+        let mut d = store::Directory::default();
+
+        for (name, node) in self.children.into_iter() {
+            match node {
+                Node::Directory { r#ref } => d.directories.push(store::DirectoryNode {
+                    name,
+                    r#ref: r#ref.as_bytes().to_vec(),
+                }),
+                Node::File { r#ref, executable } => d.files.push(store::FileNode {
+                    name,
+                    r#ref: r#ref.as_bytes().to_vec(),
+                    executable,
+                }),
+                Node::Link { target } => d.links.push(store::LinkNode { name, target }),
+            }
+        }
+
+        d
+    }
+
+    pub fn from_pb(pb: store::Directory) -> Directory {
+        let mut children = BTreeMap::new();
+
+        for child in pb.directories {
+            children.insert(
+                child.name,
+                Node::Directory {
+                    r#ref: digest_from_bytes(&child.r#ref),
+                },
+            );
+        }
+
+        for child in pb.files {
+            children.insert(
+                child.name,
+                Node::File {
+                    r#ref: digest_from_bytes(&child.r#ref),
+                    executable: child.executable,
+                },
+            );
+        }
+
+        for child in pb.links {
+            children.insert(
+                child.name,
+                Node::Link {
+                    target: child.target,
+                },
+            );
+        }
+
+        Directory { children }
+    }
+}
+
+#[track_caller]
+fn digest_from_bytes(bytes: &[u8]) -> Digest {
+    if bytes.len() != DIGEST_BYTES {
+        panic!(
+            "digest is {} bytes, expecting {} bytes",
+            bytes.len(),
+            DIGEST_BYTES
+        );
+    }
+
+    let mut buffer = [0; DIGEST_BYTES];
+    buffer.copy_from_slice(bytes);
+    buffer.into()
+}
diff --git a/ripple/fossil/src/store.proto b/ripple/fossil/src/store.proto
new file mode 100644
index 0000000..58832f0
--- /dev/null
+++ b/ripple/fossil/src/store.proto
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: edef
+// SPDX-License-Identifier: OSL-3.0
+
+syntax = "proto3";
+
+package fossil.store;
+
+message Directory {
+    repeated DirectoryNode directories = 1;
+    repeated FileNode files = 2;
+    repeated LinkNode links = 3;
+}
+
+message DirectoryNode {
+    string name = 1;
+    bytes ref = 2;
+}
+
+message FileNode {
+    string name = 1;
+    bytes ref = 2;
+    bool executable = 3;
+}
+
+message LinkNode {
+    string name = 1;
+    string target = 2;
+}
-- 
cgit 1.4.1