Skip to content
Snippets Groups Projects
archiver.rs 9.78 KiB
Newer Older
  • Learn to ignore specific revisions
  • Daniel Müller's avatar
    Daniel Müller committed
    use std::{
        fs::File,
        io::{BufReader, Seek, Write},
        path::Path,
    };
    
    use filetime::FileTime;
    use log::{debug, trace};
    
    use crate::{
        chunker::{Chunker, FixedSizeChunker},
        codec::{ArCodec, BLOCK_SIZE},
        datamodel::{AllZeroExt, ArEntry, BlockPool, CompressionType, EntryType, FileTable},
        dirwalk::{FsEntry, WalkDir},
        read_write_extension::WriteExtTypes,
    };
    
    /// Drives one backup run: walks filesystem entries, chunks and compresses
    /// file contents into a block pool, and records per-entry metadata.
    pub struct Archiver<T>
    where
        T: Write + Seek,
    {
        /// Destination pool receiving the compressed, length-prefixed blocks.
        blocks_out: BlockPool<T>,
        /// Holds the in-memory `FileTable`, `IndexTable` and the chunk scratch buffer.
        codec: ArCodec,
        /// `FileTable` of a previous backup, used to skip unchanged files.
        /// `None` until `load_reference_metadata` is called.
        reference_file_table: Option<FileTable>,
    }
    
    impl<T: Write + Seek> Archiver<T> {
        /// Create an `Archiver` writing into `blocks_out`, reconciling the UIDs of
        /// the loaded `FileTable` / `IndexTable` with the pool's UID.
        ///
        /// Two situations are repaired here:
        /// * fresh backup: both tables carry an all-zero UID and adopt the pool's
        ///   (random) UID;
        /// * new snapshot into an existing pool: the `IndexTable` already matches
        ///   the pool but the `FileTable` is still default, so it adopts the UID too.
        pub fn new(mut codec: ArCodec, blocks_out: BlockPool<T>) -> Self {
            // FileTable and IndexTable don't have a UID, so this is a fresh backup. Take the random
            // UID from the BlockPool
            if codec.file_table.uid.is_all_zero() && codec.index_table.uid.is_all_zero() {
                debug!("FileTable & IndexTable have no UID, using BlockPool UID");
                codec.file_table.uid = blocks_out.uid;
                codec.index_table.uid = blocks_out.uid;
            }

            // BlockPool and IndexTable have the same UID, but FileTable is default. This is a new
            // addition to an existing BackupPool
            if codec.index_table.uid == blocks_out.uid && codec.file_table.uid.is_all_zero() {
                codec.file_table.uid = blocks_out.uid;
            }

            Self {
                codec,
                blocks_out,
                reference_file_table: None,
            }
        }

        /// Check that all loaded files have the same UID
        ///
        /// Returns true if the UIDs are the same, false otherwise
        pub fn check_uids(&self) -> bool {
            self.codec.file_table.uid == self.codec.index_table.uid
                && self.codec.file_table.uid == self.blocks_out.uid
        }

        /// Borrow the codec holding the in-memory `FileTable` and `IndexTable`.
        pub fn codec(&self) -> &ArCodec {
            &self.codec
        }

        /// Load the zstd-compressed `FileTable` of a previous backup from `ft_path`.
        ///
        /// `archive_file` later consults this table so files with unchanged
        /// mtime + size can be copied from the reference instead of re-read.
        ///
        /// # Errors
        /// Propagates I/O errors from opening, decompressing or deserializing
        /// the file.
        pub fn load_reference_metadata(
            &mut self,
            ft_path: impl AsRef<Path>,
        ) -> Result<(), std::io::Error> {
            let ft_file = File::open(ft_path.as_ref())?;
            let ft_file = BufReader::new(ft_file);

            let mut decoder = zstd::Decoder::new(ft_file)?;

            self.reference_file_table = Some(FileTable::try_deserialize_from(&mut decoder)?);
            Ok(())
        }

        /// Archive `path` recursively. A non-directory `path` is archived as a
        /// single file; otherwise every walked entry is dispatched to the matching
        /// `archive_*` method. Per-entry walk errors are reported and skipped so
        /// one unreadable entry does not abort the whole run.
        ///
        /// # Errors
        /// Returns the first error from archiving an individual entry (walk
        /// errors themselves are only printed, not returned).
        pub fn archive_dir_recursive(&mut self, path: impl AsRef<Path>) -> Result<(), std::io::Error> {
            if !path.as_ref().is_dir() {
                return self.archive_file(&FsEntry::new(path.as_ref().to_path_buf())?);
            }

            let entries_to_archive = WalkDir::new(path).run_iter();

            // Snapshot counters so the debug line below can report what this call added.
            let num_blocks_before = self.codec.index_table.blocks.len();
            let num_pool_bytes_before = self.blocks_out.next_block_offset;

            for entry in entries_to_archive {
                match entry {
                    // Symlink test must come first: a symlink to a directory would
                    // otherwise also satisfy the is_dir() arm.
                    Ok(entry) if entry.metadata.is_symlink() => self.archive_symlink(&entry)?,
                    Ok(entry) if entry.metadata.is_dir() => self.archive_empty_dir(&entry)?,
                    Ok(entry) => self.archive_file(&entry)?,
                    Err(err) => {
                        // NOTE(review): deliberately stderr rather than the `log`
                        // facade, presumably so skipped entries are visible even
                        // without a logger configured — confirm before changing.
                        eprintln!("Archive error for '{}': {}", err.path.display(), err.error);
                        continue;
                    }
                }
            }

            // Blocks are only ever added during archiving, so these never underflow.
            let num_new_blocks = self.codec.index_table.blocks.len() - num_blocks_before;
            let num_pool_bytes_new = self.blocks_out.next_block_offset - num_pool_bytes_before;

            debug!(
                "Recursively archived dir. Added {} blocks / {} bytes to BlockPool",
                num_new_blocks, num_pool_bytes_new
            );

            Ok(())
        }

        /// Record a directory entry (metadata only, no content) in the `FileTable`.
        ///
        /// # Panics
        /// Panics if `entry` is not a directory.
        pub fn archive_empty_dir(&mut self, entry: &FsEntry) -> Result<(), std::io::Error> {
            let path = &entry.path;

            let mut ar_entry = ArEntry::default();

            let md = &entry.metadata;
            if !md.is_dir() {
                panic!(
                    "Called archive_empty_dir on a non-dir path: {}",
                    path.display()
                );
            }

            let mod_time = FileTime::from_last_modification_time(md);

            ar_entry
                .metadata
                .entry_and_os_type
                .set_entry_type(EntryType::Directory);

            ar_entry.metadata.modified_unix_seconds = mod_time.unix_seconds();
            ar_entry.metadata.modified_nanos = mod_time.nanoseconds();
            ar_entry.path = path.to_string_lossy().to_string();

            self.codec.file_table.entries.push(ar_entry);

            Ok(())
        }

        /// Record a symlink entry (target string only, no content) in the `FileTable`.
        ///
        /// The link is classified by what it resolves to (file vs. directory);
        /// a dangling link is stored as a file-symlink.
        ///
        /// # Panics
        /// Panics if `entry` is not a symlink, or if the link target exists but is
        /// neither a regular file nor a directory (FIFO, socket, device, ...).
        pub fn archive_symlink(&mut self, entry: &FsEntry) -> Result<(), std::io::Error> {
            let path = &entry.path;
            let mut ar_entry = ArEntry::default();

            let md = &entry.metadata;
            if !md.is_symlink() {
                panic!(
                    "Called archive_symlink on a non-symlink path: {}",
                    path.display()
                );
            }

            // `fs::metadata` follows the link, so the returned metadata can never
            // itself report a symlink; classifying on is_file()/is_dir() alone is
            // equivalent to the previous (is_file, is_dir, is_symlink) match.
            let symlink_type = match std::fs::metadata(path) {
                // Dangling link: archive it, pointing nowhere, as a file-symlink
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => EntryType::SymbolicLinkFile,
                Err(e) => return Err(e),
                Ok(md_target) if md_target.is_file() => EntryType::SymbolicLinkFile,
                Ok(md_target) if md_target.is_dir() => EntryType::SymbolicLinkDir,
                // Target exists but is a special file (FIFO, socket, device, ...)
                Ok(_) => panic!("Invalid symlink target type"),
            };

            let mod_time = FileTime::from_last_modification_time(md);

            ar_entry
                .metadata
                .entry_and_os_type
                .set_entry_type(symlink_type);

            ar_entry.metadata.modified_unix_seconds = mod_time.unix_seconds();
            ar_entry.metadata.modified_nanos = mod_time.nanoseconds();
            ar_entry.path = path.to_string_lossy().to_string();

            let symlink_target = std::fs::read_link(path)?.to_string_lossy().to_string();
            ar_entry.symlink_target = Some(symlink_target);

            self.codec.file_table.entries.push(ar_entry);

            Ok(())
        }

        /// Archive a regular file: chunk its contents, deduplicate chunks against
        /// the `IndexTable` by BLAKE3 hash, zstd-compress and append new chunks to
        /// the pool, then push a `FileTable` entry holding the chunk offsets and a
        /// BLAKE3 hash of the whole file.
        ///
        /// If reference metadata is loaded (`load_reference_metadata`) and the
        /// file's mtime and size are unchanged, the reference entry is copied
        /// verbatim and the file content is not read at all.
        ///
        /// # Panics
        /// Panics if `entry` is not a regular file.
        pub fn archive_file(&mut self, entry: &FsEntry) -> Result<(), std::io::Error> {
            let path = &entry.path;
            let s_path = path.to_string_lossy().to_string();

            if let Some(ref_ft) = &self.reference_file_table {
                // NOTE(review): linear scan per file -> O(files * ref_entries);
                // a map keyed by path would speed up large incremental backups.
                if let Some(ref_entry) = ref_ft
                    .entries
                    .iter()
                    .find(|it| it.path == entry.path.to_string_lossy())
                {
                    let mod_time = FileTime::from_last_modification_time(&entry.metadata);
                    if ref_entry.metadata.modified_unix_seconds == mod_time.unix_seconds()
                        && ref_entry.metadata.modified_nanos == mod_time.nanoseconds()
                        && ref_entry.metadata.file_size == entry.metadata.len()
                    {
                        debug!(
                            "Detected same mtime + len, copying from ref-entry: {}",
                            s_path
                        );
                        self.codec.file_table.entries.push(ref_entry.clone());
                        return Ok(());
                    }
                }
            }

            trace!("Archiving file: {s_path}");

            let mut ar_entry = ArEntry::default();

            let md = &entry.metadata;
            if !md.is_file() {
                // Path included for parity with archive_empty_dir / archive_symlink
                panic!("Called archive_file on a non-file path: {}", path.display());
            }

            let mod_time = FileTime::from_last_modification_time(md);

            ar_entry
                .metadata
                .entry_and_os_type
                .set_entry_type(EntryType::File);
            ar_entry.metadata.modified_unix_seconds = mod_time.unix_seconds();
            ar_entry.metadata.modified_nanos = mod_time.nanoseconds();
            ar_entry.metadata.file_size = md.len();
            ar_entry.path = s_path;

            let file = File::open(path)?;
            let file = BufReader::new(file);

            let mut file_hash = [0_u8; 32];
            let mut file_hasher = blake3::Hasher::new();

            let mut chunker = FixedSizeChunker::new(file, BLOCK_SIZE);

            // Compression scratch buffer: allocated once and reused for every
            // chunk instead of a fresh Vec per iteration.
            let mut encode_buf: Vec<u8> = Vec::with_capacity(self.codec.buffer.len());

            loop {
                let bytes_read = match chunker.get_chunk(&mut self.codec.buffer)? {
                    (0, _) => break,
                    (n, _) => n as usize,
                };

                let block = &self.codec.buffer[..bytes_read];

                let hash = blake3::hash(block);
                let hash = hash.as_bytes();

                // Update the full file hash
                file_hasher.update(block);

                // Dedup: chunk already stored in the pool, just reference its offset
                if let Some(block_offset) = self.codec.index_table.blocks.get(hash) {
                    ar_entry.blocks.push(*block_offset);
                    continue;
                }

                // All framing metadata is disabled: length is stored in our own
                // 32-bit prefix, keeping the per-block overhead minimal.
                encode_buf.clear();
                let mut encoder = zstd::Encoder::new(&mut encode_buf, 2).unwrap();
                encoder.include_checksum(false)?;
                encoder.include_dictid(false)?;
                encoder.include_contentsize(false)?;
                encoder.write_all(block)?;
                encoder.finish()?;

                // Offset where this block starts (at its 32-bit length prefix);
                // next_block_offset is only advanced after the block is written.
                let block_start = self.blocks_out.next_block_offset;

                self.blocks_out.write_all_u32(encode_buf.len() as u32)?;
                self.blocks_out.write_all_u8(CompressionType::Zstd as u8)?;
                self.blocks_out.write_all(&encode_buf)?;

                ar_entry.blocks.push(block_start);

                self.codec.index_table.blocks.insert(*hash, block_start);

                // + 4 to skip the 32-bit blocklength, + 1 to skip the compression type
                self.blocks_out.next_block_offset += encode_buf.len() as u64 + 4 + 1;
            }

            file_hash.copy_from_slice(file_hasher.finalize().as_bytes());
            ar_entry.metadata.file_hash = file_hash;

            self.codec.file_table.entries.push(ar_entry);

            Ok(())
        }
    }