Abbaye

at ccc40f5

use std::{
    fs::File,
    path::{Path, PathBuf},
};

use flate2::{Compression, write::GzEncoder};
use globset::{Glob, GlobSet, GlobSetBuilder};
use ignore::WalkBuilder;
use miette::{IntoDiagnostic, Result};
use serde::Deserialize;

use crate::builders::{ArtifactPath, Builder};

/// Returns the ignore patterns used when a configuration does not supply its
/// own: `.git` and `*.local`.
fn default_ignore_patterns() -> Vec<String> {
    const DEFAULTS: [&str; 2] = [".git", "*.local"];
    DEFAULTS
        .iter()
        .map(|&pattern| String::from(pattern))
        .collect()
}

/// Configuration for [`ArchiveBuilder`].
///
/// Every field is optional when deserializing; the defaults described on each
/// field are applied by [`ArchiveBuilder`] at build time.
#[derive(Debug, Clone, Deserialize)]
pub struct ArchiveBuilderConfig {
    /// Root directory to archive. Defaults to the current working directory.
    /// The path is canonicalized before use, so it must exist.
    pub source_dir: Option<PathBuf>,

    /// Output path for the generated `.tar.gz` archive.
    /// Defaults to `../source.tar.gz`, i.e. `source.tar.gz` in the parent of
    /// the current working directory.
    pub output: Option<PathBuf>,

    /// Prefix applied to every entry path inside the archive.
    /// For example, `"myproject-1.0.0"` produces entries like
    /// `myproject-1.0.0/src/main.rs`.
    /// Defaults to the source directory's name, or `"source"` when the
    /// canonicalized source directory has no final name component.
    pub prefix: Option<String>,

    /// Glob patterns for files and directories to exclude from the archive.
    /// Each pattern is matched against every component of a path, so a pattern
    /// like `".git"` excludes the `.git` directory and all its contents, and
    /// `"*.local"` excludes any entry whose name ends with `.local`.
    /// `.git` is excluded even when it is not listed here.
    /// Defaults to `[".git", "*.local"]`.
    #[serde(default = "default_ignore_patterns")]
    pub ignore_patterns: Vec<String>,
}

impl Default for ArchiveBuilderConfig {
    /// Builds a configuration with no explicit source directory, output path or
    /// prefix, and with the same ignore patterns serde falls back to when the
    /// field is absent.
    fn default() -> Self {
        let ignore_patterns = default_ignore_patterns();
        Self {
            ignore_patterns,
            prefix: None,
            output: None,
            source_dir: None,
        }
    }
}

/// Creates a `.tar.gz` archive of the source tree, honouring all `.gitignore`
/// rules found in the directory hierarchy.
///
/// Dotfiles are included in the archive, while `.git`, the archive file
/// itself, and anything matching the configured ignore patterns are left out.
/// See [`ArchiveBuilderConfig`] for the available options and their defaults.
pub struct ArchiveBuilder;

impl Builder for ArchiveBuilder {
    type ConfigType = ArchiveBuilderConfig;

    async fn build(&self, config: Self::ConfigType) -> Result<Vec<ArtifactPath>> {
        let source_dir = config
            .source_dir
            .unwrap_or_else(|| PathBuf::from("."))
            .canonicalize()
            .into_diagnostic()?;

        let output = config
            .output
            .unwrap_or_else(|| PathBuf::from("../source.tar.gz"));

        let prefix = config.prefix.unwrap_or_else(|| {
            source_dir
                .file_name()
                .map(|n| n.to_string_lossy().into_owned())
                .unwrap_or_else(|| "source".to_owned())
        });

        let ignore_set = build_ignore_set(&config.ignore_patterns)?;

        let archive_path = tokio::task::spawn_blocking(move || {
            create_archive(&source_dir, &output, &prefix, &ignore_set)
        })
        .await
        .into_diagnostic()??;

        let name = archive_path
            .file_name()
            .map(|n| n.to_string_lossy().into_owned())
            .unwrap_or_default();

        let hash = Some(super::hash_file(&archive_path).await?);

        Ok(vec![ArtifactPath {
            path: archive_path,
            name,
            hash,
        }])
    }
}

/// Compiles the given glob patterns into a single [`GlobSet`].
///
/// # Errors
///
/// Returns the first pattern-parsing error encountered, or the error raised
/// while building the final set.
fn build_ignore_set(patterns: &[String]) -> Result<GlobSet> {
    let mut set_builder = GlobSetBuilder::new();
    for parsed in patterns.iter().map(|raw| Glob::new(raw)) {
        let glob = parsed.into_diagnostic()?;
        set_builder.add(glob);
    }
    let compiled = set_builder.build();
    compiled.into_diagnostic()
}

/// Walks `source_dir` respecting `.gitignore` rules and writes a `.tar.gz`
/// archive to `output`, prefixing every entry with `prefix`.
///
/// Only regular files are archived. `.git` and the output file itself are
/// always excluded. Additionally, entries whose path contains a component
/// matched by `ignore_set` are skipped. Excluded directories are pruned from
/// the walk, so their contents are never read.
///
/// `source_dir` is expected to be canonical: the output file is recognised by
/// comparing walked paths against the canonicalized `output`.
///
/// Returns `output` as given (not canonicalized) on success.
///
/// # Errors
///
/// Fails when `output` cannot be created or canonicalized, when the walk
/// reports an error, when a file cannot be appended to the archive, or when
/// the tar or gzip stream cannot be finalised.
fn create_archive(
    source_dir: &Path,
    output: &Path,
    prefix: &str,
    ignore_set: &GlobSet,
) -> Result<PathBuf> {
    let file = File::create(output).into_diagnostic()?;
    // Canonicalize now that the file exists so we can reliably detect it during
    // the walk and avoid embedding the archive inside itself.
    let output_canonical = output.canonicalize().into_diagnostic()?;
    let encoder = GzEncoder::new(file, Compression::default());
    let mut archive = tar::Builder::new(encoder);

    // The filter closure must be `'static`, so it owns its own copy of the set.
    let pruned = ignore_set.clone();
    let mut walker = WalkBuilder::new(source_dir);
    walker
        .hidden(false) // include dotfiles such as .rustfmt.toml
        .filter_entry(move |entry| {
            // Never filter the root: only components *below* `source_dir` are
            // subject to exclusion.
            if entry.depth() == 0 {
                return true;
            }
            // Ancestors have already passed this filter, so checking the
            // entry's own name is equivalent to checking every component of
            // its relative path. Rejecting a directory here stops the walker
            // from descending into it (e.g. the whole `.git` object store).
            let name = entry.file_name();
            // `.git` is always excluded, regardless of ignore_patterns.
            name != ".git" && !pruned.is_match(name)
        });

    for result in walker.build() {
        let entry = result.into_diagnostic()?;
        let path = entry.path();

        // Always exclude the output archive itself to prevent a tarbomb.
        if path == output_canonical {
            continue;
        }

        // Directories are represented implicitly by the paths of their files.
        if !path.is_file() {
            continue;
        }

        let relative = path.strip_prefix(source_dir).into_diagnostic()?;
        let entry_path = Path::new(prefix).join(relative);

        archive
            .append_path_with_name(path, &entry_path)
            .into_diagnostic()?;
    }

    // Finalise the tar stream, then flush and close the gzip layer.
    archive
        .into_inner()
        .into_diagnostic()?
        .finish()
        .into_diagnostic()?;

    Ok(output.to_path_buf())
}