// Abbaye — markdown.rs
//
// at 8645f5a
use std::collections::HashMap;
use std::path::{Path, PathBuf};

use ignore::WalkBuilder;
use miette::{IntoDiagnostic, Result, miette};
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd, html};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use tera::{Context, Tera};

use crate::builders::{ArtifactPath, Builder, LogEvent, LogSender};

/// The default Tera template used to wrap rendered Markdown content in a
/// complete HTML5 document. Exposed as a `pub const` so that `abbaye
/// dump-theme` can write it to `.abbaye/theme/markdown.html.j2`.
///
/// The text is embedded at compile time from `../templates/markdown.html.j2`
/// (relative to this source file) and is used by `load_tera` whenever no
/// theme override file is present.
///
/// Template variables:
/// - `{{ title }}` — plain-text page title (auto-escaped by Tera).
/// - `{{ content | safe }}` — the rendered HTML body fragment.
pub const TEMPLATE_MARKDOWN: &str = include_str!("../templates/markdown.html.j2");

/// Filename looked up inside `.abbaye/theme/` at runtime.
///
/// `load_tera` resolves `.abbaye/theme/<THEME_FILENAME>` relative to the
/// process's current working directory.
const THEME_FILENAME: &str = "markdown.html.j2";
/// Name under which the template is registered inside the Tera instance.
///
/// Both the theme override and the built-in template are registered under
/// this same name, so `render_template` does not need to know which was used.
// NOTE(review): Tera's default auto-escaping is keyed on the template *name*
// suffix (`.html`, `.htm`, `.xml`), so dropping the `.j2` here is presumably
// what keeps `{{ title }}` escaped — confirm before renaming.
const TERA_NAME: &str = "markdown.html";

/// Default for [`MarkdownBuilderConfig::recursive`].
///
/// Referenced both by the `#[serde(default = "...")]` attribute on that field
/// and by the `Default` impl, so a missing key and a defaulted config agree:
/// subdirectories are walked unless the user opts out.
fn default_recursive() -> bool {
    true
}

/// Configuration for [`MarkdownBuilder`].
// NOTE(review): this type derives `JsonSchema`; schemars presumably surfaces
// the `///` comments below as schema descriptions, so treat them as
// user-facing text when editing — confirm against the generated schema.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
pub struct MarkdownBuilderConfig {
    /// Directory containing `.md` files to render.
    ///
    /// Every `.md` file in the directory is rendered to a corresponding
    /// `.html` file in the output directory, preserving the subdirectory
    /// structure.  Non-Markdown files referenced (linked or embedded) inside
    /// any Markdown source — images, PDFs, downloadable assets, etc. — are
    /// copied into the output directory next to the rendered HTML so that all
    /// relative URLs remain valid.
    ///
    /// Defaults to `"."` (the current working directory).
    // Only assets that resolve to an existing file *inside* this directory are
    // copied; see `collect_referenced_files`. The path is canonicalized by
    // `MarkdownBuilder::build` before use.
    pub input: Option<PathBuf>,

    /// Directory to write rendered files into.
    ///
    /// Defaults to `<input-name>-html` placed next to the input directory.
    ///
    /// ```toml
    /// [[builders]]
    /// type   = "markdown"
    /// input  = "docs/"
    /// output = "public/docs/"
    /// ```
    // The default is computed in `build_directory` from the canonical input
    // path; when that path has no final component the name falls back to
    // `html`.
    pub output: Option<PathBuf>,

    /// Also descend into subdirectories of `input`.
    ///
    /// Defaults to `true`. Files matched by a `.gitignore` in the directory
    /// hierarchy are always excluded, mirroring the behaviour of the `archive`
    /// builder.
    // When `false`, `collect_md_files` limits the walk to `max_depth = 1`.
    // NOTE(review): the `ignore` crate's `WalkBuilder` is used with its default
    // git settings; those defaults may only honour `.gitignore` inside a git
    // repository — confirm that "always excluded" holds outside one.
    #[serde(default = "default_recursive")]
    pub recursive: bool,
}

impl Default for MarkdownBuilderConfig {
    fn default() -> Self {
        Self {
            input: None,
            output: None,
            recursive: default_recursive(),
        }
    }
}

/// Renders a directory of Markdown files (`.md`) to standalone HTML documents.
///
/// The output directory mirrors the input directory's structure: each `.md`
/// file becomes a `.html` file at the same relative path.  Any non-Markdown
/// file referenced by a local link or image embed in a source document is
/// copied to the output directory at the same relative path, keeping all URLs
/// intact.
///
/// Each page is wrapped in the Tera template returned by `load_tera`; its
/// title is the text of the document's first heading, or the file stem when
/// the document has no heading. The builder itself carries no state — all
/// settings arrive through [`MarkdownBuilderConfig`].
pub struct MarkdownBuilder;

impl Builder for MarkdownBuilder {
    type ConfigType = MarkdownBuilderConfig;

    /// Render the configured Markdown directory to HTML.
    ///
    /// `config.input` defaults to `"."` and is canonicalized before use so
    /// that later prefix checks compare like with like. `_version` is unused
    /// by this builder. Progress lines are sent through `log`.
    ///
    /// Returns a single [`ArtifactPath`] describing the output directory.
    ///
    /// # Errors
    ///
    /// - the input path does not exist, cannot be resolved, or is not a
    ///   directory;
    /// - the Tera template cannot be loaded (see `load_tera`);
    /// - any failure reported by `build_directory`.
    async fn build(
        &self,
        config: Self::ConfigType,
        _version: &str,
        log: LogSender,
    ) -> Result<Vec<ArtifactPath>> {
        let requested = config.input.unwrap_or_else(|| PathBuf::from("."));

        // `canonicalize` itself fails on a missing path, so the "does not
        // exist" case has to be recognised here; checking after the fact can
        // never trigger and would leave the user with a path-less I/O error.
        let input = match requested.canonicalize() {
            Ok(resolved) => resolved,
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
                return Err(miette!(
                    "markdown builder input directory does not exist: {}",
                    requested.display()
                ));
            }
            Err(err) => {
                return Err(miette!(
                    "markdown builder could not resolve input {}: {}",
                    requested.display(),
                    err
                ));
            }
        };

        if !input.is_dir() {
            return Err(miette!(
                "markdown builder input must be a directory, got a file: {}",
                input.display()
            ));
        }

        // Load the Tera template once per builder invocation and share it
        // across all files so template parsing only happens once.
        let tera = load_tera()?;

        build_directory(&input, config.output, config.recursive, &log, &tera).await
    }
}

/// Build the Tera instance used for one builder invocation.
///
/// Exactly one template is registered, always under [`TERA_NAME`]:
/// - the file `.abbaye/theme/markdown.html.j2` (relative to the current
///   working directory) when it exists as a regular file, or
/// - the compiled-in [`TEMPLATE_MARKDOWN`] otherwise.
///
/// The original notes describe this as the same override mechanism the site
/// templates use (`root_index.html.j2` / `version_index.html.j2`).
///
/// # Errors
///
/// Returns the Tera error (converted to a diagnostic) when the chosen
/// template cannot be read or parsed.
fn load_tera() -> Result<Tera> {
    let mut engine = Tera::default();

    // `.abbaye/theme/<THEME_FILENAME>`, assembled component by component.
    let override_path: PathBuf = [".abbaye", "theme", THEME_FILENAME].iter().collect();

    let registration = if override_path.is_file() {
        engine.add_template_file(&override_path, Some(TERA_NAME))
    } else {
        engine.add_raw_template(TERA_NAME, TEMPLATE_MARKDOWN)
    };
    registration.into_diagnostic()?;

    Ok(engine)
}

// ── Directory rendering ───────────────────────────────────────────────────────

/// Render every Markdown file under `input_dir` and copy the local assets
/// those files reference.
///
/// - `input_dir` — source directory; expected to be canonical, as the
///   relative output paths are derived with `strip_prefix(input_dir)`.
/// - `output` — destination directory; when `None`, `<input-name>-html` next
///   to `input_dir` is used (`html` if `input_dir` has no final component).
/// - `recursive` — whether subdirectories are walked.
/// - `log` — receives one progress line per rendered or copied file; send
///   failures are deliberately ignored.
/// - `tera` — template engine holding the page template.
///
/// Returns a single artifact describing the output directory, even when no
/// `.md` file was found (a warning line is logged in that case).
///
/// # Errors
///
/// Fails on any filesystem error, on a failed directory walk, or when the
/// template cannot be rendered.
async fn build_directory(
    input_dir: &Path,
    output: Option<PathBuf>,
    recursive: bool,
    log: &LogSender,
    tera: &Tera,
) -> Result<Vec<ArtifactPath>> {
    let output_dir = output.unwrap_or_else(|| {
        let stem = input_dir
            .file_name()
            .map(|n| format!("{}-html", n.to_string_lossy()))
            .unwrap_or_else(|| "html".to_owned());
        input_dir.parent().unwrap_or(Path::new(".")).join(stem)
    });

    tokio::fs::create_dir_all(&output_dir)
        .await
        .into_diagnostic()?;

    // The ignore::WalkBuilder API is synchronous, so run it on the blocking
    // thread pool to avoid stalling the async runtime.
    let md_files = tokio::task::spawn_blocking({
        let input_dir = input_dir.to_owned();
        move || collect_md_files(&input_dir, recursive)
    })
    .await
    .into_diagnostic()??;

    if md_files.is_empty() {
        let _ = log.send(LogEvent::Line(format!(
            "warning: no .md files found in {}",
            input_dir.display()
        )));
        return Ok(vec![directory_artifact(&output_dir)]);
    }

    // Map from canonical source path → destination path under `output_dir`,
    // built up while rendering so that files referenced by multiple documents
    // are only copied once.
    let mut files_to_copy: HashMap<PathBuf, PathBuf> = HashMap::new();

    for md_path in &md_files {
        let relative = md_path.strip_prefix(input_dir).into_diagnostic()?;
        let out_path = output_dir.join(relative).with_extension("html");

        // Ensure any intermediate subdirectories exist.
        if let Some(parent) = out_path.parent() {
            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
        }

        let _ = log.send(LogEvent::Line(format!(
            "{} → {}",
            md_path.display(),
            out_path.display()
        )));

        let md = tokio::fs::read_to_string(md_path).await.into_diagnostic()?;

        // Collect local asset references before rendering so we can copy them.
        for (src, rel) in collect_referenced_files(&md, md_path, input_dir) {
            files_to_copy
                .entry(src)
                .or_insert_with(|| output_dir.join(rel));
        }

        let title = extract_title(&md).unwrap_or_else(|| file_stem_string(md_path));
        let document = render_template(tera, &title, &render_markdown(&md))?;

        tokio::fs::write(&out_path, document.as_bytes())
            .await
            .into_diagnostic()?;
    }

    // HashMap iteration order is arbitrary; sort so the copy order (and the
    // log output) is the same on every run.
    let mut copies: Vec<(PathBuf, PathBuf)> = files_to_copy.into_iter().collect();
    copies.sort();

    // Copy every referenced asset, creating parent directories as needed.
    for (src, dest) in &copies {
        // When the output directory resolves to the input directory the
        // destination *is* the source. Copying a file onto itself can
        // truncate it, so leave such assets alone. A destination that does
        // not exist yet simply fails to canonicalize and is copied normally.
        if let Ok(existing) = tokio::fs::canonicalize(dest).await {
            if existing == *src {
                let _ = log.send(LogEvent::Line(format!(
                    "skipping {} (already in place)",
                    src.display()
                )));
                continue;
            }
        }

        if let Some(parent) = dest.parent() {
            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
        }

        let _ = log.send(LogEvent::Line(format!(
            "copying {} → {}",
            src.display(),
            dest.display()
        )));

        tokio::fs::copy(src, dest).await.into_diagnostic()?;
    }

    Ok(vec![directory_artifact(&output_dir)])
}

/// Describe the rendered output directory `dir` as a build artifact (no hash).
fn directory_artifact(dir: &Path) -> ArtifactPath {
    ArtifactPath {
        path: dir.to_owned(),
        name: dir_name_string(dir),
        hash: None,
    }
}

// ── Helpers ───────────────────────────────────────────────────────────────────

/// Gather the `.md` files found under `dir`, sorted so that the result (and
/// therefore the build output) is reproducible.
///
/// The walk is delegated to the `ignore` crate's `WalkBuilder`, so whatever
/// ignore rules that walker applies (e.g. `.gitignore`) are honoured. Hidden
/// entries are explicitly included.
///
/// When `recursive` is `false` only the top level of `dir` is visited
/// (`max_depth = 1`).
///
/// # Errors
///
/// Returns the first error produced by the directory walk.
fn collect_md_files(dir: &Path, recursive: bool) -> Result<Vec<PathBuf>> {
    // `None` lifts the depth limit; `Some(1)` stops at `dir`'s direct children.
    let depth_limit = if recursive { None } else { Some(1) };

    let mut builder = WalkBuilder::new(dir);
    builder
        .max_depth(depth_limit)
        // Include dotfiles (e.g. .github/CONTRIBUTING.md).
        .hidden(false);

    let mut markdown_paths = builder
        .build()
        .filter_map(|walked| match walked {
            Ok(entry) => {
                let candidate = entry.into_path();
                let is_markdown = candidate.is_file()
                    && candidate.extension().and_then(|ext| ext.to_str()) == Some("md");
                is_markdown.then_some(Ok(candidate))
            }
            // Keep walk errors in the stream so the first one aborts the
            // collection below.
            Err(err) => Some(Err(err)),
        })
        .collect::<std::result::Result<Vec<PathBuf>, _>>()
        .into_diagnostic()?;

    markdown_paths.sort();
    Ok(markdown_paths)
}

/// Parse `md` for local link and image targets that are not other Markdown
/// files, resolve them relative to `md_path`'s parent directory, and return
/// those that exist as files within `input_dir`.
///
/// Returns a list of `(absolute_source_path, relative_path_from_input_dir)`
/// pairs.  The caller uses the relative path to mirror the asset at the same
/// position inside the output directory, keeping all relative URLs in the
/// rendered HTML valid.  The same asset may appear more than once when a
/// document references it repeatedly; de-duplication is left to the caller.
///
/// The document is parsed with the same extensions as [`render_markdown`] so
/// that links are discovered exactly where the rendered HTML will contain
/// them (including inside footnote definitions).
///
/// Skipped silently (nothing is logged):
/// - Remote URLs (`://`), `data:` URIs, and fragment-only refs (`#…`).
/// - `.md` files — those are rendered to `.html`, not copied.
/// - Refs that do not resolve to an existing file.
/// - Refs that resolve to a file outside `input_dir`.
///
/// A trailing `#fragment` or `?query` is removed before the target is treated
/// as a path. Percent-encoded characters are *not* decoded, so a ref such as
/// `my%20file.png` only matches a file with that literal name.
fn collect_referenced_files(md: &str, md_path: &Path, input_dir: &Path) -> Vec<(PathBuf, PathBuf)> {
    let md_dir = md_path.parent().unwrap_or(Path::new("."));

    // Resolved targets are canonical (symlinks followed), so the containment
    // check below must compare against a canonical root as well. Otherwise
    // every asset is rejected whenever `input_dir` is reached through a
    // symlink or given as a relative path.
    let root = input_dir
        .canonicalize()
        .unwrap_or_else(|_| input_dir.to_owned());

    // Keep in sync with `render_markdown`.
    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_FOOTNOTES;
    let mut result = Vec::new();

    for event in Parser::new_ext(md, opts) {
        let url = match event {
            Event::Start(Tag::Image { dest_url, .. }) | Event::Start(Tag::Link { dest_url, .. }) => {
                dest_url
            }
            _ => continue,
        };
        let s = url.as_ref();

        // Skip remote URLs, data URIs, and fragment-only refs.
        if s.is_empty() || s.starts_with('#') || s.starts_with("data:") || s.contains("://") {
            continue;
        }

        // Strip any trailing query string / fragment before treating the
        // string as a path; neither is part of the file name.
        let path_part = s.split(|c| c == '#' || c == '?').next().unwrap_or(s);
        if path_part.is_empty() {
            continue;
        }

        // Skip links to other Markdown files — those will be rendered to
        // .html and don't need to be copied as assets.
        if Path::new(path_part).extension().and_then(|e| e.to_str()) == Some("md") {
            continue;
        }

        // Resolve the ref relative to the containing markdown file's directory.
        // A leading `/` is treated as relative to input_dir, not the filesystem
        // root, which is the most useful interpretation for a docs directory.
        let candidate = if path_part.starts_with('/') {
            root.join(path_part.trim_start_matches('/'))
        } else {
            md_dir.join(path_part)
        };

        // canonicalize() fails if the path does not exist.
        let Ok(abs) = candidate.canonicalize() else {
            continue;
        };

        if !abs.is_file() {
            continue;
        }

        // Only copy assets that live inside the input directory; assets
        // outside it are silently skipped (they cannot be given a stable
        // relative output path).
        let Ok(relative) = abs.strip_prefix(&root) else {
            continue;
        };
        let relative = relative.to_owned();

        result.push((abs, relative));
    }

    result
}

/// Render Markdown source `md` into an HTML fragment (no `<html>`/`<body>`
/// wrapper — that is the template's job).
///
/// Tables, strikethrough, and footnotes are enabled on top of the parser's
/// defaults; this is described as a superset of what [`crate::site`]'s own
/// renderer uses.
fn render_markdown(md: &str) -> String {
    let mut extensions = Options::empty();
    extensions.insert(Options::ENABLE_TABLES);
    extensions.insert(Options::ENABLE_STRIKETHROUGH);
    extensions.insert(Options::ENABLE_FOOTNOTES);

    let mut rendered = String::new();
    html::push_html(&mut rendered, Parser::new_ext(md, extensions));
    rendered
}

/// Scan a Markdown string for the first heading of any level and return its
/// plain-text content. Used to populate the `<title>` element.
///
/// Handles headings that contain inline emphasis, code spans, or other inline
/// elements by concatenating all `Text` and `Code` events seen between the
/// opening and closing heading tags. Line breaks inside a multi-line (setext)
/// heading become a single space so the words on either side are not glued
/// together.
///
/// The document is parsed with the same extensions as [`render_markdown`] so
/// the title is taken from the same heading — and the same inline structure —
/// that ends up in the rendered page.
///
/// Returns `None` when the document has no heading, or when its first heading
/// contains no text.
fn extract_title(md: &str) -> Option<String> {
    // Keep in sync with `render_markdown`.
    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_FOOTNOTES;
    let mut in_heading = false;
    let mut title = String::new();

    for event in Parser::new_ext(md, opts) {
        match event {
            Event::Start(Tag::Heading { .. }) => in_heading = true,
            // Only the first heading matters; stop as soon as it closes.
            Event::End(TagEnd::Heading(_)) => break,
            Event::Text(text) | Event::Code(text) if in_heading => title.push_str(&text),
            Event::SoftBreak | Event::HardBreak if in_heading => title.push(' '),
            _ => {}
        }
    }

    if title.is_empty() { None } else { Some(title) }
}

/// Produce the final HTML document by rendering the template registered as
/// [`TERA_NAME`] with two variables:
///
/// - `title` — inserted as a plain string; Tera auto-escapes it when the
///   template uses `{{ title }}`.
/// - `content` — the already-rendered HTML fragment; the template must emit
///   it with `{{ content | safe }}` to avoid double escaping.
///
/// # Errors
///
/// Returns the Tera rendering error converted to a diagnostic.
fn render_template(tera: &Tera, title: &str, content: &str) -> Result<String> {
    let mut variables = Context::new();
    for (key, value) in [("title", title), ("content", content)] {
        variables.insert(key, value);
    }

    let rendered = tera.render(TERA_NAME, &variables);
    rendered.into_diagnostic()
}

/// Return `path`'s file stem (final component without its last extension) as
/// an owned `String`, converting lossily if it is not valid UTF-8.
///
/// Falls back to `"Document"` when the path has no file name, e.g. `/`, an
/// empty path, or a path ending in `..`.
fn file_stem_string(path: &Path) -> String {
    match path.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => String::from("Document"),
    }
}

/// Return the final component of `path` as an owned `String`, converting
/// lossily if it is not valid UTF-8.
///
/// Falls back to `"html"` when the path has no final component, e.g. `/`, an
/// empty path, or a path ending in `..`.
fn dir_name_string(path: &Path) -> String {
    match path.file_name() {
        Some(name) => name.to_string_lossy().into_owned(),
        None => String::from("html"),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Build a temporary directory tree:
    ///
    /// ```text
    /// <tmpdir>/
    ///   docs/
    ///     page.md         ← references image.png, guide.pdf, other.md, https://…
    ///     image.png       ← local asset (should be collected)
    ///     guide.pdf       ← local asset (should be collected)
    ///     other.md        ← .md file   (should be skipped)
    ///     sub/
    ///       nested.md     ← references ../image.png (should be collected)
    /// ```
    ///
    /// Returns the temp-dir guard (dropping it deletes the tree) and the
    /// canonical path of `docs/`. The path is canonicalized because the
    /// builder always hands a canonical input directory to the helpers, and
    /// because temp directories can sit behind a symlink, which would
    /// otherwise make every "inside input_dir" check fail.
    fn make_test_tree() -> (tempfile::TempDir, PathBuf) {
        let tmp = tempfile::tempdir().expect("tempdir");
        let docs = tmp.path().join("docs");
        fs::create_dir_all(docs.join("sub")).unwrap();
        let docs = docs.canonicalize().expect("canonicalize docs dir");
        fs::write(docs.join("image.png"), b"PNG").unwrap();
        fs::write(docs.join("guide.pdf"), b"PDF").unwrap();
        fs::write(docs.join("other.md"), b"# Other").unwrap();
        fs::write(
            docs.join("page.md"),
            b"# Hello\n\
              ![logo](image.png)\n\
              [guide](guide.pdf)\n\
              [other](other.md)\n\
              [remote](https://example.com)\n\
              [frag](#section)\n",
        )
        .unwrap();
        fs::write(
            docs.join("sub").join("nested.md"),
            b"# Nested\n![logo](../image.png)\n",
        )
        .unwrap();
        (tmp, docs)
    }

    #[test]
    fn collects_images_and_non_md_links() {
        let (_tmp, docs) = make_test_tree();
        let md_path = docs.join("page.md");
        let md = fs::read_to_string(&md_path).unwrap();

        let refs = collect_referenced_files(&md, &md_path, &docs);
        let sources: Vec<_> = refs.iter().map(|(src, _)| src.clone()).collect();

        let img = docs.join("image.png").canonicalize().unwrap();
        let pdf = docs.join("guide.pdf").canonicalize().unwrap();
        let other = docs.join("other.md").canonicalize().unwrap();

        assert!(sources.contains(&img), "image.png should be collected");
        assert!(sources.contains(&pdf), "guide.pdf should be collected");
        assert!(!sources.contains(&other), "other.md should be skipped");
        // Remote URL and fragment-only link must not appear.
        assert_eq!(refs.len(), 2, "expected exactly 2 assets (image + pdf)");
    }

    #[test]
    fn relative_paths_from_subdirectory_are_resolved() {
        let (_tmp, docs) = make_test_tree();
        let md_path = docs.join("sub").join("nested.md");
        let md = fs::read_to_string(&md_path).unwrap();

        let refs = collect_referenced_files(&md, &md_path, &docs);

        assert_eq!(refs.len(), 1, "expected exactly 1 asset");
        let (src, rel) = &refs[0];
        assert_eq!(
            src,
            &docs.join("image.png").canonicalize().unwrap(),
            "source should be docs/image.png"
        );
        assert_eq!(
            rel,
            &PathBuf::from("image.png"),
            "relative path should be image.png (relative to docs/)"
        );
    }

    #[test]
    fn assets_outside_input_dir_are_skipped() {
        let (tmp, docs) = make_test_tree();
        // Exists on disk, but lives next to `docs/` rather than inside it.
        fs::write(tmp.path().join("outside.txt"), b"nope").unwrap();
        let md_path = docs.join("page.md");

        let refs = collect_referenced_files("[out](../outside.txt)\n", &md_path, &docs);

        assert!(
            refs.is_empty(),
            "files outside the input directory must not be collected"
        );
    }

    #[test]
    fn extract_title_finds_first_heading() {
        assert_eq!(
            extract_title("# Hello World\n\nsome text"),
            Some("Hello World".to_owned())
        );
    }

    #[test]
    fn extract_title_accepts_any_heading_level() {
        assert_eq!(
            extract_title("intro paragraph\n\n## Second level\n\n# Later\n"),
            Some("Second level".to_owned())
        );
    }

    #[test]
    fn extract_title_handles_inline_code_in_heading() {
        assert_eq!(
            extract_title("# Use `foo()` wisely"),
            Some("Use foo() wisely".to_owned())
        );
    }

    #[test]
    fn extract_title_returns_none_when_no_heading() {
        assert_eq!(extract_title("just a paragraph"), None);
    }
}