// abbaye/builders/markdown.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3
4use ignore::WalkBuilder;
5use miette::{IntoDiagnostic, Result, miette};
6use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd, html};
7use schemars::JsonSchema;
8use serde::{Deserialize, Serialize};
9use tera::{Context, Tera};
10
11use crate::builders::{ArtifactPath, Builder, LogEvent, LogSender};
12
/// The default Tera template used to wrap rendered Markdown content in a
/// complete HTML5 document. Exposed as a `pub const` so that `abbaye
/// dump-theme` can write it to `.abbaye/theme/markdown.html.j2`.
///
/// The template text is embedded at compile time via `include_str!`, so the
/// binary does not need the template file on disk at runtime.
///
/// Template variables:
/// - `{{ title }}` — plain-text page title (auto-escaped by Tera).
/// - `{{ content | safe }}` — the rendered HTML body fragment.
pub const TEMPLATE_MARKDOWN: &str = include_str!("../templates/markdown.html.j2");

/// Filename looked up inside `.abbaye/theme/` at runtime.
///
/// When a file with this name exists there, it replaces the built-in
/// [`TEMPLATE_MARKDOWN`].
const THEME_FILENAME: &str = "markdown.html.j2";
/// Name under which the template is registered inside the Tera instance.
///
/// The same name is used as the lookup key when rendering, regardless of
/// whether the built-in template or a theme override was loaded.
const TERA_NAME: &str = "markdown.html";
26
/// Serde default for [`MarkdownBuilderConfig::recursive`]: subdirectories are
/// walked unless the configuration explicitly turns that off.
fn default_recursive() -> bool {
    true
}
30
/// Configuration for [`MarkdownBuilder`].
// NOTE(review): `JsonSchema` is derived here, so the `///` comments on this
// struct and its fields are presumably surfaced as schema descriptions —
// confirm before rewording them, as the text may be user-visible.
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
pub struct MarkdownBuilderConfig {
    /// Directory containing `.md` files to render.
    ///
    /// Every `.md` file in the directory is rendered to a corresponding
    /// `.html` file in the output directory, preserving the subdirectory
    /// structure.  Non-Markdown files referenced (linked or embedded) inside
    /// any Markdown source — images, PDFs, downloadable assets, etc. — are
    /// copied into the output directory next to the rendered HTML so that all
    /// relative URLs remain valid.
    ///
    /// Defaults to `"."` (the current working directory).
    pub input: Option<PathBuf>,

    /// Directory to write rendered files into.
    ///
    /// Defaults to `<input-name>-html` placed next to the input directory.
    ///
    /// ```toml
    /// [[builders]]
    /// type   = "markdown"
    /// input  = "docs/"
    /// output = "public/docs/"
    /// ```
    pub output: Option<PathBuf>,

    /// Also descend into subdirectories of `input`.
    ///
    /// Defaults to `true`. Files matched by a `.gitignore` in the directory
    /// hierarchy are always excluded, mirroring the behaviour of the `archive`
    /// builder.
    // A bare `#[serde(default)]` would fall back to `bool::default()`, i.e.
    // `false`; the named function makes an omitted key mean `true`, which
    // keeps deserialization in agreement with the `Default` impl.
    #[serde(default = "default_recursive")]
    pub recursive: bool,
}
66
67impl Default for MarkdownBuilderConfig {
68    fn default() -> Self {
69        Self {
70            input: None,
71            output: None,
72            recursive: default_recursive(),
73        }
74    }
75}
76
/// Renders a directory of Markdown files (`.md`) to standalone HTML documents.
///
/// The output directory mirrors the input directory's structure: each `.md`
/// file becomes a `.html` file at the same relative path.  Any non-Markdown
/// file referenced by a local link or image embed in a source document is
/// copied to the output directory at the same relative path, keeping all URLs
/// intact.
///
/// Known limitation: link targets are emitted exactly as written in the
/// source. A link to another Markdown document (`[x](other.md)`) is neither
/// rewritten to `other.html` nor copied as an asset, so such links do not
/// resolve inside the output directory.
pub struct MarkdownBuilder;
85
86impl Builder for MarkdownBuilder {
87    type ConfigType = MarkdownBuilderConfig;
88
89    async fn build(
90        &self,
91        config: Self::ConfigType,
92        _version: &str,
93        log: LogSender,
94    ) -> Result<Vec<ArtifactPath>> {
95        let input = config
96            .input
97            .unwrap_or_else(|| PathBuf::from("."))
98            .canonicalize()
99            .into_diagnostic()?;
100
101        if !input.is_dir() {
102            return if input.exists() {
103                Err(miette!(
104                    "markdown builder input must be a directory, got a file: {}",
105                    input.display()
106                ))
107            } else {
108                Err(miette!(
109                    "markdown builder input directory does not exist: {}",
110                    input.display()
111                ))
112            };
113        }
114
115        // Load the Tera template once per builder invocation and share it
116        // across all files so template parsing only happens once.
117        let tera = load_tera()?;
118
119        build_directory(&input, config.output, config.recursive, &log, &tera).await
120    }
121}
122
123/// Load the Tera instance for this builder invocation.
124///
125/// Checks whether `.abbaye/theme/markdown.html.j2` exists and loads that
126/// file when present; otherwise falls back to the compiled-in
127/// [`TEMPLATE_MARKDOWN`] constant — exactly the same override mechanism
128/// used by the site templates (`root_index.html.j2` / `version_index.html.j2`).
129fn load_tera() -> Result<Tera> {
130    let theme_path = PathBuf::from(".abbaye").join("theme");
131    let theme_file = theme_path.join(THEME_FILENAME);
132    let mut tera = Tera::default();
133    if theme_file.is_file() {
134        tera.add_template_file(&theme_file, Some(TERA_NAME))
135            .into_diagnostic()?;
136    } else {
137        tera.add_raw_template(TERA_NAME, TEMPLATE_MARKDOWN)
138            .into_diagnostic()?;
139    }
140    crate::site::load_extra_theme_templates(&mut tera, &theme_path, &[TERA_NAME])?;
141    Ok(tera)
142}
143
144// ── Directory rendering ───────────────────────────────────────────────────────
145
146async fn build_directory(
147    input_dir: &Path,
148    output: Option<PathBuf>,
149    recursive: bool,
150    log: &LogSender,
151    tera: &Tera,
152) -> Result<Vec<ArtifactPath>> {
153    let output_dir = output.unwrap_or_else(|| {
154        let stem = input_dir
155            .file_name()
156            .map(|n| format!("{}-html", n.to_string_lossy()))
157            .unwrap_or_else(|| "html".to_owned());
158        input_dir.parent().unwrap_or(Path::new(".")).join(stem)
159    });
160
161    tokio::fs::create_dir_all(&output_dir)
162        .await
163        .into_diagnostic()?;
164
165    // The ignore::WalkBuilder API is synchronous, so run it on the blocking
166    // thread pool to avoid stalling the async runtime.
167    let md_files = tokio::task::spawn_blocking({
168        let input_dir = input_dir.to_owned();
169        move || collect_md_files(&input_dir, recursive)
170    })
171    .await
172    .into_diagnostic()??;
173
174    if md_files.is_empty() {
175        let _ = log.send(LogEvent::Line(format!(
176            "warning: no .md files found in {}",
177            input_dir.display()
178        )));
179        return Ok(vec![ArtifactPath {
180            path: output_dir.clone(),
181            name: dir_name_string(&output_dir),
182            hash: None,
183        }]);
184    }
185
186    // Map from absolute source path → absolute destination path, built up
187    // while rendering so that files referenced by multiple documents are only
188    // copied once.
189    let mut files_to_copy: HashMap<PathBuf, PathBuf> = HashMap::new();
190
191    for md_path in &md_files {
192        let relative = md_path.strip_prefix(input_dir).into_diagnostic()?;
193        let out_path = output_dir.join(relative).with_extension("html");
194
195        // Ensure any intermediate subdirectories exist.
196        if let Some(parent) = out_path.parent() {
197            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
198        }
199
200        let _ = log.send(LogEvent::Line(format!(
201            "{} → {}",
202            md_path.display(),
203            out_path.display()
204        )));
205
206        let md = tokio::fs::read_to_string(md_path).await.into_diagnostic()?;
207
208        // Collect local asset references before rendering so we can copy them.
209        for (src, rel) in collect_referenced_files(&md, md_path, input_dir) {
210            files_to_copy
211                .entry(src)
212                .or_insert_with(|| output_dir.join(rel));
213        }
214
215        let title = extract_title(&md).unwrap_or_else(|| file_stem_string(md_path));
216        let document = render_template(tera, &title, &render_markdown(&md))?;
217
218        tokio::fs::write(&out_path, document.as_bytes())
219            .await
220            .into_diagnostic()?;
221    }
222
223    // Copy every referenced asset, creating parent directories as needed.
224    for (src, dest) in &files_to_copy {
225        if let Some(parent) = dest.parent() {
226            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
227        }
228
229        let _ = log.send(LogEvent::Line(format!(
230            "copying {} → {}",
231            src.display(),
232            dest.display()
233        )));
234
235        tokio::fs::copy(src, dest).await.into_diagnostic()?;
236    }
237
238    Ok(vec![ArtifactPath {
239        path: output_dir.clone(),
240        name: dir_name_string(&output_dir),
241        hash: None,
242    }])
243}
244
245// ── Helpers ───────────────────────────────────────────────────────────────────
246
247/// Walk `dir` for `.md` files, honouring `.gitignore` rules (via the `ignore`
248/// crate). Returns paths in a stable, sorted order so output is reproducible.
249///
250/// When `recursive` is `false` only the top level of `dir` is visited
251/// (`max_depth = 1`).
252fn collect_md_files(dir: &Path, recursive: bool) -> Result<Vec<PathBuf>> {
253    let mut files = Vec::new();
254
255    let walker = WalkBuilder::new(dir)
256        .max_depth(if recursive { None } else { Some(1) })
257        // Include dotfiles (e.g. .github/CONTRIBUTING.md).
258        .hidden(false)
259        .build();
260
261    for result in walker {
262        let entry = result.into_diagnostic()?;
263        let path = entry.into_path();
264        if path.is_file() && path.extension().and_then(|e| e.to_str()) == Some("md") {
265            files.push(path);
266        }
267    }
268
269    files.sort();
270    Ok(files)
271}
272
273/// Parse `md` for local link and image targets that are not other Markdown
274/// files, resolve them relative to `md_path`'s parent directory, and return
275/// those that exist as files within `input_dir`.
276///
277/// Returns a list of `(absolute_source_path, relative_path_from_input_dir)`
278/// pairs.  The caller uses the relative path to mirror the asset at the same
279/// position inside the output directory, keeping all relative URLs in the
280/// rendered HTML valid.
281///
282/// Skipped silently:
283/// - Remote URLs (`://`), data URIs, and fragment-only refs (`#…`).
284/// - `.md` files — those are rendered to `.html`, not copied.
285/// - Refs that do not resolve to an existing file.
286/// - Refs that resolve to a file outside `input_dir` (a warning is logged
287///   to the caller instead).
288fn collect_referenced_files(md: &str, md_path: &Path, input_dir: &Path) -> Vec<(PathBuf, PathBuf)> {
289    let md_dir = md_path.parent().unwrap_or(Path::new("."));
290    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH;
291    let mut result = Vec::new();
292
293    for event in Parser::new_ext(md, opts) {
294        let url: Option<pulldown_cmark::CowStr> = match event {
295            Event::Start(Tag::Image { dest_url, .. }) => Some(dest_url),
296            Event::Start(Tag::Link { dest_url, .. }) => Some(dest_url),
297            _ => None,
298        };
299
300        let Some(url) = url else { continue };
301        let s = url.as_ref();
302
303        // Skip remote URLs, data URIs, and fragment-only refs.
304        if s.contains("://") || s.starts_with('#') || s.is_empty() {
305            continue;
306        }
307
308        // Strip any trailing fragment before treating the string as a path.
309        let path_part = s.split('#').next().unwrap_or(s);
310        if path_part.is_empty() {
311            continue;
312        }
313
314        // Skip links to other Markdown files — those will be rendered to
315        // .html and don't need to be copied as assets.
316        if Path::new(path_part).extension().and_then(|e| e.to_str()) == Some("md") {
317            continue;
318        }
319
320        // Resolve the ref relative to the containing markdown file's directory.
321        // A leading `/` is treated as relative to input_dir, not the filesystem
322        // root, which is the most useful interpretation for a docs directory.
323        let abs = if path_part.starts_with('/') {
324            input_dir.join(path_part.trim_start_matches('/'))
325        } else {
326            md_dir.join(path_part)
327        };
328
329        // canonicalize() fails if the path does not exist.
330        let Ok(abs) = abs.canonicalize() else {
331            continue;
332        };
333
334        if !abs.is_file() {
335            continue;
336        }
337
338        // Only copy assets that live inside the input directory; assets
339        // outside it are silently skipped (they cannot be given a stable
340        // relative output path).
341        let Ok(relative) = abs.strip_prefix(input_dir) else {
342            continue;
343        };
344        let relative = relative.to_owned();
345
346        result.push((abs, relative));
347    }
348
349    result
350}
351
352/// Convert a Markdown string to an HTML fragment.
353///
354/// Enables tables, strikethrough, and footnotes — a superset of what
355/// [`crate::site`]'s own renderer uses.
356fn render_markdown(md: &str) -> String {
357    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_FOOTNOTES;
358    let parser = Parser::new_ext(md, opts);
359    let mut buf = String::new();
360    html::push_html(&mut buf, parser);
361    buf
362}
363
364/// Scan a Markdown string for the first heading of any level and return its
365/// plain-text content. Used to populate the `<title>` element.
366///
367/// Handles headings that contain inline emphasis, code spans, or other inline
368/// elements by concatenating all `Text` and `Code` events seen between the
369/// opening and closing heading tags.
370fn extract_title(md: &str) -> Option<String> {
371    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH;
372    let mut parser = Parser::new_ext(md, opts);
373    let mut in_heading = false;
374    let mut title = String::new();
375
376    loop {
377        match parser.next()? {
378            Event::Start(Tag::Heading { .. }) => {
379                in_heading = true;
380            }
381            Event::End(TagEnd::Heading(_)) => break,
382            Event::Text(text) | Event::Code(text) if in_heading => {
383                title.push_str(&text);
384            }
385            _ => {}
386        }
387    }
388
389    if title.is_empty() { None } else { Some(title) }
390}
391
392/// Render the Tera template with the given `title` and HTML `content`.
393///
394/// `title` is passed as a plain string; Tera auto-escapes it when inserted
395/// into `{{ title }}`. `content` is the already-rendered HTML fragment and
396/// must be inserted with `{{ content | safe }}` in the template.
397fn render_template(tera: &Tera, title: &str, content: &str) -> Result<String> {
398    let mut ctx = Context::new();
399    ctx.insert("title", title);
400    ctx.insert("content", content);
401    tera.render(TERA_NAME, &ctx).into_diagnostic()
402}
403
/// Return `path`'s file stem (file name without its final extension) as an
/// owned `String`, converting lossily if it is not valid UTF-8.
///
/// Falls back to `"Document"` when the path has no file name.
fn file_stem_string(path: &Path) -> String {
    match path.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => String::from("Document"),
    }
}
410
/// Return the final component of `path` as an owned `String`, converting
/// lossily if it is not valid UTF-8.
///
/// Falls back to `"html"` when the path has no final component (for example
/// the filesystem root or a path ending in `..`).
fn dir_name_string(path: &Path) -> String {
    match path.file_name() {
        Some(name) => name.to_string_lossy().into_owned(),
        None => String::from("html"),
    }
}
417
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Build a temporary directory tree and return it together with the
    /// canonical path of its `docs/` directory:
    ///
    /// ```text
    /// <tmpdir>/
    ///   docs/
    ///     page.md         ← references image.png, guide.pdf, other.md, https://…
    ///     image.png       ← local asset (should be collected)
    ///     guide.pdf       ← local asset (should be collected)
    ///     other.md        ← .md file   (should be skipped)
    ///     sub/
    ///       nested.md     ← references ../image.png (should be collected)
    /// ```
    ///
    /// The returned `TempDir` must be kept alive for as long as the tree is
    /// used; dropping it removes the directory.
    fn make_test_tree() -> (tempfile::TempDir, PathBuf) {
        let tmp = tempfile::tempdir().expect("tempdir");
        let docs = tmp.path().join("docs");
        fs::create_dir_all(docs.join("sub")).unwrap();

        // The builder always hands a canonical input directory to the code
        // under test. Temp directories can sit behind a symlink (e.g. on
        // macOS), so canonicalize here to exercise the same situation.
        let docs = docs.canonicalize().expect("canonicalize docs");

        fs::write(docs.join("image.png"), b"PNG").unwrap();
        fs::write(docs.join("guide.pdf"), b"PDF").unwrap();
        fs::write(docs.join("other.md"), b"# Other").unwrap();
        fs::write(
            docs.join("page.md"),
            b"# Hello\n\
              ![logo](image.png)\n\
              [guide](guide.pdf)\n\
              [other](other.md)\n\
              [remote](https://example.com)\n\
              [frag](#section)\n",
        )
        .unwrap();
        fs::write(
            docs.join("sub").join("nested.md"),
            b"# Nested\n![logo](../image.png)\n",
        )
        .unwrap();
        (tmp, docs)
    }

    #[test]
    fn collects_images_and_non_md_links() {
        let (_tmp, docs) = make_test_tree();
        let md_path = docs.join("page.md");
        let md = fs::read_to_string(&md_path).unwrap();

        let refs = collect_referenced_files(&md, &md_path, &docs);
        let sources: Vec<_> = refs.iter().map(|(src, _)| src.clone()).collect();

        let img = docs.join("image.png").canonicalize().unwrap();
        let pdf = docs.join("guide.pdf").canonicalize().unwrap();
        let other = docs.join("other.md").canonicalize().unwrap();

        assert!(sources.contains(&img), "image.png should be collected");
        assert!(sources.contains(&pdf), "guide.pdf should be collected");
        assert!(!sources.contains(&other), "other.md should be skipped");
        // Remote URL and fragment-only link must not appear.
        assert_eq!(refs.len(), 2, "expected exactly 2 assets (image + pdf)");
    }

    #[test]
    fn relative_paths_from_subdirectory_are_resolved() {
        let (_tmp, docs) = make_test_tree();
        let md_path = docs.join("sub").join("nested.md");
        let md = fs::read_to_string(&md_path).unwrap();

        let refs = collect_referenced_files(&md, &md_path, &docs);

        assert_eq!(refs.len(), 1, "expected exactly 1 asset");
        let (src, rel) = &refs[0];
        assert_eq!(
            src,
            &docs.join("image.png").canonicalize().unwrap(),
            "source should be docs/image.png"
        );
        assert_eq!(
            rel,
            &PathBuf::from("image.png"),
            "relative path should be image.png (relative to docs/)"
        );
    }

    #[test]
    fn assets_outside_input_dir_are_skipped() {
        let (_tmp, docs) = make_test_tree();
        // A sibling of docs/, reachable from page.md only via `..`.
        let outside = docs.parent().unwrap().join("outside.png");
        fs::write(&outside, b"PNG").unwrap();

        let md_path = docs.join("page.md");
        let refs = collect_referenced_files("![x](../outside.png)\n", &md_path, &docs);

        assert!(refs.is_empty(), "assets outside docs/ must not be collected");
    }

    #[test]
    fn extract_title_finds_first_heading() {
        assert_eq!(
            extract_title("# Hello World\n\nsome text"),
            Some("Hello World".to_owned())
        );
    }

    #[test]
    fn extract_title_handles_inline_code_in_heading() {
        assert_eq!(
            extract_title("# Use `foo()` wisely"),
            Some("Use foo() wisely".to_owned())
        );
    }

    #[test]
    fn extract_title_returns_none_when_no_heading() {
        assert_eq!(extract_title("just a paragraph"), None);
    }
}