Skip to main content

abbaye/builders/
markdown.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3
4use ignore::WalkBuilder;
5use miette::{IntoDiagnostic, Result, miette};
6use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd, html};
7use schemars::JsonSchema;
8use serde::{Deserialize, Serialize};
9use tera::{Context, Tera};
10
11use crate::builders::{ArtifactPath, Builder, LogEvent, LogSender};
12use crate::config::OutputFormat;
13
/// The default Tera template used to wrap rendered Markdown content in a
/// complete HTML5 document. Exposed as a `pub const` so that `abbaye
/// dump-theme` can write it to `.abbaye/theme/markdown.html.j2`.
///
/// Template variables:
/// - `{{ title }}` - plain-text page title (auto-escaped by Tera).
/// - `{{ content | safe }}` - the rendered HTML body fragment.
pub const TEMPLATE_MARKDOWN_HTML: &str = include_str!("../templates/markdown.html.j2");
/// The default Tera template for Gemtext output, embedded at compile time
/// from `../templates/markdown.gmi.j2`.
///
/// It is rendered with the same context keys as the HTML template: `title`
/// and `content` (here the Gemtext rendering of the Markdown source).
pub const TEMPLATE_MARKDOWN_GEMTEXT: &str = include_str!("../templates/markdown.gmi.j2");
23
/// Serde default for the `recursive` option: subdirectories are visited
/// unless the user explicitly opts out.
fn default_recursive() -> bool {
    true
}
27
/// Configuration for [`MarkdownBuilder`].
#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema)]
pub struct MarkdownBuilderConfig {
    /// Directory containing `.md` files to render.
    ///
    /// Every `.md` file in the directory is rendered to one file per
    /// configured output format (`.html` by default) in the output
    /// directory, preserving the subdirectory structure.  Non-Markdown files
    /// referenced (linked or embedded) inside any Markdown source - images,
    /// PDFs, downloadable assets, etc. - are copied into the output directory
    /// at the same relative path so that all relative URLs remain valid.
    ///
    /// Defaults to `"."` (the current working directory).
    pub input: Option<PathBuf>,

    /// Directory to write rendered files into.
    ///
    /// Defaults to `<input-name>-html` placed next to the input directory.
    ///
    /// ```toml
    /// [[builders]]
    /// type   = "markdown"
    /// input  = "docs/"
    /// output = "public/docs/"
    /// ```
    pub output: Option<PathBuf>,

    /// Also descend into subdirectories of `input`.
    ///
    /// Defaults to `true`. Files matched by a `.gitignore` in the directory
    /// hierarchy are always excluded, mirroring the behaviour of the `archive`
    /// builder.
    #[serde(default = "default_recursive")]
    pub recursive: bool,
    /// Output formats to generate. Defaults to `["html"]`.
    /// Set to `["html", "gemtext"]` to also produce Gemini text files.
    /// Each format is rendered through its own `markdown.<extension>`
    /// template.
    #[serde(default = "default_recursive_formats")]
    pub formats: Vec<OutputFormat>,
}
67
68fn default_recursive_formats() -> Vec<OutputFormat> {
69    vec![OutputFormat::Html]
70}
71
72impl Default for MarkdownBuilderConfig {
73    fn default() -> Self {
74        Self {
75            input: None,
76            output: None,
77            recursive: default_recursive(),
78            formats: default_recursive_formats(),
79        }
80    }
81}
82
/// Renders a directory of Markdown files (`.md`) to standalone documents,
/// one per configured output format (HTML by default).
///
/// The output directory mirrors the input directory's structure: each `.md`
/// file becomes a file with the format's extension (e.g. `.html`) at the same
/// relative path.  Any non-Markdown file referenced by a local link or image
/// embed in a source document is copied to the output directory at the same
/// relative path, keeping all URLs intact.
pub struct MarkdownBuilder;
91
92impl Builder for MarkdownBuilder {
93    type ConfigType = MarkdownBuilderConfig;
94
95    async fn build(
96        &self,
97        config: Self::ConfigType,
98        _version: &str,
99        log: LogSender,
100    ) -> Result<Vec<ArtifactPath>> {
101        let input = config
102            .input
103            .unwrap_or_else(|| PathBuf::from("."))
104            .canonicalize()
105            .into_diagnostic()?;
106
107        if !input.is_dir() {
108            return if input.exists() {
109                Err(miette!(
110                    "markdown builder input must be a directory, got a file: {}",
111                    input.display()
112                ))
113            } else {
114                Err(miette!(
115                    "markdown builder input directory does not exist: {}",
116                    input.display()
117                ))
118            };
119        }
120
121        // Load the Tera template once per builder invocation and share it
122        // across all files so template parsing only happens once.
123        let tera = load_tera(&config.formats)?;
124
125        build_directory(
126            &input,
127            config.output,
128            config.recursive,
129            &log,
130            &tera,
131            &config.formats,
132        )
133        .await
134    }
135}
136
137/// Load the Tera instance for this builder invocation.
138fn load_tera(formats: &[OutputFormat]) -> Result<Tera> {
139    let theme_path = PathBuf::from(".abbaye").join("theme");
140    let mut tera = Tera::default();
141    crate::site::register_format_templates(
142        &mut tera,
143        &theme_path,
144        formats,
145        &[(
146            "markdown",
147            TEMPLATE_MARKDOWN_HTML,
148            TEMPLATE_MARKDOWN_GEMTEXT,
149        )],
150    )?;
151    Ok(tera)
152}
153
154// ── Directory rendering ───────────────────────────────────────────────────────
155
156async fn build_directory(
157    input_dir: &Path,
158    output: Option<PathBuf>,
159    recursive: bool,
160    log: &LogSender,
161    tera: &Tera,
162    formats: &[OutputFormat],
163) -> Result<Vec<ArtifactPath>> {
164    let output_dir = output.unwrap_or_else(|| {
165        let stem = input_dir
166            .file_name()
167            .map(|n| format!("{}-html", n.to_string_lossy()))
168            .unwrap_or_else(|| "html".to_owned());
169        input_dir.parent().unwrap_or(Path::new(".")).join(stem)
170    });
171
172    tokio::fs::create_dir_all(&output_dir)
173        .await
174        .into_diagnostic()?;
175
176    // The ignore::WalkBuilder API is synchronous, so run it on the blocking
177    // thread pool to avoid stalling the async runtime.
178    let md_files = tokio::task::spawn_blocking({
179        let input_dir = input_dir.to_owned();
180        move || collect_md_files(&input_dir, recursive)
181    })
182    .await
183    .into_diagnostic()??;
184
185    if md_files.is_empty() {
186        let _ = log.send(LogEvent::Line(format!(
187            "warning: no .md files found in {}",
188            input_dir.display()
189        )));
190        return Ok(vec![ArtifactPath {
191            path: output_dir.clone(),
192            name: dir_name_string(&output_dir),
193            hash: None,
194            category: None,
195            group_name: None,
196            group_comment: None,
197        }]);
198    }
199
200    // Map from absolute source path → absolute destination path, built up
201    // while rendering so that files referenced by multiple documents are only
202    // copied once.
203    let mut files_to_copy: HashMap<PathBuf, PathBuf> = HashMap::new();
204
205    for md_path in &md_files {
206        let relative = md_path.strip_prefix(input_dir).into_diagnostic()?;
207        let out_path = output_dir.join(relative).with_extension("html");
208
209        // Ensure any intermediate subdirectories exist.
210        if let Some(parent) = out_path.parent() {
211            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
212        }
213
214        let _ = log.send(LogEvent::Line(format!(
215            "{} → {}",
216            md_path.display(),
217            out_path.display()
218        )));
219
220        let md = tokio::fs::read_to_string(md_path).await.into_diagnostic()?;
221
222        // Collect local asset references before rendering so we can copy them.
223        for (src, rel) in collect_referenced_files(&md, md_path, input_dir) {
224            files_to_copy
225                .entry(src)
226                .or_insert_with(|| output_dir.join(rel));
227        }
228
229        let title = extract_title(&md).unwrap_or_else(|| file_stem_string(md_path));
230
231        for format in formats {
232            let suffix = format.extension();
233            let ext = format.extension();
234            let tmpl_name = format!("markdown.{suffix}");
235            let content = match format {
236                OutputFormat::Html => render_markdown(&md),
237                OutputFormat::Gemtext => crate::render::render_markdown_gemtext(&md),
238            };
239            let document = render_template(tera, &tmpl_name, &title, &content)?;
240
241            let fmt_out_path = out_path.with_extension(ext);
242            tokio::fs::write(&fmt_out_path, document.as_bytes())
243                .await
244                .into_diagnostic()?;
245        }
246    }
247
248    // Copy every referenced asset, creating parent directories as needed.
249    for (src, dest) in &files_to_copy {
250        if let Some(parent) = dest.parent() {
251            tokio::fs::create_dir_all(parent).await.into_diagnostic()?;
252        }
253
254        let _ = log.send(LogEvent::Line(format!(
255            "copying {} → {}",
256            src.display(),
257            dest.display()
258        )));
259
260        tokio::fs::copy(src, dest).await.into_diagnostic()?;
261    }
262
263    Ok(vec![ArtifactPath {
264        path: output_dir.clone(),
265        name: dir_name_string(&output_dir),
266        hash: None,
267        category: None,
268        group_name: None,
269        group_comment: None,
270    }])
271}
272
273// ── Helpers ───────────────────────────────────────────────────────────────────
274
275/// Walk `dir` for `.md` files, honouring `.gitignore` rules (via the `ignore`
276/// crate). Returns paths in a stable, sorted order so output is reproducible.
277///
278/// When `recursive` is `false` only the top level of `dir` is visited
279/// (`max_depth = 1`).
280fn collect_md_files(dir: &Path, recursive: bool) -> Result<Vec<PathBuf>> {
281    let mut files = Vec::new();
282
283    let walker = WalkBuilder::new(dir)
284        .max_depth(if recursive { None } else { Some(1) })
285        // Include dotfiles (e.g. .github/CONTRIBUTING.md).
286        .hidden(false)
287        .build();
288
289    for result in walker {
290        let entry = result.into_diagnostic()?;
291        let path = entry.into_path();
292        if path.is_file() && path.extension().and_then(|e| e.to_str()) == Some("md") {
293            files.push(path);
294        }
295    }
296
297    files.sort();
298    Ok(files)
299}
300
301/// Parse `md` for local link and image targets that are not other Markdown
302/// files, resolve them relative to `md_path`'s parent directory, and return
303/// those that exist as files within `input_dir`.
304///
305/// Returns a list of `(absolute_source_path, relative_path_from_input_dir)`
306/// pairs.  The caller uses the relative path to mirror the asset at the same
307/// position inside the output directory, keeping all relative URLs in the
308/// rendered HTML valid.
309///
310/// Skipped silently:
311/// - Remote URLs (`://`), data URIs, and fragment-only refs (`#…`).
312/// - `.md` files - those are rendered to `.html`, not copied.
313/// - Refs that do not resolve to an existing file.
314/// - Refs that resolve to a file outside `input_dir` (a warning is logged
315///   to the caller instead).
316fn collect_referenced_files(md: &str, md_path: &Path, input_dir: &Path) -> Vec<(PathBuf, PathBuf)> {
317    let md_dir = md_path.parent().unwrap_or(Path::new("."));
318    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH;
319    let mut result = Vec::new();
320
321    for event in Parser::new_ext(md, opts) {
322        let url: Option<pulldown_cmark::CowStr> = match event {
323            Event::Start(Tag::Image { dest_url, .. }) => Some(dest_url),
324            Event::Start(Tag::Link { dest_url, .. }) => Some(dest_url),
325            _ => None,
326        };
327
328        let Some(url) = url else { continue };
329        let s = url.as_ref();
330
331        // Skip remote URLs, data URIs, and fragment-only refs.
332        if s.contains("://") || s.starts_with('#') || s.is_empty() {
333            continue;
334        }
335
336        // Strip any trailing fragment before treating the string as a path.
337        let path_part = s.split('#').next().unwrap_or(s);
338        if path_part.is_empty() {
339            continue;
340        }
341
342        // Skip links to other Markdown files - those will be rendered to
343        // .html and don't need to be copied as assets.
344        if Path::new(path_part).extension().and_then(|e| e.to_str()) == Some("md") {
345            continue;
346        }
347
348        // Resolve the ref relative to the containing markdown file's directory.
349        // A leading `/` is treated as relative to input_dir, not the filesystem
350        // root, which is the most useful interpretation for a docs directory.
351        let abs = if path_part.starts_with('/') {
352            input_dir.join(path_part.trim_start_matches('/'))
353        } else {
354            md_dir.join(path_part)
355        };
356
357        // canonicalize() fails if the path does not exist.
358        let Ok(abs) = abs.canonicalize() else {
359            continue;
360        };
361
362        if !abs.is_file() {
363            continue;
364        }
365
366        // Only copy assets that live inside the input directory; assets
367        // outside it are silently skipped (they cannot be given a stable
368        // relative output path).
369        let Ok(relative) = abs.strip_prefix(input_dir) else {
370            continue;
371        };
372        let relative = relative.to_owned();
373
374        result.push((abs, relative));
375    }
376
377    result
378}
379
380/// Convert a Markdown string to an HTML fragment.
381///
382/// Enables tables, strikethrough, and footnotes - a superset of what
383/// [`crate::site`]'s own renderer uses.
384fn render_markdown(md: &str) -> String {
385    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH | Options::ENABLE_FOOTNOTES;
386    let parser = Parser::new_ext(md, opts);
387    let mut buf = String::new();
388    html::push_html(&mut buf, parser);
389    buf
390}
391
392/// Scan a Markdown string for the first heading of any level and return its
393/// plain-text content. Used to populate the `<title>` element.
394///
395/// Handles headings that contain inline emphasis, code spans, or other inline
396/// elements by concatenating all `Text` and `Code` events seen between the
397/// opening and closing heading tags.
398fn extract_title(md: &str) -> Option<String> {
399    let opts = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH;
400    let mut parser = Parser::new_ext(md, opts);
401    let mut in_heading = false;
402    let mut title = String::new();
403
404    loop {
405        match parser.next()? {
406            Event::Start(Tag::Heading { .. }) => {
407                in_heading = true;
408            }
409            Event::End(TagEnd::Heading(_)) => break,
410            Event::Text(text) | Event::Code(text) if in_heading => {
411                title.push_str(&text);
412            }
413            _ => {}
414        }
415    }
416
417    if title.is_empty() { None } else { Some(title) }
418}
419
420/// Render the Tera template with the given `title` and format-specific `content`.
421fn render_template(tera: &Tera, template_name: &str, title: &str, content: &str) -> Result<String> {
422    let mut ctx = Context::new();
423    ctx.insert("title", title);
424    ctx.insert("content", content);
425    tera.render(template_name, &ctx).into_diagnostic()
426}
427
/// Return the file stem of `path` (file name without its last extension) as
/// an owned `String`, converting lossily if it is not valid UTF-8.  Paths
/// without a file stem yield `"Document"`.
fn file_stem_string(path: &Path) -> String {
    match path.file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => "Document".to_owned(),
    }
}
434
/// Return the final component of `path` as an owned `String`, converting
/// lossily if it is not valid UTF-8.  Paths without a final component (such
/// as `/` or ones ending in `..`) yield `"html"`.
fn dir_name_string(path: &Path) -> String {
    path.file_name().map_or_else(
        || String::from("html"),
        |name| name.to_string_lossy().into_owned(),
    )
}
441
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Build a temporary directory tree and return it together with the
    /// canonical path of its `docs/` directory:
    ///
    /// ```text
    /// <tmpdir>/
    ///   docs/
    ///     page.md         ← references image.png, guide.pdf, other.md, https://…
    ///     image.png       ← local asset (should be collected)
    ///     guide.pdf       ← local asset (should be collected)
    ///     other.md        ← .md file   (should be skipped)
    ///     sub/
    ///       nested.md     ← references ../image.png (should be collected)
    /// ```
    ///
    /// The returned `TempDir` must be kept alive for as long as the tree is
    /// used; dropping it removes the directory.
    fn make_test_tree() -> (tempfile::TempDir, PathBuf) {
        let tmp = tempfile::tempdir().expect("tempdir");
        let docs = tmp.path().join("docs");
        fs::create_dir_all(docs.join("sub")).unwrap();
        // The builder always hands a canonical input directory to the
        // helpers. Do the same here so the tests also pass where the temp
        // directory sits behind a symlink (e.g. /var → /private/var on macOS).
        let docs = docs.canonicalize().expect("canonicalize docs");
        fs::write(docs.join("image.png"), b"PNG").unwrap();
        fs::write(docs.join("guide.pdf"), b"PDF").unwrap();
        fs::write(docs.join("other.md"), b"# Other").unwrap();
        fs::write(
            docs.join("page.md"),
            b"# Hello\n\
              ![logo](image.png)\n\
              [guide](guide.pdf)\n\
              [other](other.md)\n\
              [remote](https://example.com)\n\
              [frag](#section)\n",
        )
        .unwrap();
        fs::write(
            docs.join("sub").join("nested.md"),
            b"# Nested\n![logo](../image.png)\n",
        )
        .unwrap();
        (tmp, docs)
    }

    #[test]
    fn collects_images_and_non_md_links() {
        let (_tmp, docs) = make_test_tree();
        let md_path = docs.join("page.md");
        let md = fs::read_to_string(&md_path).unwrap();

        let refs = collect_referenced_files(&md, &md_path, &docs);
        let sources: Vec<_> = refs.iter().map(|(src, _)| src.clone()).collect();

        let img = docs.join("image.png").canonicalize().unwrap();
        let pdf = docs.join("guide.pdf").canonicalize().unwrap();
        let other = docs.join("other.md").canonicalize().unwrap();

        assert!(sources.contains(&img), "image.png should be collected");
        assert!(sources.contains(&pdf), "guide.pdf should be collected");
        assert!(!sources.contains(&other), "other.md should be skipped");
        // Remote URL and fragment-only link must not appear.
        assert_eq!(refs.len(), 2, "expected exactly 2 assets (image + pdf)");
    }

    #[test]
    fn relative_paths_from_subdirectory_are_resolved() {
        let (_tmp, docs) = make_test_tree();
        let md_path = docs.join("sub").join("nested.md");
        let md = fs::read_to_string(&md_path).unwrap();

        let refs = collect_referenced_files(&md, &md_path, &docs);

        assert_eq!(refs.len(), 1, "expected exactly 1 asset");
        let (src, rel) = &refs[0];
        assert_eq!(
            src,
            &docs.join("image.png").canonicalize().unwrap(),
            "source should be docs/image.png"
        );
        assert_eq!(
            rel,
            &PathBuf::from("image.png"),
            "relative path should be image.png (relative to docs/)"
        );
    }

    #[test]
    fn extract_title_finds_first_heading() {
        assert_eq!(
            extract_title("# Hello World\n\nsome text"),
            Some("Hello World".to_owned())
        );
    }

    #[test]
    fn extract_title_handles_inline_code_in_heading() {
        assert_eq!(
            extract_title("# Use `foo()` wisely"),
            Some("Use foo() wisely".to_owned())
        );
    }

    #[test]
    fn extract_title_returns_none_when_no_heading() {
        assert_eq!(extract_title("just a paragraph"), None);
    }
}