search_hub

at 18c4440 Raw

use std::sync::OnceLock;

// Shared Tokio runtime cell; starts empty and is filled on first use by `rt()`.
static RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new();

/// Returns the shared Tokio runtime stored in `RT`, creating it on the first call.
///
/// # Panics
///
/// Panics if the runtime cannot be constructed.
fn rt() -> &'static tokio::runtime::Runtime {
    // Only invoked by `get_or_init` when the cell is still empty.
    let start_runtime = || tokio::runtime::Runtime::new().unwrap();
    RT.get_or_init(start_runtime)
}

/// Performs a GET request for `url` and returns the response body as text.
///
/// The async `reqwest` client is driven to completion on the shared runtime
/// returned by `rt()`, so synchronous `#[test]` functions can call this directly.
///
/// # Panics
///
/// Panics (failing the calling test) if the client cannot be built, the request
/// fails or exceeds the 30-second timeout, the server answers with a 4xx/5xx
/// status, or the body cannot be read as text.
fn fetch(url: &str) -> String {
    rt().block_on(async {
        let client = reqwest::Client::builder()
            .user_agent("search_hub_test")
            // Bound the whole request so a stalled connection cannot hang the test run.
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .expect("failed to build HTTP client");
        let resp = client
            .get(url)
            .send()
            .await
            .unwrap_or_else(|e| panic!("GET {url} failed: {e}"))
            // Reject error pages up front instead of handing their HTML to the
            // caller, where they would surface as confusing assertion failures.
            .error_for_status()
            .unwrap_or_else(|e| panic!("GET {url} returned an error status: {e}"));
        resp.text()
            .await
            .unwrap_or_else(|e| panic!("failed to read response body from {url}: {e}"))
    })
}

/// Prints a size comparison between the raw HTML and its Markdown conversion,
/// followed by the Markdown itself, to stdout.
///
/// * `name` - label shown in the `=== ... ===` banner.
/// * `html` - original HTML; only its byte length is reported.
/// * `md`   - converted Markdown; its byte length is reported and it is echoed in full.
fn print_md(name: &str, html: &str, md: &str) {
    let html_bytes = html.len();
    let md_bytes = md.len();
    // Clamp the divisor to 1 so an empty Markdown string does not divide by zero.
    let ratio = html_bytes as f64 / md_bytes.max(1) as f64;

    println!(
        "\n=== {name} ===\n\
         Raw HTML : {html_bytes} bytes\n\
         Markdown : {md_bytes} bytes\n\
         Ratio    : {ratio:.1}x smaller\n\
         \n\
         --- Markdown output ---\n\
         {md}\n\
         --- end ---"
    );
}

/// Fetches example.com, converts it with `htmd`, and checks that HTML markup is
/// gone from the Markdown while the visible heading text is kept.
///
/// Requires network access (goes through `fetch`).
#[test]
fn strips_html_tags_and_preserves_text_example() {
    let html = fetch("https://example.com");
    assert!(html.contains("<h1>"), "expected HTML to contain tags before conversion");

    let md = htmd::convert(&html).expect("conversion should succeed");
    print_md("example.com", &html, &md);

    assert!(!md.contains("<h1>"), "no HTML heading tags");
    assert!(!md.contains("<a "), "no HTML anchor tags");
    assert!(!md.contains("<div"), "no HTML div tags");
    assert!(!md.contains("</"), "no closing HTML tags");

    assert!(md.contains("Example Domain"), "visible heading text preserved");

    // Match "# " rather than a bare '#': deeper headings ("## ...", "### ...")
    // also start with '#', and counting them would contradict the
    // "exactly one H1" claim this assertion makes.
    let h1_count = md.lines().filter(|line| line.starts_with("# ")).count();
    assert_eq!(h1_count, 1, "exactly one H1 in Markdown");
}

/// Fetches www.rust-lang.org, converts it with `htmd`, and checks that script/style
/// tags and attribute syntax are absent while the "Rust" heading is kept and the
/// Markdown is smaller than the HTML.
///
/// Requires network access (goes through `fetch`).
#[test]
fn strips_html_tags_and_preserves_text_rustlang() {
    let page = fetch("https://www.rust-lang.org");
    let looks_like_html = page.contains("<html") || page.contains("<!DOCTYPE");
    assert!(looks_like_html, "expected valid HTML");

    let markdown = htmd::convert(&page).expect("conversion should succeed");
    print_md("rust-lang.org", &page, &markdown);

    // Markup fragments that must not appear in the converted output, each paired
    // with the failure message reported when it does.
    let forbidden = [
        ("<script", "no script tags in output"),
        ("<style", "no style tags in output"),
        ("class=\"", "no HTML attribute syntax in output"),
        ("id=\"", "no HTML id attributes in output"),
    ];
    for (needle, why) in forbidden {
        assert!(!markdown.contains(needle), "{}", why);
    }

    assert!(markdown.contains("Rust"), "page title preserved in Markdown");

    let has_h1 = markdown.lines().any(|line| line.starts_with("# Rust"));
    assert!(has_h1, "heading preserved as Markdown H1");

    let (md_len, html_len) = (markdown.len(), page.len());
    assert!(md_len < html_len, "Markdown smaller than raw HTML ({} vs {})", md_len, html_len);
}

/// Sanity check that the Markdown produced from example.com has a minimum amount
/// of content: at least 3 non-blank lines and at least 20 whitespace-separated words.
///
/// Requires network access (goes through `fetch`).
#[test]
fn markdown_output_is_readable() {
    let page = fetch("https://example.com");
    let markdown = htmd::convert(&page).expect("conversion should succeed");

    let content_lines = markdown
        .lines()
        .filter(|line| !line.trim().is_empty())
        .count();
    assert!(content_lines >= 3, "at least 3 non-empty lines of content");

    let word_count = markdown.split_whitespace().count();
    assert!(word_count >= 20, "at least 20 readable words in output");
}