search_hub

at c9cf0e4 Raw

pub mod crates_io;
pub mod searxng;

use async_trait::async_trait;
use scraper::{Html, Selector};
use serde::Serialize;
use std::collections::HashSet;
use std::fmt;

/// A single search result returned by an external search engine.
///
/// All fields are public and the type derives `Serialize`, so a value can be
/// built with a struct literal and handed to any `serde` serializer.
///
/// # Example
///
/// ```rust
/// use search_hub::search_engines::ResultEntry;
///
/// let r = ResultEntry {
///     title: "Rust Lang".into(),
///     url: "https://rust-lang.org".into(),
///     description: Some("The Rust programming language".into()),
///     engine: "duckduckgo".into(),
/// };
/// assert_eq!(r.engine, "duckduckgo");
/// ```
#[derive(Debug, Clone, Serialize)]
pub struct ResultEntry {
    /// Result page title.
    pub title: String,
    /// Result page URL.
    pub url: String,
    /// Optional text snippet or description.
    ///
    /// The default `SearchEngine::parse_results` implementation always
    /// leaves this as `None`.
    pub description: Option<String>,
    /// Name of the search engine that returned this result.
    ///
    /// The default `SearchEngine::parse_results` implementation fills this
    /// from `SearchEngine::name`.
    pub engine: String,
}

/// Failure raised while fetching or parsing an engine's results.
///
/// Wraps a single human-readable message, which is exactly what the
/// `Display` implementation prints.
#[derive(Debug)]
pub struct EngineError(pub String);

impl fmt::Display for EngineError {
    /// Writes the wrapped message verbatim; width and precision flags on the
    /// outer formatter are not applied to it.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(message) = self;
        f.write_str(message)
    }
}

// No underlying cause is stored, so the default `Error` methods are enough.
impl std::error::Error for EngineError {}

/// Trait for external search engine integrations.
///
/// Implementors define the engine's metadata (`id`, `name`, `url_template`,
/// `selector`) and optionally override `fetch_results` or `parse_results`
/// for custom behavior.  The default `fetch_results` fetches the search URL
/// via `reqwest`, rejects non-success HTTP statuses, and delegates to
/// `parse_results`.  The default `parse_results` uses the CSS `selector` to
/// find the result container and extracts `<a>` links from it (absolute
/// `http://` / `https://` URLs only, deduplicated by URL, max 10 results,
/// title must be at least 4 characters).
///
/// # Example
///
/// ```rust
/// use search_hub::search_engines::{SearchEngine, ResultEntry, EngineError};
/// use async_trait::async_trait;
///
/// struct ExampleEngine;
///
/// #[async_trait]
/// impl SearchEngine for ExampleEngine {
///     fn id(&self) -> &str { "example" }
///     fn name(&self) -> &str { "Example" }
///     fn url_template(&self) -> &str { "https://example.com/search?q={}" }
///     fn selector(&self) -> &str { "div.results" }
/// }
///
/// let e = ExampleEngine;
/// assert_eq!(e.id(), "example");
/// assert_eq!(e.search_url("test"), "https://example.com/search?q=test");
/// ```
#[async_trait]
pub trait SearchEngine: Send + Sync {
    /// Unique identifier for this engine (e.g. "duckduckgo").
    fn id(&self) -> &str;
    /// Human-readable display name (e.g. "DuckDuckGo").
    fn name(&self) -> &str;
    /// URL template with `{}` placeholder for the query string.
    fn url_template(&self) -> &str;
    /// CSS selector targeting the result container in the engine's HTML page.
    fn selector(&self) -> &str;

    /// Build a search URL from the given query by replacing every `{}` in
    /// the template with the URL-encoded query string.
    fn search_url(&self, query: &str) -> String {
        self.url_template().replace("{}", &urlencode(query))
    }

    /// Fetch search results from the engine for the given query.
    ///
    /// Default implementation: builds the search URL via `self.search_url()`,
    /// fetches the page via the provided `reqwest::Client`, then delegates to
    /// `self.parse_results()`.
    ///
    /// # Errors
    ///
    /// Returns an [`EngineError`] when the request cannot be sent, when the
    /// server answers with a 4xx/5xx status, when the body cannot be read,
    /// or when `parse_results` fails.
    async fn fetch_results(
        &self,
        query: &str,
        client: &reqwest::Client,
    ) -> Result<Vec<ResultEntry>, EngineError> {
        let url = self.search_url(query);
        let html = client
            .get(&url)
            .send()
            .await
            .map_err(|e| EngineError(format!("fetch failed: {e}")))?
            // A block/rate-limit/error page is not a result page: surface the
            // HTTP status instead of trying to scrape links out of it.
            .error_for_status()
            .map_err(|e| EngineError(format!("fetch failed: {e}")))?
            .text()
            .await
            .map_err(|e| EngineError(format!("read body failed: {e}")))?;
        self.parse_results(&html)
    }

    /// Parse search results from raw HTML.
    ///
    /// Default implementation: uses `self.selector()` to find the first
    /// matching result container with `scraper`, extracts its `<a href>`
    /// links, keeps only absolute `http://` / `https://` URLs whose trimmed
    /// link text is at least 4 characters long, deduplicates by URL, and
    /// returns at most 10 results in document order.
    ///
    /// # Errors
    ///
    /// Returns an [`EngineError`] when the selector is invalid, when no
    /// element matches it, or when no link survives the filters.
    fn parse_results(&self, html: &str) -> Result<Vec<ResultEntry>, EngineError> {
        const MAX_RESULTS: usize = 10;
        const MIN_TITLE_CHARS: usize = 4;

        let doc = Html::parse_document(html);
        let sel = Selector::parse(self.selector())
            .map_err(|e| EngineError(format!("bad selector: {e}")))?;
        let link_sel = Selector::parse("a[href]")
            .map_err(|e| EngineError(format!("bad link selector: {e}")))?;

        let container = doc
            .select(&sel)
            .next()
            .ok_or_else(|| EngineError("no container matched".into()))?;

        let mut results = Vec::new();
        let mut seen = HashSet::new();

        for link in container.select(&link_sel) {
            let href = match link.value().attr("href") {
                Some(h) => h,
                None => continue,
            };
            let title: String = link.text().collect::<String>().trim().to_string();

            // Count characters, not UTF-8 bytes, so the documented minimum
            // applies equally to non-ASCII titles.
            if title.chars().count() < MIN_TITLE_CHARS {
                continue;
            }
            // Require a real HTTP(S) scheme; a bare "http" prefix would also
            // admit other schemes and relative paths that start with "http".
            if !(href.starts_with("http://") || href.starts_with("https://")) {
                continue;
            }
            // Only URLs that passed the filters are remembered, so a later
            // link to the same URL with a usable title is still accepted.
            if !seen.insert(href.to_string()) {
                continue;
            }

            results.push(ResultEntry {
                title,
                url: href.to_string(),
                description: None,
                engine: self.name().to_string(),
            });

            if results.len() >= MAX_RESULTS {
                break;
            }
        }

        if results.is_empty() {
            Err(EngineError("no results found".into()))
        } else {
            Ok(results)
        }
    }
}

/// Build the list of search engines that are used by default.
///
/// These can be enabled or disabled via the `enabled_engines` config field.
/// DuckDuckGo, lib.rs, and StackOverflow were removed from defaults because
/// they now block automated requests. crates.io works via its public JSON API.
///
/// # Example
///
/// ```rust
/// let engines = search_hub::search_engines::default_search_engines();
/// assert_eq!(engines.len(), 1);
/// assert_eq!(engines[0].id(), "crates.io");
/// ```
pub fn default_search_engines() -> Vec<Box<dyn SearchEngine>> {
    // crates.io is currently the only default engine.
    let crates_io_engine: Box<dyn SearchEngine> = Box::new(crates_io::CratesIo);
    vec![crates_io_engine]
}

/// Percent-encode `s` for use inside a URL query string.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` pass through unchanged,
/// each space becomes `+`, and every other byte of the UTF-8 input is
/// written as `%XX` using uppercase hexadecimal digits.
fn urlencode(s: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::with_capacity(s.len());
    for &b in s.as_bytes() {
        if b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.' | b'~') {
            encoded.push(char::from(b));
        } else if b == b' ' {
            encoded.push('+');
        } else {
            // Two uppercase hex digits: high nibble first, then low nibble.
            encoded.push('%');
            encoded.push(char::from(HEX_DIGITS[usize::from(b >> 4)]));
            encoded.push(char::from(HEX_DIGITS[usize::from(b & 0x0F)]));
        }
    }
    encoded
}