pub mod crates_io;
pub mod searxng;

use async_trait::async_trait;
use scraper::{Html, Selector};
use serde::Serialize;
use std::collections::HashSet;
use std::fmt;

/// One hit produced by an external search engine.
///
/// # Example
///
/// ```rust
/// use search_hub::search_engines::ResultEntry;
///
/// let r = ResultEntry {
///     title: "Rust Lang".into(),
///     url: "https://rust-lang.org".into(),
///     description: Some("The Rust programming language".into()),
///     engine: "duckduckgo".into(),
/// };
/// assert_eq!(r.engine, "duckduckgo");
/// ```
#[derive(Debug, Clone, Serialize)]
pub struct ResultEntry {
    /// Title of the result page.
    pub title: String,
    /// URL of the result page.
    pub url: String,
    /// Text snippet or description, when one is available.
    pub description: Option<String>,
    /// Name of the search engine that produced this result.
    pub engine: String,
}

/// Error raised by engine fetch and parse operations; wraps a plain message.
#[derive(Debug)]
pub struct EngineError(pub String);

impl fmt::Display for EngineError {
    /// Writes the wrapped message verbatim.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for EngineError {}

/// Trait for external search engine integrations.
///
/// Implementors define the engine's metadata (`id`, `name`, `url_template`,
/// `selector`) and optionally override `fetch_results` or `parse_results`
/// for custom behavior. The default `fetch_results` fetches the search URL
/// via `reqwest` and delegates to `parse_results`. The default
/// `parse_results` uses the CSS `selector` to find the result container
/// and extracts `<a>` links from it (deduplicated by URL, max 10 results,
/// title must be at least 4 characters).
///
/// # Example
///
/// ```rust
/// use search_hub::search_engines::{SearchEngine, ResultEntry, EngineError};
/// use async_trait::async_trait;
///
/// struct ExampleEngine;
///
/// #[async_trait]
/// impl SearchEngine for ExampleEngine {
///     fn id(&self) -> &str { "example" }
///     fn name(&self) -> &str { "Example" }
///     fn url_template(&self) -> &str { "https://example.com/search?q={}" }
///     fn selector(&self) -> &str { "div.results" }
/// }
///
/// let e = ExampleEngine;
/// assert_eq!(e.id(), "example");
/// assert_eq!(e.search_url("test"), "https://example.com/search?q=test");
/// ```
#[async_trait]
pub trait SearchEngine: Send + Sync {
    /// Unique identifier for this engine (e.g. "duckduckgo").
    fn id(&self) -> &str;

    /// Human-readable display name (e.g. "DuckDuckGo").
    fn name(&self) -> &str;

    /// URL template with `{}` placeholder for the query string.
    fn url_template(&self) -> &str;

    /// CSS selector targeting the result container in the engine's HTML page.
    fn selector(&self) -> &str;

    /// Build a search URL from the given query by replacing every `{}` in
    /// `url_template()` with the URL-encoded query string.
    fn search_url(&self, query: &str) -> String {
        self.url_template().replace("{}", &urlencode(query))
    }

    /// Fetch search results from the engine for the given query.
    ///
    /// Default implementation: builds the search URL via `self.search_url()`,
    /// fetches the page via the provided `reqwest::Client`, then delegates to
    /// `self.parse_results()`.
    ///
    /// # Errors
    ///
    /// Returns an [`EngineError`] when the request cannot be sent, when the
    /// server answers with a 4xx/5xx status, when the body cannot be read,
    /// or when `parse_results` fails.
    async fn fetch_results(
        &self,
        query: &str,
        client: &reqwest::Client,
    ) -> Result<Vec<ResultEntry>, EngineError> {
        let url = self.search_url(query);
        let html = client
            .get(&url)
            .send()
            .await
            .map_err(|e| EngineError(format!("fetch failed: {e}")))?
            // Report 4xx/5xx responses directly rather than parsing an error
            // page and failing later with a misleading "no container matched".
            .error_for_status()
            .map_err(|e| EngineError(format!("fetch failed: {e}")))?
            .text()
            .await
            .map_err(|e| EngineError(format!("read body failed: {e}")))?;
        self.parse_results(&html)
    }

    /// Parse search results from raw HTML.
    ///
    /// Default implementation: uses `self.selector()` to find the first
    /// matching result container with `scraper`, extracts `<a href>` links
    /// from it, keeps only `http://` / `https://` links whose trimmed title
    /// is at least 4 characters long, deduplicates by URL, and returns at
    /// most 10 results. Every entry has `description: None` and
    /// `engine` set to `self.name()`.
    ///
    /// # Errors
    ///
    /// Returns an [`EngineError`] when `self.selector()` is not a valid CSS
    /// selector, when no element matches it, or when no link survives the
    /// filters.
    fn parse_results(&self, html: &str) -> Result<Vec<ResultEntry>, EngineError> {
        /// Upper bound on the number of entries returned.
        const MAX_RESULTS: usize = 10;
        /// Minimum title length, counted in characters (not bytes).
        const MIN_TITLE_CHARS: usize = 4;

        let doc = Html::parse_document(html);
        let sel = Selector::parse(self.selector())
            .map_err(|e| EngineError(format!("bad selector: {e}")))?;
        let link_sel = Selector::parse("a[href]")
            .map_err(|e| EngineError(format!("bad link selector: {e}")))?;

        let container = doc
            .select(&sel)
            .next()
            .ok_or_else(|| EngineError("no container matched".into()))?;

        let mut results = Vec::with_capacity(MAX_RESULTS);
        // Borrow hrefs straight from the parsed document; an owned `String`
        // is only allocated for links that are actually kept.
        let mut seen: HashSet<&str> = HashSet::new();

        for link in container.select(&link_sel) {
            let href = match link.value().attr("href") {
                Some(h) => h,
                None => continue,
            };
            // Require a real HTTP(S) scheme; a bare "http" prefix would also
            // admit unrelated schemes such as "httpx:".
            if !(href.starts_with("http://") || href.starts_with("https://")) {
                continue;
            }

            let text: String = link.text().collect();
            let title = text.trim();
            // Count characters rather than bytes so multi-byte titles are
            // held to the same documented minimum as ASCII ones.
            if title.chars().count() < MIN_TITLE_CHARS {
                continue;
            }

            if !seen.insert(href) {
                continue;
            }

            results.push(ResultEntry {
                title: title.to_string(),
                url: href.to_string(),
                description: None,
                engine: self.name().to_string(),
            });
            if results.len() == MAX_RESULTS {
                break;
            }
        }

        if results.is_empty() {
            Err(EngineError("no results found".into()))
        } else {
            Ok(results)
        }
    }
}

/// Return the default set of search engines.
///
/// These can be enabled or disabled via the `enabled_engines` config field.
/// DuckDuckGo, lib.rs, and StackOverflow were removed from defaults because
/// they now block automated requests. crates.io works via its public JSON API.
/// /// # Example /// /// ```rust /// let engines = search_hub::search_engines::default_search_engines(); /// assert_eq!(engines.len(), 1); /// assert_eq!(engines[0].id(), "crates.io"); /// ``` pub fn default_search_engines() -> Vec<Box<dyn SearchEngine>> { vec![ Box::new(crates_io::CratesIo), ] } fn urlencode(s: &str) -> String { let mut out = String::with_capacity(s.len()); for byte in s.bytes() { match byte { b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => { out.push(byte as char); } b' ' => out.push_str("+"), _ => { out.push_str(&format!("%{:02X}", byte)); } } } out }