pub mod crates_io; pub mod searxng; use async_trait::async_trait; use scraper::{Html, Selector}; use serde::Serialize; use std::collections::HashSet; use std::fmt; /// A single search result returned by an external search engine. /// /// # Example /// /// ```rust /// use search_hub::search_engines::ResultEntry; /// /// let r = ResultEntry { /// title: "Rust Lang".into(), /// url: "https://rust-lang.org".into(), /// description: Some("The Rust programming language".into()), /// engine: "duckduckgo".into(), /// }; /// assert_eq!(r.engine, "duckduckgo"); /// ``` #[derive(Debug, Clone, Serialize)] pub struct ResultEntry { /// Result page title. pub title: String, /// Result page URL. pub url: String, /// Optional text snippet or description. pub description: Option<String>, /// Name of the search engine that returned this result. pub engine: String, } /// Error type for engine fetch and parse operations. #[derive(Debug)] pub struct EngineError(pub String); impl fmt::Display for EngineError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } } impl std::error::Error for EngineError {} /// Trait for external search engine integrations. /// /// Implementors define the engine's metadata (`id`, `name`, `url_template`, /// `selector`) and optionally override `fetch_results` or `parse_results` /// for custom behavior. The default `fetch_results` fetches the search URL /// via `reqwest` and delegates to `parse_results`. The default /// `parse_results` uses the CSS `selector` to find the result container /// and extracts `<a>` links from it (deduplicated by URL, max 10 results, /// title must be at least 4 characters). /// /// # Example /// /// ```rust /// use search_hub::search_engines::{SearchEngine, ResultEntry, EngineError}; /// use async_trait::async_trait; /// /// struct ExampleEngine; /// /// #[async_trait] /// impl SearchEngine for ExampleEngine { /// fn id(&self) -> &str { "example" } /// fn name(&self) -> &str { "Example" } /// fn url_template(&self) -> &str { "https://example.com/search?q={}" } /// fn selector(&self) -> &str { "div.results" } /// } /// /// let e = ExampleEngine; /// assert_eq!(e.id(), "example"); /// assert_eq!(e.search_url("test"), "https://example.com/search?q=test"); /// ``` #[async_trait] pub trait SearchEngine: Send + Sync { /// Unique identifier for this engine (e.g. "duckduckgo"). fn id(&self) -> &str; /// Human-readable display name (e.g. "DuckDuckGo"). fn name(&self) -> &str; /// URL template with `{}` placeholder for the query string. fn url_template(&self) -> &str; /// CSS selector targeting the result container in the engine's HTML page. fn selector(&self) -> &str; /// Build a search URL from the given query by replacing `{}` with the /// URL-encoded query string. fn search_url(&self, query: &str) -> String { self.url_template().replace("{}", &urlencode(query)) } /// Fetch search results from the engine for the given query. /// /// Default implementation: builds the search URL via `self.search_url()`, /// fetches the page via the provided `reqwest::Client`, then delegates to /// `self.parse_results()`. async fn fetch_results( &self, query: &str, client: &reqwest::Client, ) -> Result<Vec<ResultEntry>, EngineError> { let url = self.search_url(query); let html = client .get(&url) .send() .await .map_err(|e| EngineError(format!("fetch failed: {e}")))? .text() .await .map_err(|e| EngineError(format!("read body failed: {e}")))?; self.parse_results(&html) } /// Parse search results from raw HTML. /// /// Default implementation: uses `self.selector()` to find the result /// container with `scraper`, extracts `<a>` links from it, deduplicates /// by URL, filters to HTTP links with title >= 4 characters, and returns /// at most 10 results. fn parse_results(&self, html: &str) -> Result<Vec<ResultEntry>, EngineError> { let doc = Html::parse_document(html); let sel = Selector::parse(self.selector()) .map_err(|e| EngineError(format!("bad selector: {e}")))?; let link_sel = Selector::parse("a[href]") .map_err(|e| EngineError(format!("bad link selector: {e}")))?; let container = doc .select(&sel) .next() .ok_or_else(|| EngineError("no container matched".into()))?; let mut results = Vec::new(); let mut seen = HashSet::new(); for link in container.select(&link_sel) { let href = match link.value().attr("href") { Some(h) => h.to_string(), None => continue, }; let title: String = link.text().collect::<String>().trim().to_string(); if title.len() < 4 || href.is_empty() { continue; } if !href.starts_with("http") { continue; } if !seen.insert(href.clone()) { continue; } if results.len() >= 10 { break; } results.push(ResultEntry { title, url: href, description: None, engine: self.name().to_string(), }); } if results.is_empty() { Err(EngineError("no results found".into())) } else { Ok(results) } } } /// Return the default set of search engines. /// /// These can be enabled or disabled via the `enabled_engines` config field. /// DuckDuckGo, lib.rs, and StackOverflow were removed from defaults because /// they now block automated requests. crates.io works via its public JSON API. /// /// # Example /// /// ```rust /// let engines = search_hub::search_engines::default_search_engines(); /// assert_eq!(engines.len(), 1); /// assert_eq!(engines[0].id(), "crates.io"); /// ``` pub fn default_search_engines() -> Vec<Box<dyn SearchEngine>> { vec![ Box::new(crates_io::CratesIo), ] } fn urlencode(s: &str) -> String { let mut out = String::with_capacity(s.len()); for byte in s.bytes() { match byte { b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => { out.push(byte as char); } b' ' => out.push_str("+"), _ => { out.push_str(&format!("%{:02X}", byte)); } } } out } #[cfg(test)] mod tests { use super::*; use async_trait::async_trait; struct TestEngine; #[async_trait] impl SearchEngine for TestEngine { fn id(&self) -> &str { "test" } fn name(&self) -> &str { "Test" } fn url_template(&self) -> &str { "https://example.com/?q={}" } fn selector(&self) -> &str { "div.results" } } #[test] fn test_urlencode_plain() { assert_eq!(urlencode("hello"), "hello"); } #[test] fn test_urlencode_spaces() { assert_eq!(urlencode("hello world"), "hello+world"); } #[test] fn test_urlencode_special() { assert_eq!(urlencode("a&b/c"), "a%26b%2Fc"); } #[test] fn test_urlencode_empty() { assert_eq!(urlencode(""), ""); } #[test] fn test_urlencode_alphanum() { assert_eq!(urlencode("ABC123-_~."), "ABC123-_~."); } #[test] fn test_engine_error_display() { let e = EngineError("oops".into()); assert_eq!(format!("{e}"), "oops"); } #[test] fn test_engine_error_debug() { let e = EngineError("oops".into()); assert!(format!("{e:?}").contains("EngineError")); } #[test] fn test_engine_error_is_error() { use std::error::Error; let e = EngineError("oops".into()); assert!(e.source().is_none()); } #[test] fn test_search_url_replaces_placeholder() { let e = TestEngine; assert_eq!(e.search_url("rust"), "https://example.com/?q=rust"); } #[test] fn test_search_url_encodes_query() { let e = TestEngine; assert_eq!( e.search_url("hello world"), "https://example.com/?q=hello+world" ); } #[test] fn test_search_url_special_chars() { let e = TestEngine; assert_eq!( e.search_url("a&b"), "https://example.com/?q=a%26b" ); } #[test] fn test_default_search_engines() { let engines = default_search_engines(); assert_eq!(engines.len(), 1); assert_eq!(engines[0].id(), "crates.io"); } #[test] fn test_parse_results_empty_html() { let e = TestEngine; let result = e.parse_results("<html></html>"); assert!(result.is_err()); assert_eq!(format!("{}", result.unwrap_err()), "no container matched"); } #[test] fn test_parse_results_no_links() { let e = TestEngine; let html = r#"<html><body><div class="results"><p>nothing here</p></div></body></html>"#; let result = e.parse_results(html); assert!(result.is_err()); assert_eq!(format!("{}", result.unwrap_err()), "no results found"); } #[test] fn test_parse_results_extracts_links() { let e = TestEngine; let html = r#"<html><body><div class="results"><a href="https://example.com/1">First Result</a><a href="https://example.com/2">Second Result</a></div></body></html>"#; let result = e.parse_results(html); assert!(result.is_ok()); let entries = result.unwrap(); assert_eq!(entries.len(), 2); assert_eq!(entries[0].title, "First Result"); assert_eq!(entries[0].url, "https://example.com/1"); assert_eq!(entries[0].engine, "Test"); assert_eq!(entries[1].title, "Second Result"); assert_eq!(entries[1].url, "https://example.com/2"); } #[test] fn test_parse_results_short_title_skipped() { let e = TestEngine; let html = r#"<html><body><div class="results"><a href="https://example.com/1">ab</a><a href="https://example.com/2">Long Title</a></div></body></html>"#; let result = e.parse_results(html); assert!(result.is_ok()); let entries = result.unwrap(); assert_eq!(entries.len(), 1); assert_eq!(entries[0].url, "https://example.com/2"); } #[test] fn test_parse_results_dedup_by_url() { let e = TestEngine; let html = r#"<html><body><div class="results"><a href="https://example.com/1">First</a><a href="https://example.com/1">First Duplicate</a></div></body></html>"#; let result = e.parse_results(html); assert!(result.is_ok()); let entries = result.unwrap(); assert_eq!(entries.len(), 1); } #[test] fn test_parse_results_non_http_skipped() { let e = TestEngine; let html = r#"<html><body><div class="results"><a href="https://example.com/1">Valid</a><a href="javascript:void(0)">JS Link</a><a href="mailto:test@test.com">Email</a></div></body></html>"#; let result = e.parse_results(html); assert!(result.is_ok()); let entries = result.unwrap(); assert_eq!(entries.len(), 1); assert_eq!(entries[0].url, "https://example.com/1"); } #[test] fn test_parse_results_max_ten() { let e = TestEngine; let mut links = String::new(); for i in 0..15 { links.push_str(&format!(r#"<a href="https://example.com/{i}">Result {i}</a>"#)); } let html = format!(r#"<html><body><div class="results">{links}</div></body></html>"#); let result = e.parse_results(&html); assert!(result.is_ok()); assert_eq!(result.unwrap().len(), 10); } }