Commit
Message
Changed Files (7)
-
modified README.md
diff --git a/README.md b/README.md index 795cdc3..dcef110 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,7 @@ Run `search_hub init-config` to create `~/.config/search_hub/config.toml` with a # name = "my-custom-tag" # examples = ["example text one", "example text two"] -# Which external search engines to use (default: ["crates.io"]) -# enabled_engines = ["crates.io"] - -# Enable auto-tagging (default: false, requires ONNX model download on first use) +# Whether auto-tagging is enabled (default: false, requires ONNX model download on first use) # tagging_enabled = true # Minimum confidence for auto-tagging (0.0 to 1.0, default: 0.6) @@ -100,6 +97,12 @@ instance = "https://search.kael.ink" # type = "crates_io" # url = "https://registry.example.com/api/v1/crates?q={}&per_page=10" # timeout_secs = 5.0 + +# Wikipedia search (optional, defaults to English) +# [[engines]] +# type = "wikipedia" +# lang = "fr" +# timeout_secs = 5.0 ``` ## Run the web server as a systemd user service -
modified src/config.rs
diff --git a/src/config.rs b/src/config.rs index bd17f27..2f395cc 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,12 +2,17 @@ use figment::Figment; use figment::providers::{Format, Toml}; use serde::Deserialize; use std::path::PathBuf; +use std::time::Duration; + +use async_trait::async_trait; +use crate::search_engines::{EngineError, ResultEntry, SearchEngine}; /// Configuration for a single search engine instance. #[derive(Debug, Deserialize, Clone)] #[serde(tag = "type", rename_all = "lowercase")] pub enum EngineConfig { /// crates.io registry (public or private) + #[serde(rename = "crates_io")] CratesIo { #[serde(default)] url: Option<String>, @@ -20,6 +25,81 @@ pub enum EngineConfig { #[serde(default)] timeout_secs: Option<f32>, }, + /// Wikipedia search (language-specific) + Wikipedia { + /// Language code (e.g. "en", "fr", "de"). Defaults to "en" if omitted. + #[serde(default)] + lang: Option<String>, + #[serde(default)] + timeout_secs: Option<f32>, + }, +} + +#[async_trait] +impl SearchEngine for EngineConfig { + fn id(&self) -> &str { + match self { + EngineConfig::CratesIo { .. } => "crates.io", + EngineConfig::SearXng { .. } => "searxng", + EngineConfig::Wikipedia { .. } => "wikipedia", + } + } + + fn name(&self) -> &str { + match self { + EngineConfig::CratesIo { .. } => "crates.io", + EngineConfig::SearXng { .. } => "SearXNG", + EngineConfig::Wikipedia { .. } => "Wikipedia", + } + } + + fn url_template(&self) -> &str { + "" + } + + fn selector(&self) -> &str { + "" + } + + fn timeout(&self) -> Duration { + let secs = match self { + EngineConfig::CratesIo { timeout_secs, .. } + | EngineConfig::SearXng { timeout_secs, .. } + | EngineConfig::Wikipedia { timeout_secs, .. 
} => timeout_secs, + }; + secs.map(|s| Duration::from_secs_f32(s)).unwrap_or(Duration::from_secs(5)) + } + + async fn fetch_results( + &self, + query: &str, + client: &reqwest::Client, + ) -> Result<Vec<ResultEntry>, EngineError> { + match self { + EngineConfig::CratesIo { url, timeout_secs } => { + let engine = crate::search_engines::crates_io::CratesIo { + timeout_secs: *timeout_secs, + api_url: url.clone().unwrap_or_else(|| crate::search_engines::crates_io::DEFAULT_API_URL.into()), + }; + engine.fetch_results(query, client).await + } + EngineConfig::SearXng { instance, timeout_secs } => { + let engine = crate::search_engines::searxng::SearXng { + instance: instance.clone(), + url_tpl: format!("{}/search?format=json&q={{}}", instance.trim_end_matches('/')), + timeout_secs: *timeout_secs, + }; + engine.fetch_results(query, client).await + } + EngineConfig::Wikipedia { lang, timeout_secs } => { + let engine = crate::search_engines::wikipedia::Wikipedia { + lang: lang.clone().unwrap_or_else(|| crate::search_engines::wikipedia::DEFAULT_LANG.into()), + timeout_secs: *timeout_secs, + }; + engine.fetch_results(query, client).await + } + } + } } /// Application configuration loaded from the TOML config file. @@ -28,7 +108,7 @@ pub enum EngineConfig { /// /// ```ignore /// let cfg = search_hub::config::Config::load(); -/// let engines = cfg.resolve_engines(); +/// let engines = cfg.engines.clone(); /// println!("{} engines enabled", engines.len()); /// ``` #[derive(Debug, Deserialize)] @@ -36,9 +116,6 @@ pub struct Config { /// Custom tag definitions. If non-empty, these replace the hardcoded defaults. #[serde(default)] pub tags: Vec<crate::tagging::TagDef>, - /// List of engine IDs to enable for inline search results. - #[serde(default)] - pub enabled_engines: Option<Vec<String>>, /// Whether auto-tagging is enabled. Defaults to `false` if not set. 
#[serde(default)] pub tagging_enabled: Option<bool>, @@ -48,7 +125,7 @@ pub struct Config { /// Hostnames to exclude from content fetching. #[serde(default)] pub exclude_urls: Option<Vec<String>>, - /// Per-engine configuration (supports multiple instances of the same engine type). + /// Per-engine configuration. Each entry defines an enabled search engine. #[serde(default)] pub engines: Vec<EngineConfig>, /// Default bookmark database path. @@ -120,7 +197,6 @@ impl Default for Config { fn default() -> Self { Config { tags: Vec::new(), - enabled_engines: None, tagging_enabled: None, tagging_threshold: None, exclude_urls: None, @@ -136,64 +212,6 @@ impl Default for Config { } } -impl Config { - /// Resolve the list of enabled search engines. - /// - /// Default engines (`crates.io`) are included unless filtered by - /// `enabled_engines`. Engines with configuration in the `engines` vec - /// are added subject to the same filter. - pub fn resolve_engines(&self) -> Vec<Box<dyn crate::search_engines::SearchEngine>> { - let is_enabled = |id: &str| -> bool { - self.enabled_engines - .as_ref() - .map(|enabled| enabled.iter().any(|e| e == id)) - .unwrap_or(true) - }; - - let mut engines: Vec<Box<dyn crate::search_engines::SearchEngine>> = Vec::new(); - - // Add default crates.io if enabled and not explicitly configured - if is_enabled("crates.io") { - let has_custom_crates_io = self.engines.iter().any(|cfg| { - matches!(cfg, EngineConfig::CratesIo { .. 
}) - }); - if !has_custom_crates_io { - engines.push(Box::new(crate::search_engines::crates_io::CratesIo { - timeout_secs: None, - api_url: crate::search_engines::crates_io::DEFAULT_API_URL.into(), - })); - } - } - - // Add configured engines - for cfg in &self.engines { - match cfg { - EngineConfig::CratesIo { url, timeout_secs } => { - if is_enabled("crates.io") { - let api_url = url.clone().unwrap_or_else(|| crate::search_engines::crates_io::DEFAULT_API_URL.into()); - engines.push(Box::new(crate::search_engines::crates_io::CratesIo { - timeout_secs: *timeout_secs, - api_url, - })); - } - } - EngineConfig::SearXng { instance, timeout_secs } => { - if is_enabled("searxng") { - let url_tpl = format!("{}/search?format=json&q={{}}", instance.trim_end_matches('/')); - engines.push(Box::new(crate::search_engines::searxng::SearXng { - instance: instance.clone(), - url_tpl, - timeout_secs: *timeout_secs, - })); - } - } - } - } - - engines - } -} - /// Return the expected config file path (e.g. `~/.config/search_hub/config.toml` on Linux). 
/// /// # Example @@ -254,40 +272,57 @@ instance = "https://search.example.com" "#).unwrap(); let cfg = Config::load_from(&file.path().to_path_buf()); - let engines = cfg.resolve_engines(); + let engines = cfg.engines.clone(); assert!(engines.iter().any(|e| e.id() == "searxng")); } #[test] - fn resolve_engines_filters_with_enabled_list() { + fn resolve_engines_empty_by_default() { + let cfg = Config::default(); + assert!(cfg.engines.is_empty()); + } + + #[test] + fn resolve_engines_includes_crates_io_when_configured() { let mut file = NamedTempFile::new().unwrap(); write!(file, r#" -enabled_engines = ["crates.io"] [[engines]] -type = "searxng" -instance = "https://search.example.com" +type = "crates_io" "#).unwrap(); let cfg = Config::load_from(&file.path().to_path_buf()); - let engines = cfg.resolve_engines(); - // searxng should be excluded because it's not in enabled_engines - assert!(!engines.iter().any(|e| e.id() == "searxng")); - // crates.io should be included + let engines = cfg.engines.clone(); assert!(engines.iter().any(|e| e.id() == "crates.io")); } #[test] - fn resolve_engines_includes_searxng_when_in_enabled_list() { + fn resolve_engines_includes_wikipedia_when_configured() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, r#" +[[engines]] +type = "wikipedia" +lang = "fr" +"#).unwrap(); + + let cfg = Config::load_from(&file.path().to_path_buf()); + let engines = cfg.engines.clone(); + assert!(engines.iter().any(|e| e.id() == "wikipedia")); + } + + #[test] + fn resolve_engines_respects_multiple_engines() { let mut file = NamedTempFile::new().unwrap(); write!(file, r#" -enabled_engines = ["crates.io", "searxng"] +[[engines]] +type = "crates_io" + [[engines]] type = "searxng" instance = "https://search.example.com" "#).unwrap(); let cfg = Config::load_from(&file.path().to_path_buf()); - let engines = cfg.resolve_engines(); + let engines = cfg.engines.clone(); assert!(engines.iter().any(|e| e.id() == "crates.io")); 
assert!(engines.iter().any(|e| e.id() == "searxng")); } -
modified src/main.rs
diff --git a/src/main.rs b/src/main.rs index d548bb0..6c680b0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,7 +9,6 @@ use search_hub::importer::firefox::FirefoxImporter; use search_hub::importer::zen::ZenImporter; use search_hub::importer::Importer; use search_hub::models::Bookmark; -use search_hub::search_engines::SearchEngine; use search_hub::storage; use search_hub::tagging::{default_tags, TagDef, TaggingEngine}; use search_hub::web; @@ -298,7 +297,7 @@ async fn main() { Some(p) => Config::load_from(p), None => Config::load(), }; - let engines: Vec<Box<dyn SearchEngine>> = config.resolve_engines(); + let engines = config.engines.clone(); let tagging_enabled = config.tagging_enabled.unwrap_or(false); let tag_threshold: f32 = config.tagging_threshold.map(|t| t as f32).unwrap_or(0.60); let exclude_hosts: Vec<String> = config.exclude_urls.clone().unwrap_or_else(|| { @@ -577,8 +576,6 @@ async fn main() { # name = \"my-custom-tag\"\n\ # examples = [\"example text one\", \"example text two\"]\n\ \n\ - # Which external search engines to use (default: [\"crates.io\"])\n\ - # enabled_engines = [\"crates.io\"]\n\ \n\ # Whether auto-tagging is enabled (default: false)\n\ # tagging_enabled = true\n\ @@ -590,7 +587,8 @@ async fn main() { # exclude_urls = [\"localhost\", \"127.0.0.1\", \"::1\"]\n\ \n\ # Per-engine configuration (optional)\n\ - # [engines.searxng]\n\ + # [[engines]]\n\ + # type = \"searxng\"\n\ # instance = \"https://search.kael.ink\"\n\ # Best: use an existing public instance (see https://searx.space).\n\ # Also possible: run your own with Docker:\n\ -
modified src/search_engines/mod.rs
diff --git a/src/search_engines/mod.rs b/src/search_engines/mod.rs index 3e6ee03..57a9c59 100644 --- a/src/search_engines/mod.rs +++ b/src/search_engines/mod.rs @@ -1,5 +1,7 @@ pub mod crates_io; pub mod searxng; +pub mod utils; +pub mod wikipedia; use async_trait::async_trait; use scraper::{Html, Selector}; @@ -96,7 +98,7 @@ pub trait SearchEngine: Send + Sync { /// Build a search URL from the given query by replacing `{}` with the /// URL-encoded query string. fn search_url(&self, query: &str) -> String { - self.url_template().replace("{}", &urlencode(query)) + self.url_template().replace("{}", &utils::urlencode(query)) } /// Fetch search results from the engine for the given query. @@ -179,10 +181,7 @@ pub trait SearchEngine: Send + Sync { } /// Return the default set of search engines. -/// -/// These can be enabled or disabled via the `enabled_engines` config field. -/// DuckDuckGo, lib.rs, and StackOverflow were removed from defaults because -/// they now block automated requests. crates.io works via its public JSON API. +/// crates.io works via its public JSON API. /// /// # Example /// @@ -197,22 +196,6 @@ pub fn default_search_engines() -> Vec<Box<dyn SearchEngine>> { ] } -fn urlencode(s: &str) -> String { - let mut out = String::with_capacity(s.len()); - for byte in s.bytes() { - match byte { - b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' 
| b'~' => { - out.push(byte as char); - } - b' ' => out.push_str("+"), - _ => { - out.push_str(&format!("%{:02X}", byte)); - } - } - } - out -} - #[cfg(test)] mod tests { use super::*; @@ -228,31 +211,6 @@ mod tests { fn selector(&self) -> &str { "div.results" } } - #[test] - fn test_urlencode_plain() { - assert_eq!(urlencode("hello"), "hello"); - } - - #[test] - fn test_urlencode_spaces() { - assert_eq!(urlencode("hello world"), "hello+world"); - } - - #[test] - fn test_urlencode_special() { - assert_eq!(urlencode("a&b/c"), "a%26b%2Fc"); - } - - #[test] - fn test_urlencode_empty() { - assert_eq!(urlencode(""), ""); - } - - #[test] - fn test_urlencode_alphanum() { - assert_eq!(urlencode("ABC123-_~."), "ABC123-_~."); - } - #[test] fn test_engine_error_display() { let e = EngineError("oops".into()); -
added src/search_engines/utils.rs
/// Percent-encode `s` for use as a URL query value (form style).
///
/// ASCII letters, digits and the characters `-`, `_`, `.`, `~` are copied
/// through unchanged. A space becomes `+`. Every other byte — including each
/// byte of a multi-byte UTF-8 sequence — is written as `%XX` using uppercase
/// hexadecimal digits.
///
/// Because spaces are encoded as `+`, the result is only suitable for query
/// strings; in a URL *path* a `+` is a literal plus sign.
pub fn urlencode(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    // At least one output byte per input byte; escapes grow the buffer as needed.
    let mut out = String::with_capacity(s.len());
    for byte in s.bytes() {
        match byte {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                out.push(char::from(byte));
            }
            b' ' => out.push('+'),
            _ => {
                // Emit the two hex nibbles directly instead of allocating a
                // temporary `String` with `format!` for every escaped byte.
                out.push('%');
                out.push(char::from(HEX[usize::from(byte >> 4)]));
                out.push(char::from(HEX[usize::from(byte & 0x0F)]));
            }
        }
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_urlencode_plain() {
        assert_eq!(urlencode("hello"), "hello");
    }

    #[test]
    fn test_urlencode_spaces() {
        assert_eq!(urlencode("hello world"), "hello+world");
    }

    #[test]
    fn test_urlencode_special() {
        assert_eq!(urlencode("a&b/c"), "a%26b%2Fc");
    }

    #[test]
    fn test_urlencode_empty() {
        assert_eq!(urlencode(""), "");
    }

    #[test]
    fn test_urlencode_alphanum() {
        assert_eq!(urlencode("ABC123-_~."), "ABC123-_~.");
    }

    #[test]
    fn test_urlencode_non_ascii() {
        // Each UTF-8 byte is escaped separately, with uppercase hex digits.
        assert_eq!(urlencode("é"), "%C3%A9");
    }
}
added src/search_engines/wikipedia.rs
diff --git a/src/search_engines/wikipedia.rs b/src/search_engines/wikipedia.rs new file mode 100644 index 0000000..5116c19 --- /dev/null +++ b/src/search_engines/wikipedia.rs @@ -0,0 +1,153 @@ +use async_trait::async_trait; +use serde::Deserialize; + +use crate::search_engines::{EngineError, ResultEntry, SearchEngine}; + +pub const DEFAULT_LANG: &str = "en"; + +pub struct Wikipedia { + pub lang: String, + pub timeout_secs: Option<f32>, +} + +fn api_url(lang: &str) -> String { + format!("https://{lang}.wikipedia.org/w/api.php?action=query&list=search&srsearch={{}}&format=json&srlimit=10") +} + +#[derive(Deserialize)] +struct SearchResult { + title: String, + snippet: Option<String>, +} + +#[derive(Deserialize)] +struct Query { + search: Vec<SearchResult>, +} + +#[derive(Deserialize)] +struct ApiResponse { + query: Option<Query>, +} + +#[async_trait] +impl SearchEngine for Wikipedia { + fn id(&self) -> &str { + "wikipedia" + } + + fn name(&self) -> &str { + "Wikipedia" + } + + fn url_template(&self) -> &str { + // dynamic, so we construct at build time + // this is only a best-effort display + "https://{lang}.wikipedia.org/w/index.php?search={}" + } + + fn selector(&self) -> &str { + "" + } + + fn timeout(&self) -> std::time::Duration { + self.timeout_secs + .map(|s| std::time::Duration::from_secs_f32(s)) + .unwrap_or_else(|| std::time::Duration::from_secs(5)) + } + + async fn fetch_results( + &self, + query: &str, + client: &reqwest::Client, + ) -> Result<Vec<ResultEntry>, EngineError> { + let url = api_url(&self.lang).replace("{}", &urlencode(query)); + let body = client + .get(&url) + .header("Accept", "application/json") + .send() + .await + .map_err(|e| EngineError(format!("fetch failed: {e}")))? 
+ .text() + .await + .map_err(|e| EngineError(format!("read body failed: {e}")))?; + + let resp: ApiResponse = serde_json::from_str(&body) + .map_err(|e| EngineError(format!("JSON parse failed: {e}")))?; + + let results: Vec<ResultEntry> = resp + .query + .map(|q| q.search) + .unwrap_or_default() + .into_iter() + .map(|r| { + let page_url = format!("https://{}.wikipedia.org/wiki/{}", + self.lang, + urlencode(&r.title), + ); + ResultEntry { + title: r.title, + url: page_url, + description: r.snippet.map(|s| { + let s = s.replace("<span class=\"searchmatch\">", ""); + let s = s.replace("</span>", ""); + s + }), + engine: "wikipedia".into(), + } + }) + .collect(); + + if results.is_empty() { + Err(EngineError("no results found".into())) + } else { + Ok(results) + } + } +} + +// share urlencode from the utils module +use crate::search_engines::utils::urlencode; + +#[cfg(test)] +mod tests { + use super::*; + use crate::search_engines::SearchEngine; + + #[test] + fn test_id() { + let e = Wikipedia { lang: DEFAULT_LANG.into(), timeout_secs: None }; + assert_eq!(e.id(), "wikipedia"); + } + + #[test] + fn test_name() { + let e = Wikipedia { lang: DEFAULT_LANG.into(), timeout_secs: None }; + assert_eq!(e.name(), "Wikipedia"); + } + + #[test] + fn test_selector() { + let e = Wikipedia { lang: DEFAULT_LANG.into(), timeout_secs: None }; + assert_eq!(e.selector(), ""); + } + + #[test] + fn test_engine_construct() { + let e = Wikipedia { lang: "fr".into(), timeout_secs: Some(3.0) }; + assert_eq!(e.lang, "fr"); + assert_eq!(e.timeout_secs, Some(3.0)); + } + + #[test] + fn test_api_url_format() { + assert_eq!( + api_url("en"), + "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={}&format=json&srlimit=10" + ); + assert_eq!( + api_url("fr"), + "https://fr.wikipedia.org/w/api.php?action=query&list=search&srsearch={}&format=json&srlimit=10" + ); + } +} -
modified src/web.rs
diff --git a/src/web.rs b/src/web.rs index 0740238..d968fb2 100644 --- a/src/web.rs +++ b/src/web.rs @@ -1,3 +1,4 @@ +use crate::config::EngineConfig; use crate::search_engines::{EngineError, ResultEntry, SearchEngine}; use crate::storage; use actix_web::{get, web, App, HttpRequest, HttpResponse, HttpServer, Responder}; @@ -71,7 +72,7 @@ async fn search( query: web::Query<SearchQuery>, templates: web::Data<Tera>, db_pool: web::Data<DbPool>, - engines: web::Data<Vec<Box<dyn SearchEngine>>>, + engines: web::Data<Vec<EngineConfig>>, cfg: web::Data<ServerConfig>, ) -> impl Responder { let start = Instant::now(); @@ -207,7 +208,7 @@ pub struct SearchQuery { pub async fn run_server( db_path: &str, cfg: ServerConfig, - engines: Vec<Box<dyn SearchEngine>>, + engines: Vec<EngineConfig>, ) -> std::io::Result<()> { let db_pool = web::Data::new(DbPool::new(db_path)); let engines = web::Data::new(engines);