// search_hub

// at 27361b0 Raw

use figment::Figment;
use figment::providers::{Format, Toml};
use serde::Deserialize;
use std::path::PathBuf;
use std::time::Duration;

use async_trait::async_trait;
use crate::search_engines::{EngineError, ResultEntry, SearchEngine};

/// Configuration for a single search engine instance.
///
/// Deserialized as an internally tagged enum: the `type` key of an entry
/// selects the variant and the remaining keys fill in that variant's fields.
/// Variant names are lowercased for the tag (`"searxng"`, `"wikipedia"`,
/// `"mdn"`, `"generic"`), except `CratesIo`, which is explicitly renamed to
/// `"crates_io"`.
///
/// Every variant carries an optional `timeout_secs` (fractional seconds). When
/// it is omitted, the `SearchEngine::timeout` implementation for this type
/// falls back to 5 seconds.
#[derive(Debug, Deserialize, Clone)]
#[serde(tag = "type", rename_all = "lowercase")]
pub enum EngineConfig {
    /// crates.io registry (public or private)
    #[serde(rename = "crates_io")]
    CratesIo {
        /// Registry API URL. When omitted, `crates_io::DEFAULT_API_URL` is used
        /// at fetch time.
        #[serde(default)]
        url: Option<String>,
        /// Request timeout in (fractional) seconds; optional.
        #[serde(default)]
        timeout_secs: Option<f32>,
    },
    /// SearXNG meta-search engine
    SearXng {
        /// Base URL of the SearXNG instance (required). A trailing `/` is
        /// trimmed before `/search?format=json&q=…` is appended.
        instance: String,
        /// Request timeout in (fractional) seconds; optional.
        #[serde(default)]
        timeout_secs: Option<f32>,
    },
    /// Wikipedia search (language-specific)
    Wikipedia {
        /// Language code (e.g. "en", "fr", "de"). When omitted,
        /// `wikipedia::DEFAULT_LANG` is used (documented as "en" — the constant
        /// itself is defined outside this file).
        #[serde(default)]
        lang: Option<String>,
        /// Request timeout in (fractional) seconds; optional.
        #[serde(default)]
        timeout_secs: Option<f32>,
    },
    /// MDN Web Docs search (language-specific)
    Mdn {
        /// Locale code (e.g. "en-US", "fr", "de"). When omitted,
        /// `mdn::DEFAULT_LOCALE` is used (documented as "en-US" — the constant
        /// itself is defined outside this file).
        #[serde(default)]
        locale: Option<String>,
        /// Request timeout in (fractional) seconds; optional.
        #[serde(default)]
        timeout_secs: Option<f32>,
    },
    /// Generic HTML-scraped search engine (configurable URL + CSS selector)
    Generic {
        /// Display name for this engine (e.g. "DuckDuckGo", "Stack Overflow").
        /// Returned verbatim by `SearchEngine::name`.
        name: String,
        /// URL template with `{}` placeholder for the query.
        url: String,
        /// CSS selector targeting the result container.
        selector: String,
        /// Request timeout in (fractional) seconds; optional.
        #[serde(default)]
        timeout_secs: Option<f32>,
    },
}

/// Lets a deserialized [`EngineConfig`] be used directly as a [`SearchEngine`].
///
/// The metadata accessors answer from the configuration itself, while
/// `fetch_results` builds the concrete engine for the variant and delegates
/// the actual request to it.
#[async_trait]
impl SearchEngine for EngineConfig {
    /// Fixed identifier of the engine kind (the same for every instance of a
    /// variant, including all `Generic` engines).
    fn id(&self) -> &str {
        match self {
            EngineConfig::CratesIo { .. } => "crates.io",
            EngineConfig::SearXng { .. } => "searxng",
            EngineConfig::Wikipedia { .. } => "wikipedia",
            EngineConfig::Mdn { .. } => "mdn",
            EngineConfig::Generic { .. } => "generic",
        }
    }

    /// Human-readable engine name; for `Generic` this is the configured `name`.
    fn name(&self) -> &str {
        match self {
            EngineConfig::CratesIo { .. } => "crates.io",
            EngineConfig::SearXng { .. } => "SearXNG",
            EngineConfig::Wikipedia { .. } => "Wikipedia",
            EngineConfig::Mdn { .. } => "MDN",
            EngineConfig::Generic { name, .. } => name,
        }
    }

    /// URL template of a `Generic` engine; the empty string for every other
    /// variant.
    fn url_template(&self) -> &str {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            EngineConfig::Generic { url, .. } => url,
            EngineConfig::CratesIo { .. }
            | EngineConfig::SearXng { .. }
            | EngineConfig::Wikipedia { .. }
            | EngineConfig::Mdn { .. } => "",
        }
    }

    /// CSS selector of a `Generic` engine; the empty string for every other
    /// variant.
    fn selector(&self) -> &str {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            EngineConfig::Generic { selector, .. } => selector,
            EngineConfig::CratesIo { .. }
            | EngineConfig::SearXng { .. }
            | EngineConfig::Wikipedia { .. }
            | EngineConfig::Mdn { .. } => "",
        }
    }

    /// Request timeout for this engine.
    ///
    /// Returns the configured `timeout_secs`, or 5 seconds when it is omitted.
    /// Because the value comes straight from the user's config file, a value
    /// that cannot be represented as a [`Duration`] (negative, NaN, infinite,
    /// or too large) also falls back to the 5 second default instead of
    /// panicking.
    fn timeout(&self) -> Duration {
        const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5);

        let configured = match self {
            EngineConfig::CratesIo { timeout_secs, .. }
            | EngineConfig::SearXng { timeout_secs, .. }
            | EngineConfig::Wikipedia { timeout_secs, .. }
            | EngineConfig::Mdn { timeout_secs, .. }
            | EngineConfig::Generic { timeout_secs, .. } => *timeout_secs,
        };

        // `Duration::from_secs_f32` panics on invalid input; the fallible
        // constructor lets a bad config value degrade to the default.
        configured
            .and_then(|secs| Duration::try_from_secs_f32(secs).ok())
            .unwrap_or(DEFAULT_TIMEOUT)
    }

    /// Runs `query` against the configured engine.
    ///
    /// Builds the concrete engine for this variant (filling omitted optional
    /// settings from that engine's `DEFAULT_*` constant) and forwards `query`
    /// and `client` to its own `fetch_results`, returning its result unchanged.
    async fn fetch_results(
        &self,
        query: &str,
        client: &reqwest::Client,
    ) -> Result<Vec<ResultEntry>, EngineError> {
        match self {
            EngineConfig::CratesIo { url, timeout_secs } => {
                let api_url = url
                    .clone()
                    .unwrap_or_else(|| crate::search_engines::crates_io::DEFAULT_API_URL.into());
                let engine = crate::search_engines::crates_io::CratesIo {
                    timeout_secs: *timeout_secs,
                    api_url,
                };
                engine.fetch_results(query, client).await
            }
            EngineConfig::SearXng { instance, timeout_secs } => {
                // `{{}}` keeps a literal `{}` in the template for the query.
                let url_tpl = format!(
                    "{}/search?format=json&q={{}}",
                    instance.trim_end_matches('/')
                );
                let engine = crate::search_engines::searxng::SearXng {
                    instance: instance.clone(),
                    url_tpl,
                    timeout_secs: *timeout_secs,
                };
                engine.fetch_results(query, client).await
            }
            EngineConfig::Wikipedia { lang, timeout_secs } => {
                let lang = lang
                    .clone()
                    .unwrap_or_else(|| crate::search_engines::wikipedia::DEFAULT_LANG.into());
                let engine = crate::search_engines::wikipedia::Wikipedia {
                    lang,
                    timeout_secs: *timeout_secs,
                };
                engine.fetch_results(query, client).await
            }
            EngineConfig::Mdn { locale, timeout_secs } => {
                let locale = locale
                    .clone()
                    .unwrap_or_else(|| crate::search_engines::mdn::DEFAULT_LOCALE.into());
                let engine = crate::search_engines::mdn::Mdn {
                    locale,
                    timeout_secs: *timeout_secs,
                };
                engine.fetch_results(query, client).await
            }
            EngineConfig::Generic { name, url, selector, timeout_secs } => {
                let engine = crate::search_engines::generic::Generic {
                    name: name.clone(),
                    url: url.clone(),
                    selector: selector.clone(),
                    timeout_secs: *timeout_secs,
                };
                engine.fetch_results(query, client).await
            }
        }
    }
}

/// Application configuration loaded from the TOML config file.
///
/// Every field is optional in the file: lists deserialize to an empty `Vec`
/// and scalar settings to `None` when absent. The default values quoted in the
/// field docs below are NOT applied in this file — presumably the code that
/// consumes each setting supplies them (TODO confirm against the callers).
///
/// # Example
///
/// ```ignore
/// let cfg = search_hub::config::Config::load();
/// let engines = cfg.engines.clone();
/// println!("{} engines enabled", engines.len());
/// ```
#[derive(Debug, Deserialize)]
pub struct Config {
    /// Custom tag definitions. If non-empty, these replace the hardcoded defaults.
    #[serde(default)]
    pub tags: Vec<crate::tagging::TagDef>,
    /// Whether auto-tagging is enabled. Defaults to `false` if not set.
    #[serde(default)]
    pub tagging_enabled: Option<bool>,
    /// Tagging threshold (0.0 to 1.0). Defaults to 0.60 if not set.
    #[serde(default)]
    pub tagging_threshold: Option<f64>,
    /// Hostnames to exclude from content fetching.
    #[serde(default)]
    pub exclude_urls: Option<Vec<String>>,
    /// Per-engine configuration. Each entry defines an enabled search engine.
    /// An empty list (the default) means no engine is enabled.
    #[serde(default)]
    pub engines: Vec<EngineConfig>,
    /// Default bookmark database path.
    #[serde(default)]
    pub db_path: Option<String>,

    /// Server bind address (default: "127.0.0.1").
    #[serde(default)]
    pub bind_address: Option<String>,
    /// Results per page (default: 20).
    #[serde(default)]
    pub page_size: Option<usize>,
    /// Actix worker threads (default: 2).
    #[serde(default)]
    pub workers: Option<usize>,

    /// ONNX embedding model name (default: "BGESmallENV15").
    #[serde(default)]
    pub onnx_model: Option<String>,
    /// Max characters to use from page content for tagging (default: 2000).
    #[serde(default)]
    pub truncation: Option<usize>,
    /// Max tags to assign per bookmark (default: 5).
    #[serde(default)]
    pub max_tags: Option<usize>,
}

impl Config {
    /// Load configuration from the default config file path.
    ///
    /// Returns a default (empty) `Config` if the file doesn't exist or can't be parsed.
    /// Parse errors are printed to stderr.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let cfg = search_hub::config::Config::load();
    /// ```
    pub fn load() -> Self {
        Self::load_from(&config_file_path())
    }

    /// Load configuration from a specific file path.
    ///
    /// Returns a default (empty) `Config` if the file doesn't exist or can't be parsed.
    /// Parse errors are printed to stderr.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let cfg = search_hub::config::Config::load_from(&PathBuf::from("/tmp/test.toml"));
    /// ```
    pub fn load_from(path: &PathBuf) -> Self {
        if path.exists() {
            Figment::new()
                .merge(Toml::file(path))
                .extract()
                .unwrap_or_else(|e| {
                    eprintln!("Warning: failed to parse config file {:?}: {}", path, e);
                    Config::default()
                })
        } else {
            Config::default()
        }
    }
}

impl Default for Config {
    fn default() -> Self {
        Config {
            tags: Vec::new(),
            tagging_enabled: None,
            tagging_threshold: None,
            exclude_urls: None,
            engines: Vec::new(),
            db_path: None,
            bind_address: None,
            page_size: None,
            workers: None,
            onnx_model: None,
            truncation: None,
            max_tags: None,
        }
    }
}

/// Location of `config.toml` inside this application's per-user config
/// directory (e.g. `~/.config/search_hub/config.toml` on Linux).
///
/// The directory is resolved through `directories::ProjectDirs`; the file is
/// not created or checked for existence here.
///
/// # Example
///
/// ```ignore
/// let path = search_hub::config::config_file_path();
/// ```
///
/// # Panics
///
/// Panics if the platform has no valid config directory.
pub fn config_file_path() -> PathBuf {
    directories::ProjectDirs::from("com", "search_hub", "search_hub")
        .map(|project| project.config_dir().join("config.toml"))
        .expect("no valid config directory")
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;
    use std::io::Write;

    /// A single SearXNG engine entry, shared by the tests that need one.
    const SEARXNG_TOML: &str = r#"
[[engines]]
type = "searxng"
instance = "https://search.example.com"
"#;

    /// Writes `contents` to a fresh temporary file and loads it through
    /// `Config::load_from`, the same way a real config file is read.
    fn load_toml(contents: &str) -> Config {
        let mut file = NamedTempFile::new().unwrap();
        write!(file, "{}", contents).unwrap();
        Config::load_from(&file.path().to_path_buf())
    }

    /// True when at least one configured engine reports the given id.
    fn has_engine(cfg: &Config, id: &str) -> bool {
        cfg.engines.iter().any(|engine| engine.id() == id)
    }

    #[test]
    fn load_from_missing_file_returns_default() {
        let missing = PathBuf::from("/nonexistent/path.toml");
        let cfg = Config::load_from(&missing);
        assert!(cfg.tags.is_empty());
        assert!(cfg.engines.is_empty());
    }

    #[test]
    fn load_from_valid_file_with_engines() {
        let cfg = load_toml(SEARXNG_TOML);
        assert!(!cfg.engines.is_empty());
        match &cfg.engines[0] {
            EngineConfig::SearXng { instance, .. } => {
                assert_eq!(instance, "https://search.example.com");
            }
            _ => panic!("expected SearXng"),
        }
    }

    #[test]
    fn resolve_engines_includes_searxng_from_engines_vec() {
        let cfg = load_toml(SEARXNG_TOML);
        assert!(has_engine(&cfg, "searxng"));
    }

    #[test]
    fn resolve_engines_empty_by_default() {
        assert!(Config::default().engines.is_empty());
    }

    #[test]
    fn resolve_engines_includes_crates_io_when_configured() {
        let cfg = load_toml(r#"
[[engines]]
type = "crates_io"
"#);
        assert!(has_engine(&cfg, "crates.io"));
    }

    #[test]
    fn resolve_engines_includes_wikipedia_when_configured() {
        let cfg = load_toml(r#"
[[engines]]
type = "wikipedia"
lang = "fr"
"#);
        assert!(has_engine(&cfg, "wikipedia"));
    }

    #[test]
    fn resolve_engines_respects_multiple_engines() {
        let cfg = load_toml(r#"
[[engines]]
type = "crates_io"

[[engines]]
type = "searxng"
instance = "https://search.example.com"
"#);
        assert!(has_engine(&cfg, "crates.io"));
        assert!(has_engine(&cfg, "searxng"));
    }

    #[test]
    fn parse_error_returns_default() {
        let cfg = load_toml("invalid toml [[[");
        assert!(cfg.tags.is_empty());
        assert!(cfg.engines.is_empty());
    }

    #[test]
    fn tagging_enabled_defaults_to_false() {
        let cfg = Config::default();
        assert!(!cfg.tagging_enabled.unwrap_or(false));
    }

    #[test]
    fn tagging_enabled_can_be_false() {
        let cfg = load_toml(r#"tagging_enabled = false"#);
        assert_eq!(cfg.tagging_enabled, Some(false));
    }

    #[test]
    fn tagging_enabled_can_be_true() {
        let cfg = load_toml(r#"tagging_enabled = true"#);
        assert_eq!(cfg.tagging_enabled, Some(true));
    }
}