// search_hub — wikipedia.rs
//
// at 9ceb48b
use async_trait::async_trait;
use serde::Deserialize;

use crate::search_engines::{EngineError, ResultEntry, SearchEngine};

/// Language code used when no other is chosen; it is the subdomain part of
/// `https://{lang}.wikipedia.org`.
pub const DEFAULT_LANG: &str = "en";

/// Search engine backed by one language edition of Wikipedia.
#[derive(Debug, Clone, PartialEq)]
pub struct Wikipedia {
    /// Language code placed in the host name of both the API URL and the
    /// generated article URLs (for example `en` or `fr`).
    pub lang: String,
    /// Optional request timeout in seconds; when `None`, `timeout()` uses
    /// 5 seconds.
    pub timeout_secs: Option<f32>,
}

/// Builds the search API URL for the `lang` edition of Wikipedia.
///
/// The returned string still contains a literal `{}` in the `srsearch`
/// parameter; the caller substitutes the URL-encoded query for it.
fn api_url(lang: &str) -> String {
    const SCHEME: &str = "https://";
    const ENDPOINT: &str =
        ".wikipedia.org/w/api.php?action=query&list=search&srsearch={}&format=json&srlimit=10";

    let mut url = String::with_capacity(SCHEME.len() + lang.len() + ENDPOINT.len());
    url.push_str(SCHEME);
    url.push_str(lang);
    url.push_str(ENDPOINT);
    url
}

/// One element of the `search` array in the API response.
#[derive(Deserialize)]
struct SearchResult {
    /// Page title; `fetch_results` also derives the article URL from it.
    title: String,
    /// Result excerpt, when present. `fetch_results` removes the
    /// `<span class="searchmatch">` wrappers and passes the text through
    /// `decode_html_entities` before exposing it as the description.
    snippet: Option<String>,
}

/// The `query` object of the API response.
#[derive(Deserialize)]
struct Query {
    /// Search hits in the order they were deserialized.
    search: Vec<SearchResult>,
}

/// Top-level JSON body returned by the search request.
#[derive(Deserialize)]
struct ApiResponse {
    /// Optional because the body may lack a `query` key; `fetch_results`
    /// treats a missing `query` as an empty hit list.
    query: Option<Query>,
}

// Wikipedia engine: queries the search API of one language edition and maps
// the JSON hits to `ResultEntry` values.
#[async_trait]
impl SearchEngine for Wikipedia {
    /// Machine-readable identifier of this engine.
    fn id(&self) -> &str {
        "wikipedia"
    }

    /// Human-readable engine name.
    fn name(&self) -> &str {
        "Wikipedia"
    }

    /// Display-only URL template.
    ///
    /// The real host depends on `self.lang`, which cannot be spliced into a
    /// borrowed string literal, so `{lang}` stays as a literal placeholder.
    /// Actual requests are built from `api_url` in `fetch_results`.
    fn url_template(&self) -> &str {
        "https://{lang}.wikipedia.org/w/index.php?search={}"
    }

    /// Returns an empty selector; results are parsed from JSON in
    /// `fetch_results`.
    fn selector(&self) -> &str {
        ""
    }

    /// Request timeout for this engine.
    ///
    /// Uses `timeout_secs` when it is set and representable as a `Duration`.
    /// Falls back to 5 seconds when it is `None` or when the value is
    /// negative, NaN, infinite or too large; `Duration::from_secs_f32` would
    /// panic on such values.
    fn timeout(&self) -> std::time::Duration {
        const DEFAULT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(5);

        self.timeout_secs
            .and_then(|secs| std::time::Duration::try_from_secs_f32(secs).ok())
            .unwrap_or(DEFAULT_TIMEOUT)
    }

    /// Runs `query` against the search API and converts each hit into a
    /// `ResultEntry`.
    ///
    /// # Errors
    ///
    /// Returns an `EngineError` when the request fails, the server answers
    /// with an HTTP error status, the body cannot be read or parsed as JSON,
    /// or the response contains no hits.
    async fn fetch_results(
        &self,
        query: &str,
        client: &reqwest::Client,
    ) -> Result<Vec<ResultEntry>, EngineError> {
        // `api_url` leaves a literal `{}` where the encoded query belongs.
        let url = api_url(&self.lang).replace("{}", &urlencode(query));
        let body = client
            .get(&url)
            .header("Accept", "application/json")
            .send()
            .await
            .map_err(|e| EngineError(format!("fetch failed: {e}")))?
            // Surface 4xx/5xx responses directly instead of letting an error
            // page fall through to a misleading JSON parse failure.
            .error_for_status()
            .map_err(|e| EngineError(format!("fetch failed: {e}")))?
            .text()
            .await
            .map_err(|e| EngineError(format!("read body failed: {e}")))?;

        let resp: ApiResponse = serde_json::from_str(&body)
            .map_err(|e| EngineError(format!("JSON parse failed: {e}")))?;

        let results: Vec<ResultEntry> = resp
            .query
            .map(|q| q.search)
            // A body without `query` is handled like an empty hit list.
            .unwrap_or_default()
            .into_iter()
            .map(|r| {
                // `urlencode` emits `+` for spaces (a literal plus becomes
                // `%2B`), so swapping `+` for `_` yields the wiki title form.
                let page_url = format!(
                    "https://{}.wikipedia.org/wiki/{}",
                    self.lang,
                    urlencode(&r.title).replace('+', "_"),
                );
                ResultEntry {
                    title: r.title,
                    url: page_url,
                    description: r.snippet.map(|s| {
                        // Drop the highlight markup around matched terms.
                        let s = s.replace("<span class=\"searchmatch\">", "");
                        let s = s.replace("</span>", "");
                        decode_html_entities(&s)
                    }),
                    engine: format!("wikipedia.{}", self.lang),
                }
            })
            .collect();

        if results.is_empty() {
            Err(EngineError("no results found".into()))
        } else {
            Ok(results)
        }
    }
}

// Shared helpers from the utils module, used above for URL encoding and for
// decoding HTML entities in snippets.
use crate::search_engines::utils::{decode_html_entities, urlencode};

#[cfg(test)]
mod tests {
    use super::*;
    use crate::search_engines::SearchEngine;

    /// Engine with the default language and no timeout override.
    fn default_engine() -> Wikipedia {
        Wikipedia {
            lang: DEFAULT_LANG.into(),
            timeout_secs: None,
        }
    }

    /// Builds an English article URL the same way `fetch_results` does.
    fn article_url(title: &str) -> String {
        let slug = urlencode(title).replace('+', "_");
        format!("https://en.wikipedia.org/wiki/{}", slug)
    }

    /// Expected API URL for a given language code.
    fn expected_api_url(lang: &str) -> String {
        [
            "https://",
            lang,
            ".wikipedia.org/w/api.php?action=query&list=search&srsearch={}&format=json&srlimit=10",
        ]
        .concat()
    }

    #[test]
    fn test_id() {
        assert_eq!(default_engine().id(), "wikipedia");
    }

    #[test]
    fn test_name() {
        assert_eq!(default_engine().name(), "Wikipedia");
    }

    #[test]
    fn test_selector() {
        assert_eq!(default_engine().selector(), "");
    }

    #[test]
    fn test_engine_construct() {
        let engine = Wikipedia {
            lang: "fr".into(),
            timeout_secs: Some(3.0),
        };
        assert_eq!(engine.lang, "fr");
        assert_eq!(engine.timeout_secs, Some(3.0));
    }

    #[test]
    fn test_title_url_spaces_become_underscores() {
        assert_eq!(
            article_url("IRC chat"),
            "https://en.wikipedia.org/wiki/IRC_chat"
        );
    }

    #[test]
    fn test_title_url_plus_sign_preserved() {
        assert_eq!(
            article_url("C++"),
            "https://en.wikipedia.org/wiki/C%2B%2B"
        );
    }

    #[test]
    fn test_api_url_format() {
        for lang in ["en", "fr"] {
            assert_eq!(api_url(lang), expected_api_url(lang));
        }
    }
}