Commit
Message
Changed Files (26)
-
modified .gitignore
diff --git a/.gitignore b/.gitignore index bd5ef61..1b69028 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ # SearchHub project directory structure /target +.fastembed_cache -
modified Cargo.lock
diff --git a/Cargo.lock b/Cargo.lock index 8d996f5..f37b8d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,15 +29,15 @@ dependencies = [ "actix-rt", "actix-service", "actix-utils", - "base64", + "base64 0.22.1", "bitflags", "brotli", "bytes", "bytestring", - "derive_more", + "derive_more 2.1.1", "encoding_rs", "flate2", - "foldhash", + "foldhash 0.1.5", "futures-core", "h2 0.3.27", "http 0.2.12", @@ -148,10 +148,10 @@ dependencies = [ "bytes", "bytestring", "cfg-if", - "cookie", - "derive_more", + "cookie 0.16.2", + "derive_more 2.1.1", "encoding_rs", - "foldhash", + "foldhash 0.1.5", "futures-core", "futures-util", "impl-more", @@ -198,7 +198,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "getrandom 0.3.4", "once_cell", + "serde", "version_check", "zerocopy", ] @@ -212,6 +214,24 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + [[package]] name = "alloc-no-stdlib" version = "2.0.4" @@ -227,6 +247,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -292,6 +318,58 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" 
+[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" + +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "as-slice" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atomic" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340" +dependencies = [ + "bytemuck", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -304,18 +382,88 @@ version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" +[[package]] +name = "av-scenechange" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" +dependencies = [ + "aligned", + "anyhow", + "arg_enum_proc_macro", + "arrayvec", + "log", + "num-rational", + "num-traits", + "pastey", + "rayon", + "thiserror", 
+ "v_frame", + "y4m", +] + +[[package]] +name = "av1-grain" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom 8.0.0", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7178fe5f7d460b13895ebb9dcb28a3a6216d2df2574a0806cb51b555d297f38" +dependencies = [ + "arrayvec", +] + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bit_field" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" + [[package]] name = "bitflags" version = "2.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" +[[package]] +name = "bitstream-io" +version = "4.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f" +dependencies = [ + "no_std_io2", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -365,12 +513,36 @@ dependencies = [ "serde", ] +[[package]] +name = "built" +version = "0.8.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c0e531d93d39c34eef561e929e8a7f86d77a5af08aac4f6d6e39976c51858e9" + [[package]] name = "bumpalo" version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + [[package]] name = "bytes" version = "1.11.1" @@ -386,6 +558,15 @@ dependencies = [ "bytes", ] +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.64" @@ -491,12 +672,43 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "colorchoice" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + +[[package]] +name = "compact_str" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "console" version = "0.15.11" @@ -510,6 +722,18 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "console" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" +dependencies = [ + "encode_unicode", + "libc", + "unicode-width", + "windows-sys 0.61.2", +] + [[package]] name = "const-oid" version = "0.10.2" @@ -536,6 +760,35 @@ dependencies = [ "version_check", ] +[[package]] +name = "cookie" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ddef33a339a91ea89fb53151bd0a4689cfce27055c291dfa69945475d22c747" +dependencies = [ + "percent-encoding", + "time", + "version_check", +] + +[[package]] +name = "cookie_store" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b2c103cf610ec6cae3da84a766285b42fd16aad564758459e6ecf128c75206" +dependencies = [ + "cookie 0.18.1", + "document-features", + "idna", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "time", + "url", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -614,6 +867,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.7" @@ -633,12 +892,131 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.3", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + +[[package]] +name = "der" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" +dependencies = [ + 
"pem-rfc7468", + "zeroize", +] + [[package]] name = "deranged" version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "derive_more" version = "2.1.1" @@ -698,6 +1076,15 @@ dependencies = [ "dirs-sys", ] +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + [[package]] name = "dirs-sys" version = "0.5.0" @@ -721,6 +1108,42 @@ dependencies = [ "syn", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + +[[package]] +name = "dtoa" +version = 
"1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + [[package]] name = "encode_unicode" version = "1.0.0" @@ -736,6 +1159,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -752,6 +1195,27 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" + +[[package]] +name = "exr" +version = "1.74.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + 
"zune-inflate", +] + [[package]] name = "fallible-iterator" version = "0.3.0" @@ -764,11 +1228,56 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fastembed" +version = "5.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545e4fb17fc48768ff36c2a3854aa5b0b809d0ed595ab5530fa8ac94f31bd0ea" +dependencies = [ + "anyhow", + "hf-hub", + "image", + "ndarray", + "ort", + "safetensors", + "serde", + "serde_json", + "tokenizers", +] + [[package]] name = "fastrand" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "fax" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf1079563223d5d59d83c85886a56e586cfd5c1a26292e971a0fa266531ac5a" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "figment" +version = "0.10.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cb01cd46b0cf372153850f4c6c272d9cbea2da513e07538405148f95bd789f3" +dependencies = [ + "atomic", + "serde", + "toml", + "uncased", + "version_check", +] [[package]] name = "find-msvc-tools" @@ -798,6 +1307,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "foreign-types" version = "0.3.2" @@ -822,6 +1337,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures-channel" version = "0.3.32" @@ -829,7 +1354,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", - "futures-sink", ] [[package]] @@ -844,6 +1368,17 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "futures-sink" version = "0.3.32" @@ -864,6 +1399,7 @@ checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -871,6 +1407,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -881,6 +1426,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -916,6 +1470,16 @@ dependencies = [ "rand_core 0.10.1", ] +[[package]] +name = "gif" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159" +dependencies = [ + "color_quant", + "weezl", +] + [[package]] name = "globset" version = "0.4.18" @@ -978,6 +1542,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -987,6 +1562,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", + "serde", + "serde_core", +] + [[package]] name = "hashbrown" version = "0.17.1" @@ -1008,17 +1596,58 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hf-hub" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213" +dependencies = [ + "dirs", + "http 1.4.2", + "indicatif 0.18.4", + "libc", + "log", + "native-tls", + "rand 0.9.4", + "reqwest", + "serde", + "serde_json", + "thiserror", + "ureq", + "windows-sys 0.61.2", +] + +[[package]] +name = "hmac-sha256" +version = "1.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ec9d92d097f4749b64e8cc33d924d9f40a2d4eb91402b458014b781f5733d60f" + [[package]] name = "htmd" version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7eee9b00ee2e599b4f86507157e3db786e7a3319fc225f0e9584151dbea2291d" dependencies = [ - "html5ever", + "html5ever 0.38.0", "markup5ever_rcdom", "phf 0.13.1", ] +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever 0.12.1", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "html5ever" version = "0.38.0" @@ -1026,7 +1655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2" dependencies = [ "log", - "markup5ever", + "markup5ever 0.38.0", ] [[package]] @@ -1161,7 +1790,7 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-util", @@ -1286,6 +1915,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -1323,6 +1958,46 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + 
+[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imgref" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89194689a993ab15268672e99e7b0e19da2da3268ac682e8f02d29d4d1434cd7" + [[package]] name = "impl-more" version = "0.1.9" @@ -1345,13 +2020,37 @@ version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ - "console", + "console 0.15.11", "number_prefix", "portable-atomic", "unicode-width", "web-time", ] +[[package]] +name = "indicatif" +version = "0.18.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" +dependencies = [ + "console 0.16.3", + "portable-atomic", + "unicode-width", + "unit-prefix", + "web-time", +] + +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "ipnet" version = "2.12.0" @@ -1364,6 +2063,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -1403,12 +2111,28 @@ version = "1.5.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lebe" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" + [[package]] name = "libc" version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" +[[package]] +name = "libfuzzer-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9fd2f41a1cba099f79a0b6b6c35656cf7c03351a7bae8ff0f28f25270f929d2" +dependencies = [ + "arbitrary", + "cc", +] + [[package]] name = "libm" version = "0.2.16" @@ -1447,6 +2171,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "local-channel" version = "0.1.5" @@ -1479,6 +2209,57 @@ version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + +[[package]] +name = "lzma-rust2" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e20f57f9918e5bd7bc58c22cdd70a6afc7375d4dd9683af5f2b34bd3d2bba619" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache 0.8.9", + "string_cache_codegen 0.5.4", + "tendril 0.4.3", +] + [[package]] name = "markup5ever" version = "0.38.0" @@ -1486,7 +2267,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862" dependencies = [ "log", - "tendril", + "tendril 0.5.0", "web_atoms", ] @@ -1496,12 +2277,32 @@ version = "0.38.0+unofficial" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "333171ccdf66e915257740d44e38ea5b1b19ce7b45d33cc35cb6f118fbd981ff" dependencies = [ - "html5ever", - "markup5ever", - "tendril", + "html5ever 0.38.0", + "markup5ever 0.38.0", + "tendril 0.5.0", "xml5ever", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" 
+dependencies = [ + "cfg-if", + "rayon", +] + [[package]] name = "memchr" version = "2.8.2" @@ -1514,6 +2315,12 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -1537,27 +2344,108 @@ dependencies = [ ] [[package]] -name = "native-tls" -version = "0.2.18" +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "ndarray" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d" +dependencies = [ + "matrixmultiply", + 
"num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "no_std_io2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418abd1b6d34fbf6cae440dc874771b0525a604428704c76e48b29a5e67b8003" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom" +version = "8.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" dependencies = [ - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", + "memchr", ] [[package]] -name = "new_debug_unreachable" -version = "1.0.6" +name = "noop_proc_macro" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" [[package]] name = "nu-ansi-term" @@ -1568,12 +2456,62 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521739c6d2bac4aa25192232afe6841231376b2b26d4d9fae5ecf8ca5772e441" +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1601,6 +2539,28 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "onig" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +dependencies = [ + "bitflags", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "openssl" version = "0.10.81" @@ -1650,6 +2610,30 @@ version = "0.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ort" +version = "2.0.0-rc.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7de3af33d24a745ffb8fab904b13478438d1cd52868e6f17735ef6e1f8bf133" +dependencies = [ + "ndarray", + "ort-sys", + "smallvec", + "tracing", + "ureq", +] + +[[package]] +name = "ort-sys" +version = "2.0.0-rc.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90" +dependencies = [ + "hmac-sha256", + "lzma-rust2", + "ureq", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -1682,6 +2666,27 @@ dependencies = [ "regex", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "pem-rfc7468" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1731,12 +2736,22 @@ dependencies = [ "sha2", ] +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + [[package]] name = "phf" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ + "phf_macros 0.11.3", "phf_shared 
0.11.3", ] @@ -1746,11 +2761,21 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" dependencies = [ - "phf_macros", + "phf_macros 0.13.1", "phf_shared 0.13.1", "serde", ] +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + [[package]] name = "phf_codegen" version = "0.11.3" @@ -1771,6 +2796,16 @@ dependencies = [ "phf_shared 0.13.1", ] +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.6", +] + [[package]] name = "phf_generator" version = "0.11.3" @@ -1791,6 +2826,19 @@ dependencies = [ "phf_shared 0.13.1", ] +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "phf_macros" version = "0.13.1" @@ -1804,13 +2852,22 @@ dependencies = [ "syn", ] +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + [[package]] name = "phf_shared" version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ - "siphasher", + "siphasher 1.0.3", ] [[package]] @@ -1819,7 +2876,7 @@ version = 
"0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" dependencies = [ - "siphasher", + "siphasher 1.0.3", ] [[package]] @@ -1834,12 +2891,34 @@ version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + [[package]] name = "portable-atomic" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -1879,6 +2958,46 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "profiling" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d595e54a326bc53c1c197b32d295e14b169e3cfeaa8dc82b529f947fba6bcf5" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4488a4a36b9a4ba6b9334a32a39971f77c1436ec82c38707bce707699cc3bbcb" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "pxfm" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" + +[[package]] +name = "qoi" +version = "0.4.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quote" version = "1.0.45" @@ -1907,10 +3026,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", - "rand_chacha", + "rand_chacha 0.3.1", "rand_core 0.6.4", ] +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + [[package]] name = "rand" version = "0.10.1" @@ -1932,6 +3061,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -1941,12 +3080,108 @@ dependencies = [ "getrandom 0.2.17", ] +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "rand_core" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63b8176103e19a2643978565ca18b50549f6101881c443590420e4dc998a3c69" +[[package]] +name = "rav1e" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" +dependencies = [ + "aligned-vec", + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av-scenechange", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "paste", + "profiling", + "rand 0.9.4", + "rand_chacha 0.9.0", + "simd_helpers", + "thiserror", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rayon", + "rgb", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2008,10 +3243,9 @@ version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "encoding_rs", - "futures-channel", "futures-core", "futures-util", "h2 0.4.15", @@ -2035,15 +3269,23 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-native-tls", + "tokio-util", "tower", "tower-http", "tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", ] +[[package]] +name = "rgb" +version = "0.8.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" + [[package]] name = "ring" version = "0.17.14" @@ -2058,6 +3300,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "robotstxt" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbc52377db80e3fec3a2c748ca603b8b6cacdd34ff89ff4b742a635361d4b4a7" + [[package]] name = "rusqlite" version = "0.32.1" @@ -2101,7 +3349,9 @@ version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ + "log", "once_cell", + "ring", "rustls-pki-types", "rustls-webpki", "subtle", @@ -2140,6 +3390,19 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "safetensors" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b079b829cb27a1c3c374341345ed2e8b2c0c839034522cee576c140bd7f846" +dependencies = [ + "hashbrown 0.16.1", + "libc", + "serde", + "serde_json", + "tempfile", +] + [[package]] name = "same-file" version = "1.0.6" @@ -2164,26 +3427,50 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = 
"scraper" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0" +dependencies = [ + "ahash", + "cssparser", + "ego-tree", + "getopts", + "html5ever 0.27.0", + "once_cell", + "selectors", + "tendril 0.4.3", +] + [[package]] name = "search_hub" version = "0.1.0" dependencies = [ "actix-web", "anyhow", + "async-trait", "chrono", "clap", + "colored", "directories", + "fastembed", + "figment", "htmd", - "indicatif", + "indicatif 0.17.11", "reqwest", + "robotstxt", "rusqlite", + "scraper", "serde", "serde_json", "tempfile", "tera", "tokio", + "toml", "tracing", "tracing-subscriber", + "url", ] [[package]] @@ -2209,6 +3496,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags", + "cssparser", + "derive_more 0.99.20", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "semver" version = "1.0.28" @@ -2258,6 +3564,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2270,6 +3585,15 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha1" version = "0.11.0" @@ -2323,6 +3647,21 @@ version = "0.3.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + [[package]] name = "siphasher" version = "1.0.3" @@ -2371,12 +3710,54 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom 7.1.3", + "serde", + "unicode-segmentation", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + [[package]] name = "string_cache" 
version = "0.9.0" @@ -2390,6 +3771,18 @@ dependencies = [ "serde", ] +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "string_cache_codegen" version = "0.6.1" @@ -2479,6 +3872,17 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "tendril" version = "0.5.0" @@ -2540,6 +3944,20 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tiff" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b63feaf3343d35b6ca4d50483f94843803b0f51634937cc2ec519fc32232bc52" +dependencies = [ + "fax", + "flate2", + "half", + "quick-error", + "weezl", + "zune-jpeg", +] + [[package]] name = "time" version = "0.3.49" @@ -2580,6 +3998,39 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tokenizers" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223" +dependencies = [ + "ahash", + "aho-corasick", + "compact_str", + "dary_heap", + "derive_builder", + "esaxx-rs", + "getrandom 0.3.4", + "itertools", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand 0.9.4", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.52.3" @@ -2641,6 +4092,47 @@ dependencies = [ "tokio", ] +[[package]] +name 
= "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tower" version = "0.5.3" @@ -2762,12 +4254,30 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "uncased" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b88fcfe09e89d3866a5c11019378088af2d24c3fbd4f0543f96b479ec90697" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + [[package]] name = "unicode-segmentation" version = "1.13.3" @@ 
-2786,12 +4296,60 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "untrusted" version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" +dependencies = [ + "base64 0.22.1", + "cookie_store", + "der", + "flate2", + "log", + "native-tls", + "percent-encoding", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "socks", + "ureq-proto", + "utf8-zero", + "webpki-root-certs", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" +dependencies = [ + "base64 0.22.1", + "http 1.4.2", + "httparse", + "log", +] + [[package]] name = "url" version = "2.5.8" @@ -2810,6 +4368,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8-zero" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" + [[package]] name = "utf8_iter" version = 
"1.0.4" @@ -2822,6 +4386,17 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + [[package]] name = "valuable" version = "0.1.1" @@ -2929,6 +4504,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "web-sys" version = "0.3.102" @@ -2957,10 +4545,50 @@ checksum = "d7cff6eef815df1834fd250e3a2ff436044d82a9f1bc1980ca1dbdf07effc538" dependencies = [ "phf 0.13.1", "phf_codegen 0.13.1", - "string_cache", - "string_cache_codegen", + "string_cache 0.9.0", + "string_cache_codegen 0.6.1", +] + +[[package]] +name = "webpki-root-certs" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d46a5a140e6f7afeccd8eae97eff335163939eac8b929834875168b29b3d267" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "webpki-roots" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", ] +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -2970,6 +4598,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.62.2" @@ -3131,6 +4765,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen" version = "0.57.1" @@ -3150,9 +4793,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3dc9559429edf0cd3f327cc0afd9d6b36fa8cec6d93107b7fbe64f806b5f2d9" dependencies = [ "log", - "markup5ever", + "markup5ever 0.38.0", ] +[[package]] +name = "y4m" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" + [[package]] name = "yoke" version = "0.8.3" @@ -3289,3 +4938,27 @@ dependencies = [ "cc", "pkg-config", ] + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = 
"zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] -
modified Cargo.toml
diff --git a/Cargo.toml b/Cargo.toml index 69dc851..e924945 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,20 +6,28 @@ edition = "2021" [dependencies] actix-web = "4" anyhow = "1.0" +async-trait = "0.1" chrono = { version = "0.4", features = ["serde"] } clap = { version = "4.5", features = ["derive"] } +colored = "2" directories = "6" +fastembed = "5" +figment = { version = "0.10", features = ["toml"] } htmd = "0.5" +scraper = "0.20" indicatif = "0.17" -reqwest = { version = "0.12", features = ["blocking"] } +reqwest = "0.12" +robotstxt = "0.3" rusqlite = { version = "0.32", features = ["bundled", "chrono"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" tempfile = "3" tera = "1.8" +toml = "0.8" tokio = { version = "1.0", features = ["full"] } tracing = "0.1" tracing-subscriber = "0.3" +url = "2" [build-dependencies] tera = "1.8" -
modified README.md
diff --git a/README.md b/README.md index 57aa442..cf13fc1 100644 --- a/README.md +++ b/README.md @@ -1,84 +1,107 @@ # SearchHub -A local searchable database of browser bookmarks with an HTTP interface that also forwards queries to external search engines. +A local search engine for your browser bookmarks. Import bookmarks from Firefox, Zen, Chrome, or Chromium, search them with full-text queries, and optionally forward searches to crates.io (via its public JSON API). Content is automatically tagged via local ONNX embeddings. -## Features +## Install -- **Local Bookmark Database**: Search your bookmarks with FTS5 full-text search -- **HTTP Interface**: Actix-web server with search form -- **Bookmark Import**: Import bookmarks from Firefox (Chrome/Chromium support planned) -- **CLI Tools**: `insert`, `list`, `remove`, and `serve` subcommands -- **Multi-Engine Forwarding**: (Planned) DuckDuckGo, lib.rs, crates.io, StackOverflow +**Prerequisites:** Rust (install via [rustup](https://rustup.rs/)). -## Usage +```sh +git clone https://github.com/your/repo.git +cd search_hub +cargo install --path . +``` -All commands use a platform-appropriate default database path (e.g. `~/.local/share/search_hub/bookmarks.db` on Linux). Override with `--db-path`. +This installs the `search_hub` binary to `~/.cargo/bin/search_hub`. -``` -cargo run -- serve # Start web server on :8080 -cargo run -- serve --port 3000 # Custom port -cargo run -- insert "Title" "url" # Add a bookmark -cargo run -- list # List all bookmarks -cargo run -- remove --id 1 # Delete by ID -cargo run -- import firefox # Import from Firefox (auto-discover profile) -cargo run -- import firefox --profile ~/.mozilla/firefox/abc.default +To update later, pull the latest code and reinstall. 
+ +## First steps + +```sh +# Import bookmarks from Firefox (auto-discovers your profile) +search_hub import firefox + +# Import from Chrome +search_hub import chrome + +# Start the web UI +search_hub serve ``` -## Project Structure +Open http://127.0.0.1:8080 in your browser. You can now search your bookmarks. -- `src/lib.rs` — Library root (models, storage, web, importer) -- `src/main.rs` — CLI entry point with clap subcommands -- `src/models.rs` — Bookmark data types -- `src/storage.rs` — SQLite/FTS5 database operations -- `src/web.rs` — Actix-web server and handlers -- `src/importer/` — Browser bookmark importers -- `templates/` — Tera HTML templates +Search queries are also forwarded to [crates.io](https://crates.io) via its public JSON API. SearchHub also works as a custom search provider in Firefox/Zen via the OpenSearch protocol (your browser should auto-discover it at `/opensearch.xml`). -## Technology Stack +## CLI reference -- [Rust](https://www.rust-lang.org/) -- [Actix-web](https://actix.rs/) — HTTP server -- [Tera](https://tera.netlify.app/) — Template engine -- [rusqlite](https://github.com/rusqlite/rusqlite) — SQLite with FTS5 -- [clap](https://docs.rs/clap/) — CLI argument parsing +| Command | What it does | +|---------|-------------| +| `search_hub serve` | Start web UI on port 8080 | +| `search_hub serve --port 3000` | Start on a custom port | +| `search_hub import firefox` | Import bookmarks from Firefox | +| `search_hub import chrome` | Import from Chrome/Chromium | +| `search_hub import zen` | Import from Zen Browser | +| `search_hub search "query"` | Search bookmarks from the terminal | +| `search_hub list` | List all bookmarks | +| `search_hub insert "Title" "https://..."` | Add a bookmark (fetches content, auto-tags) | +| `search_hub remove --id 1` | Delete a bookmark by ID | +| `search_hub retag --all` | Re-run auto-tagging on all bookmarks | +| `search_hub init-config` | Create a default config file at `~/.config/search_hub/config.toml` | -## Systemd 
User Timer (Automatic Imports) +All commands use `~/.local/share/search_hub/bookmarks.db` by default. Override with `--db-path` or set `db_path` in the config file. -Periodically import bookmarks from Zen Browser via a systemd user timer. +The first time you use a search or insert command, SearchHub downloads an ONNX embedding model to `.fastembed_cache/` in the project directory (about 30 MB). -### Setup +## Configuration -1. **Build the release binary** (if not already done): +Run `search_hub init-config` to create `~/.config/search_hub/config.toml` with all available options commented out. Or create it manually: - ``` - cargo build --release - ``` +```toml +# Bookmark database path (default: platform data directory) +# db_path = "/home/you/.local/share/search_hub/bookmarks.db" -2. **Install the binary** to `~/.cargo/bin/`: +# Custom tags override the built-in defaults +# [[tags]] +# name = "my-custom-tag" +# examples = ["example text one", "example text two"] - ``` - cargo install --path . - ``` +# Which external search engines to use (default: ["crates.io"]) +# enabled_engines = ["crates.io"] -3. **Copy the unit files** to the systemd user directory: +# Minimum confidence for auto-tagging (0.0 to 1.0, default: 0.6) +# tagging_threshold = 0.6 - ``` - mkdir -p ~/.config/systemd/user/ - cp contrib/search-hub-import.service ~/.config/systemd/user/ - cp contrib/search-hub-import.timer ~/.config/systemd/user/ - ``` +# Hosts to skip when fetching content for bookmarking (default: local addresses) +# exclude_urls = ["localhost", "127.0.0.1", "::1"] -4. **Reload systemd** and enable the timer: +# Per-engine configuration (optional) +# [engines.searxng] +# instance = "https://search.kael.ink" +# Best: use an existing public instance (see https://searx.space). 
+# Also possible: run your own with Docker: +# docker run -d --name searxng -p 8888:8080 searxng/searxng +``` + +## Run the web server as a systemd user service + +Keeps the web UI running in the background, starts automatically on login. - ``` - systemctl --user daemon-reload - systemctl --user enable --now search-hub-import.timer - ``` +```sh +cp contrib/search-hub-web.service ~/.config/systemd/user/ +systemctl --user daemon-reload +systemctl --user enable --now search-hub-web.service +``` + +Check status with `systemctl --user status search-hub-web`. View logs with `journalctl --user -u search-hub-web -f`. -5. **Verify** the timer is active: +## Auto-import with systemd (Zen Browser) - ``` - systemctl --user list-timers --all | grep search-hub - ``` +```sh +cp contrib/search-hub-import.service ~/.config/systemd/user/ +cp contrib/search-hub-import.timer ~/.config/systemd/user/ +systemctl --user daemon-reload +systemctl --user enable --now search-hub-import.timer +``` -The import runs daily and stores bookmarks in `~/.local/share/search_hub/bookmarks.db`. To customize the schedule or source, edit the user copies of the unit files and run `systemctl --user daemon-reload`. +This imports bookmarks from Zen Browser daily. -
modified build.rs
diff --git a/build.rs b/build.rs index 2e1e450..7049665 100644 --- a/build.rs +++ b/build.rs @@ -5,4 +5,14 @@ fn main() { .add_raw_template("index.html", &template) .expect("Failed to parse template"); println!("cargo:rerun-if-changed=templates/index.html"); + + let hash = std::process::Command::new("git") + .args(["rev-parse", "--short", "HEAD"]) + .output() + .ok() + .and_then(|o| String::from_utf8(o.stdout).ok()) + .map(|s| s.trim().to_string()) + .unwrap_or_else(|| "unknown".into()); + println!("cargo:rustc-env=SEARCH_HUB_GIT_HASH={}", hash); + println!("cargo:rerun-if-changed=.git/HEAD"); } -
added contrib/logo.svg
diff --git a/contrib/logo.svg b/contrib/logo.svg new file mode 100644 index 0000000..d989702 --- /dev/null +++ b/contrib/logo.svg @@ -0,0 +1,38 @@ +<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" fill="none"> + <defs> + <linearGradient id="g" x1="0" y1="0" x2="1" y2="1"> + <stop offset="0%" stop-color="#fbbf24"/> + <stop offset="100%" stop-color="#d97706"/> + </linearGradient> + </defs> + + <!-- Lens circle --> + <circle cx="220" cy="220" r="160" stroke="url(#g)" stroke-width="32" fill="none"/> + + <!-- Open book inside lens --> + <g transform="translate(220,220)"> + <!-- Left page --> + <path d="M-70,-50 Q-30,-60 0,-50 L0,50 Q-30,60 -70,50 Z" fill="#fbbf24" opacity="0.9"/> + <!-- Right page --> + <path d="M0,-50 Q30,-60 70,-50 L70,50 Q30,60 0,50 Z" fill="#d97706" opacity="0.9"/> + <!-- Spine highlight --> + <line x1="0" y1="-50" x2="0" y2="50" stroke="#faf6f0" stroke-width="4" opacity="0.6"/> + + <!-- Text lines on left page --> + <path d="M-55,-30 Q-45,-34 -35,-30 Q-25,-26 -15,-30" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M-55,-15 Q-45,-19 -35,-15 Q-25,-11 -15,-15" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M-55,0 Q-45,-4 -35,0 Q-25,4 -15,0" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M-55,15 Q-45,11 -35,15 Q-25,19 -15,15" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M-50,30 Q-40,26 -30,30 Q-20,34 -10,30" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + + <!-- Text lines on right page --> + <path d="M15,-30 Q25,-34 35,-30 Q45,-26 55,-30" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M15,-15 Q25,-19 35,-15 Q45,-11 55,-15" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M15,0 Q25,-4 35,0 Q45,4 55,0" 
stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M15,15 Q25,11 35,15 Q45,19 55,15" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + <path d="M10,30 Q20,26 30,30 Q40,34 50,30" stroke="#1c1917" stroke-width="4" stroke-linecap="round" fill="none" opacity="0.4"/> + </g> + + <!-- Handle --> + <line x1="340" y1="340" x2="440" y2="440" stroke="url(#g)" stroke-width="32" stroke-linecap="round"/> +</svg> -
added contrib/search-hub-web.service
diff --git a/contrib/search-hub-web.service b/contrib/search-hub-web.service new file mode 100644 index 0000000..03f08ff --- /dev/null +++ b/contrib/search-hub-web.service @@ -0,0 +1,12 @@ +[Unit] +Description=SearchHub web server +After=network.target + +[Service] +Type=simple +ExecStart=%h/.cargo/bin/search_hub serve --db-path %h/.local/share/search_hub/bookmarks.db +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=default.target -
added src/config.rs
diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..4a1c1c1 --- /dev/null +++ b/src/config.rs @@ -0,0 +1,275 @@ +use figment::Figment; +use figment::providers::{Format, Toml}; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::path::PathBuf; + +/// A single search engine definition with an optional CSS selector for +/// inline result extraction. +/// +/// When `selector` is `Some`, the search handler uses `scraper` to find +/// that container in the engine's HTML and extract `<a>` links from it. +/// Engines without a selector are skipped for inline extraction. +/// +/// # Example +/// +/// ```rust +/// use search_hub::config::ForwarderDef; +/// +/// let ddg = ForwarderDef { +/// id: "duckduckgo".into(), +/// name: "DuckDuckGo".into(), +/// url: "https://duckduckgo.com/?q={}".into(), +/// selector: Some("article[data-testid='result']".into()), +/// }; +/// assert_eq!(ddg.id, "duckduckgo"); +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ForwarderDef { + /// URL query parameter identifier (e.g. "duckduckgo"). + pub id: String, + /// Display name (e.g. "DuckDuckGo"). + pub name: String, + /// URL template with `{}` placeholder for the query string. + pub url: String, + /// CSS selector for the result container in the engine's HTML page. + /// Used for inline result extraction; `None` skips inline extraction. + #[serde(default)] + pub selector: Option<String>, +} + +/// Application configuration loaded from the TOML config file. +/// +/// Supports `[[tags]]`, `enabled_engines`, `tagging_threshold`, and `[engines.*]`. +/// +/// # Example +/// +/// ```ignore +/// let cfg = search_hub::config::Config::load(); +/// if cfg.tags.is_empty() { +/// println!("using default tags"); +/// } +/// let engines = cfg.resolve_engines(); +/// println!("{} engines enabled", engines.len()); +/// ``` +#[derive(Debug, Deserialize)] +pub struct Config { + /// Custom tag definitions. 
If non-empty, these replace the hardcoded defaults. + #[serde(default)] + pub tags: Vec<crate::tagging::TagDef>, + /// List of engine IDs to enable for inline search results. + /// If `None`, all engines from `search_engines::default_search_engines()` are used. + #[serde(default)] + pub enabled_engines: Option<Vec<String>>, + /// Tagging threshold (0.0 to 1.0). Tags with a score below this are + /// discarded. Defaults to 0.60 if not set. + #[serde(default)] + pub tagging_threshold: Option<f64>, + /// Hostnames to exclude from content fetching during import. + /// Defaults to localhost addresses if not set. + #[serde(default)] + pub exclude_urls: Option<Vec<String>>, + /// Per-engine configuration, keyed by engine ID. + /// For example: `[engines.searxng]` with `instance = "https://..."`. + #[serde(default)] + pub engines: Option<HashMap<String, toml::Table>>, + /// Default bookmark database path. Overrides the platform default. + #[serde(default)] + pub db_path: Option<String>, +} + +impl Config { + /// Load configuration from the default config file path. + /// + /// Returns a default (empty) `Config` if the file doesn't exist or can't be parsed. + /// Parse errors are printed to stderr. + /// + /// # Example + /// + /// ```ignore + /// let cfg = search_hub::config::Config::load(); + /// ``` + pub fn load() -> Self { + Self::load_from(&config_file_path()) + } + + /// Load configuration from a specific file path. + /// + /// Returns a default (empty) `Config` if the file doesn't exist or can't be parsed. + /// Parse errors are printed to stderr. 
+ /// + /// # Example + /// + /// ```ignore + /// let cfg = search_hub::config::Config::load_from(&PathBuf::from("/tmp/test.toml")); + /// ``` + pub fn load_from(path: &PathBuf) -> Self { + if path.exists() { + Figment::new() + .merge(Toml::file(path)) + .extract() + .unwrap_or_else(|e| { + eprintln!("Warning: failed to parse config file {:?}: {}", path, e); + Config::default() + }) + } else { + Config::default() + } + } +} + +impl Default for Config { + fn default() -> Self { + Config { + tags: Vec::new(), + enabled_engines: None, + tagging_threshold: None, + exclude_urls: None, + engines: None, + db_path: None, + } + } +} + +impl Config { + /// Resolve the list of enabled search engines. + /// + /// Default engines (`crates.io`) are included unless filtered by + /// `enabled_engines`. Engines with configuration in the `engines` map + /// (e.g. `searxng`) are added subject to the same filter. + pub fn resolve_engines(&self) -> Vec<Box<dyn crate::search_engines::SearchEngine>> { + let is_enabled = |id: &str| -> bool { + self.enabled_engines + .as_ref() + .map(|enabled| enabled.iter().any(|e| e == id)) + .unwrap_or(true) + }; + + let mut engines: Vec<Box<dyn crate::search_engines::SearchEngine>> = Vec::new(); + + for e in crate::search_engines::default_search_engines() { + if is_enabled(e.id()) { + engines.push(e); + } + } + + if let Some(ref configs) = self.engines { + for (id, config) in configs { + if !is_enabled(id) { + continue; + } + match id.as_str() { + "searxng" => { + if let Some(engine) = crate::search_engines::searxng::SearXng::from_config(config) { + engines.push(engine); + } + } + _ => {} + } + } + } + + engines + } +} + +/// Return the expected config file path (e.g. `~/.config/search_hub/config.toml` on Linux). +/// +/// # Example +/// +/// ```ignore +/// let path = search_hub::config::config_file_path(); +/// ``` +/// +/// # Panics +/// +/// Panics if the platform has no valid config directory. 
+pub fn config_file_path() -> PathBuf { + let dirs = directories::ProjectDirs::from("com", "search_hub", "search_hub") + .expect("no valid config directory"); + let config_dir = dirs.config_dir(); + config_dir.join("config.toml") +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::NamedTempFile; + use std::io::Write; + + #[test] + fn load_from_missing_file_returns_default() { + let cfg = Config::load_from(&PathBuf::from("/nonexistent/path.toml")); + assert!(cfg.tags.is_empty()); + assert!(cfg.engines.is_none()); + } + + #[test] + fn load_from_valid_file_with_engines() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, r#" +[engines.searxng] +instance = "https://search.example.com" +"#).unwrap(); + + let cfg = Config::load_from(&file.path().to_path_buf()); + let engines = cfg.engines.unwrap(); + assert!(engines.contains_key("searxng")); + let searxng = &engines["searxng"]; + assert_eq!(searxng.get("instance").unwrap().as_str(), Some("https://search.example.com")); + } + + #[test] + fn resolve_engines_includes_searxng_from_engines_map() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, r#" +[engines.searxng] +instance = "https://search.example.com" +"#).unwrap(); + + let cfg = Config::load_from(&file.path().to_path_buf()); + let engines = cfg.resolve_engines(); + assert!(engines.iter().any(|e| e.id() == "searxng")); + } + + #[test] + fn resolve_engines_filters_with_enabled_list() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, r#" +enabled_engines = ["crates.io"] +[engines.searxng] +instance = "https://search.example.com" +"#).unwrap(); + + let cfg = Config::load_from(&file.path().to_path_buf()); + let engines = cfg.resolve_engines(); + // searxng should be excluded because it's not in enabled_engines + assert!(!engines.iter().any(|e| e.id() == "searxng")); + // crates.io should be included + assert!(engines.iter().any(|e| e.id() == "crates.io")); + } + + #[test] + fn 
resolve_engines_includes_searxng_when_in_enabled_list() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, r#" +enabled_engines = ["crates.io", "searxng"] +[engines.searxng] +instance = "https://search.example.com" +"#).unwrap(); + + let cfg = Config::load_from(&file.path().to_path_buf()); + let engines = cfg.resolve_engines(); + assert!(engines.iter().any(|e| e.id() == "crates.io")); + assert!(engines.iter().any(|e| e.id() == "searxng")); + } + + #[test] + fn parse_error_returns_default() { + let mut file = NamedTempFile::new().unwrap(); + write!(file, "invalid toml [[[").unwrap(); + let cfg = Config::load_from(&file.path().to_path_buf()); + assert!(cfg.tags.is_empty()); + assert!(cfg.engines.is_none()); + } +} -
added src/importer/chrome.rs
diff --git a/src/importer/chrome.rs b/src/importer/chrome.rs new file mode 100644 index 0000000..da2940a --- /dev/null +++ b/src/importer/chrome.rs @@ -0,0 +1,299 @@ +use crate::importer::{open_backup_file, Importer}; +use crate::models::Bookmark; +use chrono::{DateTime, TimeZone, Utc}; +use serde::Deserialize; +use std::path::{Path, PathBuf}; + +/// Chrome/Chromium importer. +/// +/// Parses the `Bookmarks` JSON file from the browser profile directory. +/// Profiles are discovered in `~/.config/google-chrome/` and +/// `~/.config/chromium/`. +pub struct ChromeImporter; + +impl Importer for ChromeImporter { + fn name(&self) -> &'static str { + "Chrome/Chromium" + } + + fn discover_profiles(&self) -> Vec<PathBuf> { + let mut profiles = Vec::new(); + for base in ["google-chrome", "chromium"] { + if let Some(dir) = home_dir().map(|p| p.join(".config").join(base)) { + if let Ok(entries) = std::fs::read_dir(dir) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + let name = path.file_name().and_then(|n| n.to_str()); + if name == Some("Default") || name.map_or(false, |n| n.starts_with("Profile ")) { + if path.join("Bookmarks").exists() { + profiles.push(path); + } + } + } + } + } + } + } + profiles + } + + fn import(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + let bookmarks_path = profile_path.join("Bookmarks"); + if !bookmarks_path.exists() { + anyhow::bail!("Bookmarks file not found in {:?}", profile_path); + } + + let content = std::fs::read_to_string(&bookmarks_path)?; + let root: ChromeBookmarks = serde_json::from_str(&content)?; + + let mut results = Vec::new(); + collect_children(&root.roots.bookmark_bar.children, &mut results, 0); + collect_children(&root.roots.other.children, &mut results, 0); + collect_children(&root.roots.synced.children, &mut results, 0); + + Ok(results) + } + + fn import_history(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + let conn = open_backup_file(profile_path, 
"History")?; + + let mut stmt = conn.prepare( + "SELECT id, url, COALESCE(title, ''), last_visit_time + FROM urls + WHERE hidden = 0 + ORDER BY last_visit_time DESC", + )?; + + let entries = stmt + .query_map([], |row| { + let id: i32 = row.get(0)?; + let url: String = row.get(1)?; + let title: String = row.get(2)?; + let chrome_time: i64 = row.get(3)?; + + let created_at = chrome_time_i64_to_datetime(chrome_time) + .unwrap_or_else(|| DateTime::from_timestamp(0, 0).unwrap()); + + Ok(Bookmark { + id, + title, + url, + description: None, + source: "history".into(), + content: None, + tags: None, + created_at, + }) + })? + .collect::<Result<Vec<_>, _>>()?; + + Ok(entries) + } +} + +#[derive(Deserialize)] +struct ChromeBookmarks { + roots: ChromeRoots, +} + +#[derive(Deserialize)] +struct ChromeRoots { + bookmark_bar: ChromeFolder, + other: ChromeFolder, + synced: ChromeFolder, +} + +#[derive(Deserialize)] +struct ChromeFolder { + children: Vec<ChromeNode>, +} + +#[derive(Deserialize)] +struct ChromeNode { + #[serde(rename = "type")] + node_type: String, + name: String, + url: Option<String>, + #[serde(default)] + children: Vec<ChromeNode>, + date_added: Option<String>, +} + +fn chrome_time_to_datetime(chrome_time: &str) -> Option<DateTime<Utc>> { + let micros: i64 = chrome_time.parse().ok()?; + chrome_time_i64_to_datetime(micros) +} + +fn chrome_time_i64_to_datetime(micros: i64) -> Option<DateTime<Utc>> { + // Chrome time is microseconds since 1601-01-01 (Windows epoch). + // Unix epoch is 1970-01-01, offset by 11644473600 seconds. 
+ let unix_secs = (micros / 1_000_000).checked_sub(11644473600)?; + Utc.timestamp_opt(unix_secs, 0).single() +} + +fn collect_children(nodes: &[ChromeNode], out: &mut Vec<Bookmark>, next_id: i32) -> i32 { + let mut id = next_id; + for node in nodes { + if node.node_type == "url" { + if let Some(ref url) = node.url { + id += 1; + let created_at = node + .date_added + .as_ref() + .and_then(|d| chrome_time_to_datetime(d)) + .unwrap_or_else(|| { + DateTime::from_timestamp(0, 0).unwrap() + }); + out.push(Bookmark { + id, + title: node.name.clone(), + url: url.clone(), + description: None, + source: "bookmark".into(), + content: None, + tags: None, + created_at, + }); + } + } else if node.node_type == "folder" { + id = collect_children(&node.children, out, id); + } + } + id +} + +fn home_dir() -> Option<PathBuf> { + std::env::var("HOME") + .ok() + .map(PathBuf::from) + .or_else(|| { + if cfg!(target_os = "windows") { + std::env::var("USERPROFILE").ok().map(PathBuf::from) + } else { + None + } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + fn chrome_bookmarks_json() -> String { + r#"{ + "checksum": "abc", + "roots": { + "bookmark_bar": { + "children": [ + { + "date_added": "13249872340500000", + "guid": "a", + "id": "1", + "name": "Rust Lang", + "type": "url", + "url": "https://rust-lang.org" + }, + { + "date_added": "13249872341500000", + "guid": "b", + "id": "2", + "name": "My Folder", + "type": "folder", + "children": [ + { + "date_added": "13249872342500000", + "guid": "c", + "id": "3", + "name": "Nested Bookmark", + "type": "url", + "url": "https://example.com/nested" + } + ] + } + ], + "date_added": "13249872340000000", + "date_modified": "13249872340000000", + "guid": "root", + "id": "0", + "name": "Bookmarks Bar", + "type": "folder" + }, + "other": { + "children": [ + { + "date_added": "13249872343500000", + "guid": "d", + "id": "4", + "name": "Other Bookmark", + "type": "url", + "url": "https://other.com" + } + ], + "date_added": 
"13249872340000000", + "date_modified": "13249872340000000", + "guid": "other", + "id": "0", + "name": "Other Bookmarks", + "type": "folder" + }, + "synced": { + "children": [], + "date_added": "13249872340000000", + "date_modified": "13249872340000000", + "guid": "synced", + "id": "0", + "name": "Synced Bookmarks", + "type": "folder" + } + }, + "version": 1 + }"#.to_string() + } + + #[test] + fn parse_chrome_bookmarks_json() { + let root: ChromeBookmarks = serde_json::from_str(&chrome_bookmarks_json()).expect("parse"); + assert_eq!(root.roots.bookmark_bar.children.len(), 2); + assert_eq!(root.roots.other.children.len(), 1); + assert!(root.roots.synced.children.is_empty()); + } + + #[test] + fn chrome_time_conversion() { + let dt = chrome_time_to_datetime("13249872340500000").expect("convert"); + // 13249872340500000 micros since 1601-01-01 = 2020-11-15T00:05:40Z + let expected = Utc.with_ymd_and_hms(2020, 11, 15, 0, 5, 40).unwrap(); + assert_eq!(dt, expected); + } + + #[test] + fn import_from_bookmarks_file() -> anyhow::Result<()> { + let dir = tempfile::tempdir()?; + let path = dir.path().join("Bookmarks"); + let mut f = std::fs::File::create(&path)?; + f.write_all(chrome_bookmarks_json().as_bytes())?; + + let importer = ChromeImporter; + let bookmarks = importer.import(dir.path())?; + + assert_eq!(bookmarks.len(), 3); + assert_eq!(bookmarks[0].title, "Rust Lang"); + assert_eq!(bookmarks[0].url, "https://rust-lang.org"); + assert_eq!(bookmarks[1].title, "Nested Bookmark"); + assert_eq!(bookmarks[1].url, "https://example.com/nested"); + assert_eq!(bookmarks[2].title, "Other Bookmark"); + assert_eq!(bookmarks[2].url, "https://other.com"); + + Ok(()) + } + + #[test] + fn error_on_missing_bookmarks_file() { + let dir = tempfile::tempdir().unwrap(); + let importer = ChromeImporter; + let result = importer.import(dir.path()); + assert!(result.is_err()); + } +} -
modified src/importer/firefox.rs
diff --git a/src/importer/firefox.rs b/src/importer/firefox.rs index 5b02859..6b5f21c 100644 --- a/src/importer/firefox.rs +++ b/src/importer/firefox.rs @@ -1,6 +1,6 @@ -use crate::importer::{open_backup, Importer}; +use crate::importer::gecko; +use crate::importer::Importer; use crate::models::Bookmark; -use chrono::DateTime; use std::path::{Path, PathBuf}; pub struct FirefoxImporter; @@ -11,74 +11,14 @@ impl Importer for FirefoxImporter { } fn discover_profiles(&self) -> Vec<PathBuf> { - let mut profiles = Vec::new(); - let mozilla_dir = home_dir().map(|p| p.join(".mozilla").join("firefox")); - - if let Some(dir) = mozilla_dir { - if let Ok(entries) = std::fs::read_dir(dir) { - for entry in entries.flatten() { - let path = entry.path(); - if path.is_dir() - && path - .file_name() - .and_then(|n| n.to_str()) - .map_or(false, |n| n.contains(".default")) - { - profiles.push(path); - } - } - } - } - - profiles + gecko::discover_profiles(".mozilla/firefox") } fn import(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { - let conn = open_backup(profile_path)?; - - let mut stmt = conn.prepare( - "SELECT b.id, COALESCE(b.title, p.title, ''), p.url, p.description, - CAST(b.dateAdded AS INTEGER) / 1000000 - FROM moz_bookmarks b - JOIN moz_places p ON b.fk = p.id - WHERE b.type = 1", - )?; - - let bookmarks = stmt - .query_map([], |row| { - let id: i32 = row.get(0)?; - let title: String = row.get(1)?; - let url: String = row.get(2)?; - let description: Option<String> = row.get(3)?; - let timestamp: i64 = row.get(4)?; - - let created_at = DateTime::from_timestamp(timestamp, 0) - .unwrap_or(DateTime::from_timestamp(0, 0).unwrap()); - - Ok(Bookmark { - id, - title, - url, - description: description.filter(|d| !d.is_empty()), - content: None, - created_at, - }) - })? 
- .collect::<Result<Vec<_>, _>>()?; - - Ok(bookmarks) + gecko::query_bookmarks(profile_path) } -} -fn home_dir() -> Option<PathBuf> { - std::env::var("HOME") - .ok() - .map(PathBuf::from) - .or_else(|| { - if cfg!(target_os = "windows") { - std::env::var("USERPROFILE").ok().map(PathBuf::from) - } else { - None - } - }) + fn import_history(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + gecko::query_history(profile_path) + } } -
added src/importer/gecko.rs
diff --git a/src/importer/gecko.rs b/src/importer/gecko.rs new file mode 100644 index 0000000..98a464e --- /dev/null +++ b/src/importer/gecko.rs @@ -0,0 +1,112 @@ +use crate::importer::open_backup; +use crate::models::Bookmark; +use chrono::DateTime; +use std::path::{Path, PathBuf}; + +pub fn query_bookmarks(profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + let conn = open_backup(profile_path)?; + + let mut stmt = conn.prepare( + "SELECT b.id, COALESCE(b.title, p.title, ''), p.url, p.description, + CAST(b.dateAdded AS INTEGER) / 1000000 + FROM moz_bookmarks b + JOIN moz_places p ON b.fk = p.id + WHERE b.type = 1", + )?; + + let bookmarks = stmt + .query_map([], |row| { + let id: i32 = row.get(0)?; + let title: String = row.get(1)?; + let url: String = row.get(2)?; + let description: Option<String> = row.get(3)?; + let timestamp: i64 = row.get(4)?; + + let created_at = DateTime::from_timestamp(timestamp, 0) + .unwrap_or(DateTime::from_timestamp(0, 0).unwrap()); + + Ok(Bookmark { + id, + title, + url, + description: description.filter(|d| !d.is_empty()), + source: "bookmark".into(), + content: None, + tags: None, + created_at, + }) + })? 
+ .collect::<Result<Vec<_>, _>>()?; + + Ok(bookmarks) +} + +pub fn query_history(profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + let conn = open_backup(profile_path)?; + + let mut stmt = conn.prepare( + "SELECT DISTINCT p.id, COALESCE(p.title, ''), p.url, p.description, + CAST(COALESCE(p.last_visit_date, 0) AS INTEGER) / 1000000 + FROM moz_places p + JOIN moz_historyvisits v ON p.id = v.place_id + WHERE p.hidden = 0 + ORDER BY p.last_visit_date DESC", + )?; + + let entries = stmt + .query_map([], |row| { + let id: i32 = row.get(0)?; + let title: String = row.get(1)?; + let url: String = row.get(2)?; + let description: Option<String> = row.get(3)?; + let timestamp: i64 = row.get(4)?; + + let created_at = DateTime::from_timestamp(timestamp, 0) + .unwrap_or(DateTime::from_timestamp(0, 0).unwrap()); + + Ok(Bookmark { + id, + title, + url, + description: description.filter(|d| !d.is_empty()), + source: "history".into(), + content: None, + tags: None, + created_at, + }) + })? + .collect::<Result<Vec<_>, _>>()?; + + Ok(entries) +} + +pub fn discover_profiles(base_dir: &str) -> Vec<PathBuf> { + let mut profiles = Vec::new(); + let dir = home_dir().map(|p| p.join(base_dir)); + + if let Some(dir) = dir { + if let Ok(entries) = std::fs::read_dir(dir) { + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() && path.join("places.sqlite").exists() { + profiles.push(path); + } + } + } + } + + profiles +} + +fn home_dir() -> Option<PathBuf> { + std::env::var("HOME") + .ok() + .map(PathBuf::from) + .or_else(|| { + if cfg!(target_os = "windows") { + std::env::var("USERPROFILE").ok().map(PathBuf::from) + } else { + None + } + }) +} -
modified src/importer/mod.rs
diff --git a/src/importer/mod.rs b/src/importer/mod.rs index ba1a2cb..7ec3548 100644 --- a/src/importer/mod.rs +++ b/src/importer/mod.rs @@ -1,37 +1,131 @@ +pub mod chrome; pub mod firefox; +pub mod gecko; pub mod zen; use crate::models::Bookmark; use rusqlite::{Connection, OpenFlags}; use std::path::{Path, PathBuf}; +/// Trait for browser-specific importers. +/// +/// Implementations discover browser profiles and parse their bookmark stores +/// and/or history into `Vec<Bookmark>`. +/// +/// # Example +/// +/// ```ignore +/// use search_hub::importer::Importer; +/// use search_hub::importer::firefox::FirefoxImporter; +/// +/// let importer = FirefoxImporter; +/// let profiles = importer.discover_profiles(); +/// if let Some(path) = profiles.first() { +/// let bookmarks = importer.import(path).expect("import"); +/// } +/// ``` pub trait Importer { + /// Human-readable name (e.g. "firefox", "zen"). fn name(&self) -> &'static str; + /// Return paths to all detected browser profile directories. fn discover_profiles(&self) -> Vec<PathBuf>; + /// Parse bookmarks from a profile directory. + /// + /// # Parameters + /// + /// * `profile_path` - Path to the browser profile directory. + /// + /// # Returns + /// + /// A `Vec<Bookmark>` with `source = "bookmark"`. fn import(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>>; + /// Parse browser history from a profile directory. + /// + /// The default implementation returns an empty vec. Override this if the + /// browser stores history (Firefox, Zen, Chrome/Chromium). + /// + /// # Parameters + /// + /// * `profile_path` - Path to the browser profile directory. + /// + /// # Returns + /// + /// A `Vec<Bookmark>` with `source = "history"`. + fn import_history(&self, _profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + Ok(Vec::new()) + } } /// Copy `places.sqlite` from a browser profile to a temp file and open it /// read-only. This avoids SQLITE_BUSY when the browser has the database open. 
+/// +/// # Example +/// +/// ```ignore +/// use search_hub::importer::open_backup; +/// use std::path::Path; +/// +/// let conn = open_backup(Path::new("/path/to/profile")) +/// .expect("backup failed"); +/// ``` +/// +/// # Parameters +/// +/// * `profile_path` - Path to the browser profile directory containing +/// `places.sqlite`. +/// +/// # Returns +/// +/// A read-only `rusqlite::Connection` to the temporary copy. +/// +/// # Errors +/// +/// Returns an error (via `anyhow`) if `places.sqlite` is not found or +/// cannot be copied. pub fn open_backup(profile_path: &Path) -> anyhow::Result<Connection> { - let src = profile_path.join("places.sqlite"); + open_backup_file(profile_path, "places.sqlite") +} + +/// Copy an arbitrary SQLite file from a browser profile to a temp file and +/// open it read-only. This avoids SQLITE_BUSY when the browser has the +/// database open. +/// +/// # Example +/// +/// ```ignore +/// use search_hub::importer::open_backup_file; +/// use std::path::Path; +/// +/// let conn = open_backup_file(Path::new("/path/to/profile"), "History") +/// .expect("backup failed"); +/// ``` +/// +/// # Parameters +/// +/// * `profile_path` - Path to the browser profile directory. +/// * `filename` - Name of the SQLite file inside the profile directory +/// (e.g. `"History"`, `"places.sqlite"`). +/// +/// # Returns +/// +/// A read-only `rusqlite::Connection` to the temporary copy. +/// +/// # Errors +/// +/// Returns an error if the file is not found or cannot be copied.
+pub fn open_backup_file(profile_path: &Path, filename: &str) -> anyhow::Result<Connection> { + let src = profile_path.join(filename); if !src.exists() { - anyhow::bail!("places.sqlite not found in {:?}", profile_path); + anyhow::bail!("{} not found in {:?}", filename, profile_path); } let tmp = tempfile::Builder::new() - .suffix(".places.sqlite") + .suffix(&format!(".{}", filename)) .tempfile()?; let tmp_path = tmp.path().to_owned(); std::fs::copy(&src, &tmp_path)?; let conn = Connection::open_with_flags(&tmp_path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; - // Keep `tmp` alive so the temp file isn't deleted until the Connection drops. - // - // Safety: the temporary file lives for the same lifetime as `conn` – both - // are owned by this function's caller via the returned Connection. The - // `tmp` handle on the stack is **forgotten** so its destructor does not - // run; the file is cleaned up by the OS once the last handle is closed. std::mem::forget(tmp); Ok(conn) } -
modified src/importer/zen.rs
diff --git a/src/importer/zen.rs b/src/importer/zen.rs index 3575e72..1e19526 100644 --- a/src/importer/zen.rs +++ b/src/importer/zen.rs @@ -1,6 +1,6 @@ -use crate::importer::{open_backup, Importer}; +use crate::importer::gecko; +use crate::importer::Importer; use crate::models::Bookmark; -use chrono::DateTime; use std::path::{Path, PathBuf}; pub struct ZenImporter; @@ -11,74 +11,14 @@ impl Importer for ZenImporter { } fn discover_profiles(&self) -> Vec<PathBuf> { - let mut profiles = Vec::new(); - let zen_dir = home_dir().map(|p| p.join(".zen")); - - if let Some(dir) = zen_dir { - if let Ok(entries) = std::fs::read_dir(dir) { - for entry in entries.flatten() { - let path = entry.path(); - if path.is_dir() - && path - .file_name() - .and_then(|n| n.to_str()) - .map_or(false, |n| n.contains(".default")) - { - profiles.push(path); - } - } - } - } - - profiles + gecko::discover_profiles(".zen") } fn import(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { - let conn = open_backup(profile_path)?; - - let mut stmt = conn.prepare( - "SELECT b.id, COALESCE(b.title, p.title, ''), p.url, p.description, - CAST(b.dateAdded AS INTEGER) / 1000000 - FROM moz_bookmarks b - JOIN moz_places p ON b.fk = p.id - WHERE b.type = 1", - )?; - - let bookmarks = stmt - .query_map([], |row| { - let id: i32 = row.get(0)?; - let title: String = row.get(1)?; - let url: String = row.get(2)?; - let description: Option<String> = row.get(3)?; - let timestamp: i64 = row.get(4)?; - - let created_at = DateTime::from_timestamp(timestamp, 0) - .unwrap_or(DateTime::from_timestamp(0, 0).unwrap()); - - Ok(Bookmark { - id, - title, - url, - description: description.filter(|d| !d.is_empty()), - content: None, - created_at, - }) - })? 
- .collect::<Result<Vec<_>, _>>()?; - - Ok(bookmarks) + gecko::query_bookmarks(profile_path) } -} -fn home_dir() -> Option<PathBuf> { - std::env::var("HOME") - .ok() - .map(PathBuf::from) - .or_else(|| { - if cfg!(target_os = "windows") { - std::env::var("USERPROFILE").ok().map(PathBuf::from) - } else { - None - } - }) + fn import_history(&self, profile_path: &Path) -> anyhow::Result<Vec<Bookmark>> { + gecko::query_history(profile_path) + } } -
modified src/lib.rs
diff --git a/src/lib.rs b/src/lib.rs index c7e7801..f6fcf48 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,7 @@ +pub mod config; pub mod importer; pub mod models; +pub mod search_engines; pub mod storage; +pub mod tagging; pub mod web; -
modified src/main.rs
diff --git a/src/main.rs b/src/main.rs index 0f73245..ec4d473 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,84 @@ use clap::{Parser, Subcommand}; use indicatif::{ProgressBar, ProgressStyle}; +use search_hub::config::{config_file_path, Config}; +use search_hub::search_engines::SearchEngine; +use search_hub::importer::chrome::ChromeImporter; use search_hub::importer::firefox::FirefoxImporter; use search_hub::importer::zen::ZenImporter; use search_hub::importer::Importer; use search_hub::models::Bookmark; use search_hub::storage; +use search_hub::tagging::{default_tags, TagDef, TaggingEngine}; use search_hub::web; use chrono::{Local, TimeZone, Utc}; +use colored::Colorize; +use robotstxt::DefaultMatcher; +use std::collections::{HashMap, VecDeque}; use std::path::PathBuf; +use std::sync::{Arc, Mutex}; use tracing::{error, info}; +const USER_AGENT: &str = concat!("search_hub/", env!("CARGO_PKG_VERSION")); + +struct Fetcher { + client: reqwest::Client, + rt: tokio::runtime::Runtime, + robots_cache: Mutex<HashMap<String, String>>, +} + +impl Fetcher { + fn new() -> anyhow::Result<Self> { + let client = reqwest::Client::builder() + .user_agent(USER_AGENT) + .build()?; + let rt = tokio::runtime::Runtime::new()?; + Ok(Self { client, rt, robots_cache: Mutex::new(HashMap::new()) }) + } + + fn fetch(&mut self, url: &str) -> Result<String, String> { + self.rt.block_on(self.fetch_async(url)) + } + + async fn fetch_async(&self, url: &str) -> Result<String, String> { + let parsed = url::Url::parse(url).map_err(|e| format!("invalid url: {}", e))?; + let domain = parsed.host_str().unwrap_or("").to_string(); + + { + let mut cache = self.robots_cache.lock().unwrap(); + if !cache.contains_key(&domain) { + let robots_url = format!("{}://{}/robots.txt", parsed.scheme(), domain); + let body = match self.client.get(&robots_url).send().await { + Ok(resp) => resp.text().await.unwrap_or_default(), + Err(_) => String::new(), + }; + cache.insert(domain.clone(), body); + } + } + + let 
allowed = { + let cache = self.robots_cache.lock().unwrap(); + DefaultMatcher::default() + .one_agent_allowed_by_robots(&cache[&domain], USER_AGENT, url) + }; + if !allowed { + return Err(format!("blocked by robots.txt")); + } + + let resp = self.client.get(url) + .send() + .await + .map_err(|e| format!("failed to fetch: {}", e))?; + + resp.text().await.map_err(|e| format!("failed to read body: {}", e)) + } +} + #[derive(Parser)] #[command(author, version, about)] struct Args { + #[arg(short, long, global = true)] + /// Path to config file (default: ~/.config/search_hub/config.toml) + config: Option<String>, #[command(subcommand)] command: Command, } @@ -51,25 +117,116 @@ Remove { /// Bookmark database path (default: platform data directory) db_path: Option<String>, }, -/// Import bookmarks from a browser -Import { - /// Browser to import from (e.g. "firefox") - source: String, +/// Search bookmarks from the terminal +Search { + /// FTS5 search query + query: String, + #[arg(long)] + /// Bookmark database path (default: platform data directory) + db_path: Option<String>, +}, +/// Re-run tagging on bookmarks +Retag { #[arg(short, long)] - /// Path to the browser profile directory (auto-discovered if omitted) - profile: Option<String>, + /// IDs of bookmarks to retag (comma-separated). If empty without --all, enter interactive mode. + id: Vec<i64>, + /// Retag all bookmarks that have content + #[arg(long)] + all: bool, #[arg(long)] - /// Target bookmark database (default: platform data directory) + /// Bookmark database path (default: platform data directory) db_path: Option<String>, }, +/// Import data from a browser +Import { + #[command(subcommand)] + action: ImportAction, +}, +/// Create a default config file at the default config path +InitConfig, +} + +#[derive(Subcommand)] +enum ImportAction { + /// Import bookmarks from a browser + Bookmarks { + /// Browser to import from (e.g. 
"firefox") + source: String, + #[arg(short, long)] + /// Path to the browser profile directory (auto-discovered if omitted) + profile: Option<String>, + #[arg(long)] + /// Target database path (default: platform data directory) + db_path: Option<String>, + }, + /// Import history from a browser + History { + /// Browser to import from (e.g. "firefox") + source: String, + #[arg(short, long)] + /// Path to the browser profile directory (auto-discovered if omitted) + profile: Option<String>, + #[arg(long)] + /// Target database path (default: platform data directory) + db_path: Option<String>, + }, + /// Import both bookmarks and history from a browser + All { + /// Browser to import from (e.g. "firefox") + source: String, + #[arg(short, long)] + /// Path to the browser profile directory (auto-discovered if omitted) + profile: Option<String>, + #[arg(long)] + /// Target database path (default: platform data directory) + db_path: Option<String>, + }, } -fn resolve_db_path(path: Option<String>) -> PathBuf { - path.map(PathBuf::from) - .unwrap_or_else(|| storage::default_db_path()) +fn print_bookmark(b: &Bookmark) { + let local_time = Local.from_utc_datetime(&b.created_at.naive_utc()); + let id_str = format!("#{}", b.id).bold().cyan(); + let title = b.title.bold(); + let url = b.url.dimmed(); + let time = local_time.format("%Y-%m-%d %H:%M:%S").to_string().dimmed(); + println!("{} {} {}", id_str, title, url); + if let Some(tags) = b.tags.as_ref().filter(|t| !t.is_empty()) { + let tag_parts: Vec<_> = tags.split(", ").filter_map(|t| { + let trimmed = t.trim(); + if trimmed.is_empty() { None } else { Some(format!("[{}]", trimmed).yellow().to_string()) } + }).collect(); + if !tag_parts.is_empty() { + println!(" {} {}", time, tag_parts.join(" ")); + } + } else { + println!(" {}", time); + } +} + +fn expand_path(s: &str) -> String { + let home = std::env::var("HOME").unwrap_or_default(); + let user = std::env::var("USER").unwrap_or_default(); + let s = s.replace("~", &home); + 
let s = s.replace("$HOME", &home); + s.replace("$USER", &user) } -fn main() { +fn resolve_db_path(cli_path: Option<String>, config_db_path: Option<&str>) -> PathBuf { + if let Some(p) = cli_path { + PathBuf::from(p) + } else if let Some(p) = config_db_path { + PathBuf::from(expand_path(p)) + } else { + storage::default_db_path() + } +} + +#[tokio::main] +async fn main() { + // Increase thread stack size so blocking tasks (fastembed ONNX model, + // htmd HTML conversion) don't overflow on deep call trees. + std::env::set_var("RUST_MIN_STACK", "8388608"); + let args = Args::parse(); match &args.command { @@ -86,135 +243,621 @@ fn main() { } } + let config_path = args.config.as_ref().map(PathBuf::from); + let config = match &config_path { + Some(p) => Config::load_from(p), + None => Config::load(), + }; + let engines: Vec<Box<dyn SearchEngine>> = config.resolve_engines(); + let tag_threshold: f32 = config.tagging_threshold.map(|t| t as f32).unwrap_or(0.60); + let exclude_hosts: Vec<String> = config.exclude_urls.clone().unwrap_or_else(|| { + vec!["localhost".into(), "127.0.0.1".into(), "::1".into()] + }); + let tags: Vec<TagDef> = if config.tags.is_empty() { + default_tags() + } else { + config.tags + }; + match args.command { Command::Serve { port, db_path } => { - let db_path = resolve_db_path(db_path); + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); info!("Starting server on 127.0.0.1:{}", port); - let rt = tokio::runtime::Runtime::new().unwrap(); - if let Err(e) = rt.block_on(web::run_server(&db_path.to_string_lossy(), port)) { + if let Err(e) = web::run_server(&db_path.to_string_lossy(), port, engines).await { error!("Server error: {}", e); } } Command::Insert { title, url, description, db_path } => { - let db_path = resolve_db_path(db_path); + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); let conn = storage::init_db(&db_path.to_string_lossy()).expect("Failed to open database"); - let content = match 
reqwest::blocking::get(&url) { - Ok(resp) => { - match resp.text() { - Ok(html) => match htmd::convert(&html) { - Ok(md) => Some(md), - Err(e) => { - eprintln!("Warning: failed to convert HTML to Markdown: {}", e); - None - } - }, - Err(e) => { - eprintln!("Warning: failed to read response body: {}", e); + if url::Url::parse(&url).is_err() { + eprintln!("Error: invalid URL '{}'", url); + return; + } + + let mut fetcher = match Fetcher::new() { + Ok(f) => f, + Err(e) => { + eprintln!("Warning: failed to create HTTP client: {}", e); + let bookmark = Bookmark { + id: 0, + title, + url, + description, + source: "bookmark".into(), + content: None, + tags: None, + created_at: Utc::now(), + }; + storage::insert_bookmark(&conn, &bookmark).expect("Failed to insert bookmark"); + println!("Bookmark inserted successfully (skipped HTTP fetch)."); + return; + } + }; + + let content = if is_excluded_url(&url, &exclude_hosts) { + info!("skipping fetch for excluded URL: {}", url); + None + } else { + fetch_and_convert(&mut fetcher, &url, None) + }; + let md = content.as_ref().and_then(|c| { + info!("tagging content..."); + match TaggingEngine::new(&tags, tag_threshold) { + Ok(mut engine) => { + let tags = engine.tags_for(c, 5).unwrap_or_default(); + if tags.is_empty() { + info!("no tags matched"); None + } else { + info!("tags: {}", tags.join(", ")); + Some(tags.join(", ")) } } + Err(e) => { + eprintln!("Warning: failed to initialize tagger: {}", e); + None + } } - Err(e) => { - eprintln!("Warning: failed to fetch URL: {}", e); - None - } - }; + }); let bookmark = Bookmark { id: 0, title, url, description, + source: "bookmark".into(), content, + tags: md, created_at: Utc::now(), }; storage::insert_bookmark(&conn, &bookmark).expect("Failed to insert bookmark"); println!("Bookmark inserted successfully."); } Command::List { db_path } => { - let db_path = resolve_db_path(db_path); + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); let conn = 
storage::init_db(&db_path.to_string_lossy()).expect("Failed to open database"); - let bookmarks = storage::list_bookmarks(&conn).expect("Failed to list bookmarks"); + let bookmarks = storage::list_bookmarks(&conn, 1, 10000).expect("Failed to list bookmarks"); for b in bookmarks { - let local_time = Local.from_utc_datetime(&b.created_at.naive_utc()); - println!("{}: {} - {} (Created: {})", b.id, b.title, b.url, local_time.format("%Y-%m-%d %H:%M:%S")); + print_bookmark(&b); } } Command::Remove { id, db_path } => { - let db_path = resolve_db_path(db_path); + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); let conn = storage::init_db(&db_path.to_string_lossy()).expect("Failed to open database"); storage::delete_bookmark(&conn, id).expect("Failed to delete bookmark"); println!("Bookmark removed successfully."); } - Command::Import { source, profile, db_path } => { - let db_path = resolve_db_path(db_path); - match source.as_str() { - "firefox" => run_import(FirefoxImporter, profile, &db_path.to_string_lossy()), - "zen" => run_import(ZenImporter, profile, &db_path.to_string_lossy()), - other => { - eprintln!("Unknown browser source: '{}'. 
Supported: firefox, zen", other); - std::process::exit(1); + Command::Search { query, db_path } => { + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); + let conn = storage::init_db(&db_path.to_string_lossy()).expect("Failed to open database"); + let bookmarks = storage::search_bookmarks(&conn, &query, 1, 10000).expect("Failed to search"); + + if bookmarks.is_empty() { + println!("No results for \"{}\"", query); + return; + } + + println!("{} result{} for \"{}\":", bookmarks.len(), if bookmarks.len() == 1 { "" } else { "s" }, query); + println!(); + for b in bookmarks { + print_bookmark(&b); + } + } + Command::Retag { id, all, db_path } => { + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); + let conn = storage::init_db(&db_path.to_string_lossy()).expect("Failed to open database"); + + let mut engine = match TaggingEngine::new(&tags, tag_threshold) { + Ok(e) => e, + Err(e) => { + eprintln!("Warning: failed to initialize tagger: {}", e); + return; + } + }; + + let ids: Vec<i64> = if all { + let bookmarks = storage::list_bookmarks(&conn, 1, 10000).expect("Failed to list bookmarks"); + bookmarks.into_iter() + .filter(|b| b.content.is_some()) + .map(|b| b.id as i64) + .collect() + } else if !id.is_empty() { + id + } else { + let bookmarks = storage::list_bookmarks(&conn, 1, 10000).expect("Failed to list bookmarks"); + if bookmarks.is_empty() { + println!("No bookmarks found."); + return; + } + println!("Bookmarks:"); + for (i, b) in bookmarks.iter().enumerate() { + let local_time = Local.from_utc_datetime(&b.created_at.naive_utc()); + let time = local_time.format("%Y-%m-%d %H:%M").to_string(); + let tags = b.tags.as_deref().unwrap_or("(no tags)"); + println!(" {}. 
#{} {} {} [{}]", i + 1, b.id, b.title, time, tags); + } + println!(); + print!("Enter IDs (comma-separated) or 'all': "); + std::io::Write::flush(&mut std::io::stdout()).unwrap(); + let mut input = String::new(); + std::io::stdin().read_line(&mut input).unwrap(); + let input = input.trim(); + if input.is_empty() { + println!("No IDs entered."); + return; + } + if input == "all" { + bookmarks.into_iter() + .filter(|b| b.content.is_some()) + .map(|b| b.id as i64) + .collect() + } else { + input.split(',') + .filter_map(|s| s.trim().parse::<i64>().ok()) + .collect() + } + }; + + if ids.is_empty() { + println!("No bookmarks to retag."); + return; + } + + let pb = ProgressBar::new(ids.len() as u64); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} {msg}") + .unwrap() + .progress_chars("##-"), + ); + + let mut tagged = 0u64; + let mut skipped = 0u64; + + for rowid in &ids { + match storage::get_bookmark(&conn, *rowid) { + Ok(Some(b)) => { + match b.content { + Some(ref content) => { + match engine.tags_for(content, 5) { + Ok(tags) => { + let tags_str = if tags.is_empty() { None } else { Some(tags.join(", ")) }; + storage::update_bookmark_tags(&conn, *rowid, tags_str.as_deref()) + .unwrap_or_else(|e| eprintln!("Warning: failed to update tags: {}", e)); + tagged += 1; + pb.set_message(format!("{} tagged", "✓")); + } + Err(e) => { + eprintln!("Warning: tagging failed for #{}: {}", rowid, e); + skipped += 1; + pb.set_message(format!("{} failed", "✘")); + } + } + } + None => { + eprintln!("Warning: #{} has no content, skipping", rowid); + skipped += 1; + pb.set_message(format!("{} no content", "✘")); + } + } + } + Ok(None) => { + eprintln!("Warning: bookmark #{} not found", rowid); + skipped += 1; + } + Err(e) => { + eprintln!("Warning: failed to read bookmark #{}: {}", rowid, e); + skipped += 1; + } + } + pb.inc(1); + } + + pb.finish_with_message(format!("{} {} tagged, {} {} skipped", "✔", tagged, 
"✘", skipped)); + } + Command::InitConfig => { + let path = config_path.clone().unwrap_or_else(|| config_file_path()); + if path.exists() { + eprintln!("Config file already exists at {:?}", path); + return; + } + let default_db = storage::default_db_path(); + let home = std::env::var("HOME").unwrap_or_default(); + let default_display = default_db.to_string_lossy(); + let default_display = if let Some(rest) = default_display.strip_prefix(&home) { + format!("~{}", rest) + } else { + default_display.to_string() + }; + let content = format!( + "# SearchHub configuration\n\ + \n\ + # Bookmark database path (default: platform data directory)\n\ + # db_path = \"{}\"\n\ + \n\ + # Custom tags override the built-in defaults\n\ + # [[tags]]\n\ + # name = \"my-custom-tag\"\n\ + # examples = [\"example text one\", \"example text two\"]\n\ + \n\ + # Which external search engines to use (default: [\"crates.io\"])\n\ + # enabled_engines = [\"crates.io\"]\n\ + \n\ + # Minimum confidence for auto-tagging (0.0 to 1.0, default: 0.6)\n\ + # tagging_threshold = 0.6\n\ + \n\ + # Hosts to skip when fetching content for bookmarking (default: localhost addresses)\n\ + # exclude_urls = [\"localhost\", \"127.0.0.1\", \"::1\"]\n\ + \n\ + # Per-engine configuration (optional)\n\ + # [engines.searxng]\n\ + # instance = \"https://search.kael.ink\"\n\ + # Best: use an existing public instance (see https://searx.space).\n\ + # Also possible: run your own with Docker:\n\ + # docker run -d --name searxng -p 8888:8080 searxng/searxng\n", + + default_display + ); + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await.expect("Failed to create config directory"); + } + tokio::fs::write(&path, content).await.expect("Failed to write config file"); + println!("Default config created at {:?}", path); + } + Command::Import { action } => { + match action { + ImportAction::Bookmarks { source, profile, db_path } => { + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); 
+ run_import(&source, profile, &db_path.to_string_lossy(), tags.clone(), tag_threshold, &exclude_hosts, ImportKind::Bookmarks).await; + } + ImportAction::History { source, profile, db_path } => { + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); + run_import(&source, profile, &db_path.to_string_lossy(), tags.clone(), tag_threshold, &exclude_hosts, ImportKind::History).await; + } + ImportAction::All { source, profile, db_path } => { + let db_path = resolve_db_path(db_path, config.db_path.as_deref()); + run_import(&source, profile, &db_path.to_string_lossy(), tags.clone(), tag_threshold, &exclude_hosts, ImportKind::All).await; } } } } } -fn run_import(importer: impl Importer, profile: Option<String>, db_path: &str) { - let profile_path = match profile { - Some(p) => std::path::PathBuf::from(p), +enum ImportKind { Bookmarks, History, All } + +fn resolve_profiles(importer: &(impl Importer + ?Sized), profile: Option<String>) -> Vec<PathBuf> { + match profile { + Some(p) => vec![PathBuf::from(p)], None => { let profiles = importer.discover_profiles(); - match profiles.first() { - Some(p) => { - println!("Using {} profile: {:?}", importer.name(), p); - p.clone() + if profiles.is_empty() { + eprintln!("No {} profile found. Specify --profile.", importer.name()); + std::process::exit(1); + } + if profiles.len() == 1 { + println!("Using {} profile: {:?}", importer.name(), profiles[0]); + profiles + } else { + println!("Found {} {} profiles:", profiles.len(), importer.name()); + for (i, p) in profiles.iter().enumerate() { + println!(" {}. {:?}", i + 1, p); } - None => { - eprintln!("No {} profile found. 
Specify --profile.", importer.name()); + println!(); + print!("Enter numbers (comma-separated) or 'all': "); + std::io::Write::flush(&mut std::io::stdout()).unwrap(); + let mut input = String::new(); + std::io::stdin().read_line(&mut input).unwrap(); + let input = input.trim(); + if input.is_empty() { + eprintln!("No profiles selected."); std::process::exit(1); } + if input == "all" { + profiles + } else { + let selected: Vec<PathBuf> = input.split(',') + .filter_map(|s| { + let idx: usize = s.trim().parse().ok()?; + if idx >= 1 && idx <= profiles.len() { + Some(profiles[idx - 1].clone()) + } else { + None + } + }) + .collect(); + if selected.is_empty() { + eprintln!("No valid profiles selected."); + std::process::exit(1); + } + selected + } } } + } +} + +async fn run_import(source: &str, profile: Option<String>, db_path: &str, tags: Vec<TagDef>, tag_threshold: f32, exclude_hosts: &[String], kind: ImportKind) { + let importer: Box<dyn Importer> = match source { + "firefox" => Box::new(FirefoxImporter), + "zen" => Box::new(ZenImporter), + "chrome" | "chromium" => Box::new(ChromeImporter), + other => { + eprintln!("Unknown browser source: '{}'. 
Supported: firefox, zen, chrome, chromium", other); + std::process::exit(1); + } }; - match importer.import(&profile_path) { - Ok(bookmarks) => { - let conn = storage::init_db(db_path) - .expect("Failed to open target database"); - let total = bookmarks.len(); + let label = match kind { + ImportKind::Bookmarks => "bookmarks", + ImportKind::History => "history", + ImportKind::All => "bookmarks and history", + }; - let pb = ProgressBar::new(total as u64); - pb.enable_steady_tick(std::time::Duration::from_millis(100)); - pb.set_style( - ProgressStyle::default_bar() - .template("{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} {msg}") - .unwrap() - .progress_chars("##-"), - ); + let selected_profiles = resolve_profiles(importer.as_ref(), profile); - let mut imported = 0u64; - let mut skipped = 0u64; + let conn = storage::init_db(db_path) + .expect("Failed to open target database"); - for bm in bookmarks { - if storage::bookmark_exists_by_url(&conn, &bm.url).unwrap_or(false) { - skipped += 1; - pb.set_message(format!("{} skipped", "✗")); - } else { - storage::insert_bookmark(&conn, &bm) - .expect("Failed to insert bookmark"); - imported += 1; - pb.set_message(format!("{} imported", "✓")); + let mut all_entries = Vec::new(); + for profile_path in &selected_profiles { + let results: [anyhow::Result<Vec<Bookmark>>; 2] = match kind { + ImportKind::Bookmarks => [importer.import(profile_path), Ok(Vec::new())], + ImportKind::History => [importer.import_history(profile_path), Ok(Vec::new())], + ImportKind::All => [importer.import(profile_path), importer.import_history(profile_path)], + }; + for result in results { + match result { + Ok(mut entries) => all_entries.append(&mut entries), + Err(e) => { + eprintln!("Warning: failed to import {} from {:?}: {}", label, profile_path, e); } - pb.inc(1); } + } + } + + for entry in &mut all_entries { + entry.url = strip_fragment(&entry.url); + } + + let total = all_entries.len(); + if total == 0 { + println!("No {} 
found to import.", label); + return; + } + + // First pass: validate, deduplicate, insert + let pb = ProgressBar::new(total as u64); + pb.enable_steady_tick(std::time::Duration::from_millis(100)); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] {bar:40.cyan/blue} {pos}/{len} {msg}") + .unwrap() + .progress_chars("##-"), + ); - pb.finish_with_message(format!("{} {} imported, {} {} skipped (duplicates)", "✔", imported, "✘", skipped)); + let mut imported = 0u64; + let mut skipped = 0u64; + let mut invalid = 0u64; + let mut pending: Vec<(i64, String)> = Vec::new(); + + for entry in all_entries { + if url::Url::parse(&entry.url).is_err() { + invalid += 1; + pb.set_message(format!("{} invalid url", "✗")); + } else if storage::bookmark_exists_by_url(&conn, &entry.url).unwrap_or(false) { + skipped += 1; + pb.set_message(format!("{} skipped", "✗")); + } else { + let rowid = storage::insert_bookmark(&conn, &entry) + .expect("Failed to insert entry"); + imported += 1; + pending.push((rowid, entry.url.clone())); + pb.set_message(format!("{} inserted", "✓")); } + pb.inc(1); + } + + if pending.is_empty() { + pb.finish_with_message(format!("{} {} imported, {} {} skipped, {} {} invalid", "✔", imported, "✘", skipped, "⚠", invalid)); + return; + } + + pb.set_message("fetching and tagging..."); + + // Second pass: parallel fetch + tag via worker pool + let (tx, rx) = std::sync::mpsc::channel::<String>(); + let bar = Arc::new(pb.clone()); + let db = db_path.to_string(); + + let num_threads = std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(4) + .max(1); + let chunk_size = (pending.len() + num_threads - 1) / num_threads; + + let tasks: Vec<_> = pending.chunks(chunk_size).enumerate().map(|(task_id, chunk)| { + let owned: Vec<_> = chunk.to_vec(); + let tx = tx.clone(); + let bar = bar.clone(); + let db = db.clone(); + let task_tags = tags.clone(); + let task_threshold = tag_threshold; + let task_exclude = 
exclude_hosts.to_vec(); + tokio::task::spawn_blocking(move || { + let mut fetcher = match Fetcher::new() { + Ok(f) => f, + Err(e) => { + let _ = tx.send(format!("[{}] failed to create HTTP client: {}", task_id, e)); + return; + } + }; + let mut tagger = TaggingEngine::new(&task_tags, task_threshold).ok(); + let conn = storage::init_db(&db) + .expect("Failed to open target database"); + + for (rowid, url) in &owned { + if is_excluded_url(url, &task_exclude) { + bar.inc(1); + continue; + } + match fetch_and_convert(&mut fetcher, url, Some(task_id)) { + Some(md) => { + let entry_tags = tagger.as_mut() + .and_then(|e| e.tags_for(&md, 5).ok()) + .unwrap_or_default(); + let tags_str = if entry_tags.is_empty() { None } else { Some(entry_tags.join(", ")) }; + storage::update_bookmark_content_tags( + &conn, *rowid, Some(&md), tags_str.as_deref(), + ).unwrap_or_else(|e| { + let _ = tx.send(format!("failed to update bookmark {}: {}", rowid, e)); + }); + } + None => {} + } + bar.inc(1); + } + }) + }).collect(); + + // Read errors from channel, keep ringbuffer of last 10, print live + let error_handle = tokio::task::spawn_blocking(move || { + let mut error_buffer: VecDeque<String> = VecDeque::with_capacity(10); + while let Ok(err) = rx.recv() { + if error_buffer.len() < 10 { + error_buffer.push_back(err.clone()); + eprintln!("{}", err); + } else { + error_buffer.pop_front(); + error_buffer.push_back(err); + } + } + }); + + for task in tasks { + task.await.unwrap(); + } + drop(tx); + error_handle.await.unwrap(); + + pb.finish_with_message(format!("{} {} imported, {} {} skipped, {} {} invalid", "✔", imported, "✘", skipped, "⚠", invalid)); +} + +fn strip_fragment(url: &str) -> String { + url::Url::parse(url) + .ok() + .map(|mut u| { + u.set_fragment(None); + u.into() + }) + .unwrap_or_else(|| url.to_string()) +} + +fn is_excluded_url(url: &str, exclude_hosts: &[String]) -> bool { + url::Url::parse(url) + .ok() + .and_then(|u| match u.host() { + Some(url::Host::Domain(h)) => 
Some(h.to_lowercase()), + Some(url::Host::Ipv4(ip)) => Some(ip.to_string()), + Some(url::Host::Ipv6(ip)) => Some(ip.to_string()), + None => None, + }) + .map(|host| exclude_hosts.iter().any(|e| host == *e)) + .unwrap_or(false) +} + +fn fetch_and_convert(fetcher: &mut Fetcher, url: &str, task_id: Option<usize>) -> Option<String> { + let prefix = task_id.map(|id| format!("[importer-{}] ", id)).unwrap_or_default(); + match fetcher.fetch(url) { + Ok(html) => match htmd::convert(&html) { + Ok(md) => Some(md), + Err(e) => { + eprintln!("{}Warning: failed to convert HTML to Markdown: {}", prefix, e); + None + } + }, Err(e) => { - eprintln!("Import failed: {}", e); - std::process::exit(1); + eprintln!("{}Warning: failed to fetch URL '{}': {}", prefix, url, e); + None } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strip_fragment_removes_hash() { + assert_eq!(strip_fragment("https://example.com/page#section"), "https://example.com/page"); + } + + #[test] + fn strip_fragment_no_change_without_hash() { + assert_eq!(strip_fragment("https://example.com/page"), "https://example.com/page"); + } + + #[test] + fn strip_fragment_keeps_query() { + assert_eq!(strip_fragment("https://example.com/page?q=test#section"), "https://example.com/page?q=test"); + } + + #[test] + fn strip_fragment_empty_fragment() { + assert_eq!(strip_fragment("https://example.com/page#"), "https://example.com/page"); + } + + #[test] + fn strip_fragment_invalid_url_unchanged() { + assert_eq!(strip_fragment("not a url"), "not a url"); + } + + #[test] + fn is_excluded_matches_localhost() { + let hosts = vec!["localhost".into(), "127.0.0.1".into(), "::1".into()]; + assert!(is_excluded_url("http://localhost:8080/page", &hosts)); + assert!(is_excluded_url("http://127.0.0.1:3000/", &hosts)); + assert!(is_excluded_url("http://[::1]:8080/page", &hosts)); + } + + #[test] + fn is_excluded_does_not_match_real_domains() { + let hosts = vec!["localhost".into(), "127.0.0.1".into(), "::1".into()]; + 
assert!(!is_excluded_url("https://example.com", &hosts)); + assert!(!is_excluded_url("https://rust-lang.org", &hosts)); + } + + #[test] + fn is_excluded_empty_list_allows_all() { + let hosts: Vec<String> = vec![]; + assert!(!is_excluded_url("http://localhost:8080/", &hosts)); + } + + #[test] + fn is_excluded_custom_hosts() { + let hosts = vec!["my-internal.dev".into()]; + assert!(is_excluded_url("http://my-internal.dev/api", &hosts)); + assert!(!is_excluded_url("https://example.com", &hosts)); + } +} -
modified src/models.rs
diff --git a/src/models.rs b/src/models.rs index 0dc9fcf..3e505c8 100644 --- a/src/models.rs +++ b/src/models.rs @@ -1,12 +1,48 @@ use chrono::{DateTime, Utc}; use serde::Serialize; +/// A single entry in the local database (bookmark or history). +/// +/// Fields map directly to the FTS5 virtual table columns. The `id` field +/// corresponds to the FTS5 `rowid`, not the `id` content column. +/// +/// # Example +/// +/// ```rust +/// use search_hub::models::Bookmark; +/// use chrono::Utc; +/// +/// let bm = Bookmark { +/// id: 0, +/// title: "Example".into(), +/// url: "https://example.com".into(), +/// description: Some("A sample bookmark".into()), +/// source: "bookmark".into(), +/// content: None, +/// tags: Some("web, tutorial".into()), +/// created_at: Utc::now(), +/// }; +/// # let _ = bm; +/// ``` #[derive(Debug, Serialize)] pub struct Bookmark { + /// FTS5 rowid (auto-generated on insert; 0 for new entries). pub id: i32, + /// Human-readable page title. pub title: String, + /// The page URL. pub url: String, + /// Optional user-provided description. pub description: Option<String>, + /// Origin of the entry ("bookmark" or "history"). + pub source: String, + /// Page body converted to Markdown, if fetching succeeded. pub content: Option<String>, + /// Comma-separated auto-assigned tags, if tagging succeeded. + pub tags: Option<String>, + /// UTC timestamp of when the entry was stored. pub created_at: DateTime<Utc>, } + +/// Re-export of the search engine result type for external search results. +pub use crate::search_engines::ResultEntry as ExternalResult; -
added src/search_engines/crates_io.rs
diff --git a/src/search_engines/crates_io.rs b/src/search_engines/crates_io.rs new file mode 100644 index 0000000..9e34d8c --- /dev/null +++ b/src/search_engines/crates_io.rs @@ -0,0 +1,91 @@ +use async_trait::async_trait; +use serde::Deserialize; + +use crate::search_engines::{EngineError, ResultEntry, SearchEngine}; + +pub struct CratesIo; + +#[derive(Deserialize)] +struct CrateResult { + name: String, + description: Option<String>, + homepage: Option<String>, + documentation: Option<String>, + repository: Option<String>, +} + +#[derive(Deserialize)] +struct ApiResponse { + crates: Vec<CrateResult>, +} + +#[async_trait] +impl SearchEngine for CratesIo { + fn id(&self) -> &str { + "crates.io" + } + + fn name(&self) -> &str { + "crates.io" + } + + fn url_template(&self) -> &str { + "https://crates.io/api/v1/crates?q={}&per_page=10" + } + + fn selector(&self) -> &str { + "" + } + + async fn fetch_results( + &self, + query: &str, + client: &reqwest::Client, + ) -> Result<Vec<ResultEntry>, EngineError> { + let url = self.search_url(query); + let body = client + .get(&url) + .header("Accept", "application/json") + .send() + .await + .map_err(|e| EngineError(format!("fetch failed: {e}")))? 
+ .text() + .await + .map_err(|e| EngineError(format!("read body failed: {e}")))?; + + let resp: ApiResponse = serde_json::from_str(&body) + .map_err(|e| EngineError(format!("JSON parse failed: {e}")))?; + + let results: Vec<ResultEntry> = resp + .crates + .into_iter() + .map(|c| { + let title = format!("{}{}", + c.name, + c.description.as_ref().map(|d| format!(" - {d}")).unwrap_or_default(), + ); + let url = c + .homepage + .or(c.documentation) + .or(c.repository) + .unwrap_or_else(|| format!("https://crates.io/crates/{}", c.name)); + ResultEntry { + title, + url, + description: c.description, + engine: "crates.io".into(), + } + }) + .collect(); + + if results.is_empty() { + Err(EngineError("no results found".into())) + } else { + Ok(results) + } + } +} + +pub fn engine() -> CratesIo { + CratesIo +} -
added src/search_engines/mod.rs
diff --git a/src/search_engines/mod.rs b/src/search_engines/mod.rs new file mode 100644 index 0000000..216f8df --- /dev/null +++ b/src/search_engines/mod.rs @@ -0,0 +1,208 @@ +pub mod crates_io; +pub mod searxng; + +use async_trait::async_trait; +use scraper::{Html, Selector}; +use serde::Serialize; +use std::collections::HashSet; +use std::fmt; + +/// A single search result returned by an external search engine. +/// +/// # Example +/// +/// ```rust +/// use search_hub::search_engines::ResultEntry; +/// +/// let r = ResultEntry { +/// title: "Rust Lang".into(), +/// url: "https://rust-lang.org".into(), +/// description: Some("The Rust programming language".into()), +/// engine: "duckduckgo".into(), +/// }; +/// assert_eq!(r.engine, "duckduckgo"); +/// ``` +#[derive(Debug, Clone, Serialize)] +pub struct ResultEntry { + /// Result page title. + pub title: String, + /// Result page URL. + pub url: String, + /// Optional text snippet or description. + pub description: Option<String>, + /// Name of the search engine that returned this result. + pub engine: String, +} + +/// Error type for engine fetch and parse operations. +#[derive(Debug)] +pub struct EngineError(pub String); + +impl fmt::Display for EngineError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::error::Error for EngineError {} + +/// Trait for external search engine integrations. +/// +/// Implementors define the engine's metadata (`id`, `name`, `url_template`, +/// `selector`) and optionally override `fetch_results` or `parse_results` +/// for custom behavior. The default `fetch_results` fetches the search URL +/// via `reqwest` and delegates to `parse_results`. The default +/// `parse_results` uses the CSS `selector` to find the result container +/// and extracts `<a>` links from it (deduplicated by URL, max 10 results, +/// title must be at least 4 characters). 
+/// +/// # Example +/// +/// ```rust +/// use search_hub::search_engines::{SearchEngine, ResultEntry, EngineError}; +/// use async_trait::async_trait; +/// +/// struct ExampleEngine; +/// +/// #[async_trait] +/// impl SearchEngine for ExampleEngine { +/// fn id(&self) -> &str { "example" } +/// fn name(&self) -> &str { "Example" } +/// fn url_template(&self) -> &str { "https://example.com/search?q={}" } +/// fn selector(&self) -> &str { "div.results" } +/// } +/// +/// let e = ExampleEngine; +/// assert_eq!(e.id(), "example"); +/// assert_eq!(e.search_url("test"), "https://example.com/search?q=test"); +/// ``` +#[async_trait] +pub trait SearchEngine: Send + Sync { + /// Unique identifier for this engine (e.g. "duckduckgo"). + fn id(&self) -> &str; + /// Human-readable display name (e.g. "DuckDuckGo"). + fn name(&self) -> &str; + /// URL template with `{}` placeholder for the query string. + fn url_template(&self) -> &str; + /// CSS selector targeting the result container in the engine's HTML page. + fn selector(&self) -> &str; + + /// Build a search URL from the given query by replacing `{}` with the + /// URL-encoded query string. + fn search_url(&self, query: &str) -> String { + self.url_template().replace("{}", &urlencode(query)) + } + + /// Fetch search results from the engine for the given query. + /// + /// Default implementation: builds the search URL via `self.search_url()`, + /// fetches the page via the provided `reqwest::Client`, then delegates to + /// `self.parse_results()`. + async fn fetch_results( + &self, + query: &str, + client: &reqwest::Client, + ) -> Result<Vec<ResultEntry>, EngineError> { + let url = self.search_url(query); + let html = client + .get(&url) + .send() + .await + .map_err(|e| EngineError(format!("fetch failed: {e}")))? + .text() + .await + .map_err(|e| EngineError(format!("read body failed: {e}")))?; + self.parse_results(&html) + } + + /// Parse search results from raw HTML. 
+ /// + /// Default implementation: uses `self.selector()` to find the result + /// container with `scraper`, extracts `<a>` links from it, deduplicates + /// by URL, filters to HTTP links with title >= 4 characters, and returns + /// at most 10 results. + fn parse_results(&self, html: &str) -> Result<Vec<ResultEntry>, EngineError> { + let doc = Html::parse_document(html); + let sel = Selector::parse(self.selector()) + .map_err(|e| EngineError(format!("bad selector: {e}")))?; + let link_sel = Selector::parse("a[href]") + .map_err(|e| EngineError(format!("bad link selector: {e}")))?; + + let container = doc + .select(&sel) + .next() + .ok_or_else(|| EngineError("no container matched".into()))?; + + let mut results = Vec::new(); + let mut seen = HashSet::new(); + + for link in container.select(&link_sel) { + let href = match link.value().attr("href") { + Some(h) => h.to_string(), + None => continue, + }; + let title: String = link.text().collect::<String>().trim().to_string(); + + if title.len() < 4 || href.is_empty() { + continue; + } + if !href.starts_with("http") { + continue; + } + if !seen.insert(href.clone()) { + continue; + } + if results.len() >= 10 { + break; + } + + results.push(ResultEntry { + title, + url: href, + description: None, + engine: self.name().to_string(), + }); + } + + if results.is_empty() { + Err(EngineError("no results found".into())) + } else { + Ok(results) + } + } +} + +/// Return the default set of search engines. +/// +/// These can be enabled or disabled via the `enabled_engines` config field. +/// DuckDuckGo, lib.rs, and StackOverflow were removed from defaults because +/// they now block automated requests. crates.io works via its public JSON API. 
+/// +/// # Example +/// +/// ```rust +/// let engines = search_hub::search_engines::default_search_engines(); +/// assert_eq!(engines.len(), 1); +/// assert_eq!(engines[0].id(), "crates.io"); +/// ``` +pub fn default_search_engines() -> Vec<Box<dyn SearchEngine>> { + vec![ + Box::new(crates_io::CratesIo), + ] +} + +fn urlencode(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for byte in s.bytes() { + match byte { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => { + out.push(byte as char); + } + b' ' => out.push_str("+"), + _ => { + out.push_str(&format!("%{:02X}", byte)); + } + } + } + out +} -
added src/search_engines/searxng.rs
diff --git a/src/search_engines/searxng.rs b/src/search_engines/searxng.rs new file mode 100644 index 0000000..573c108 --- /dev/null +++ b/src/search_engines/searxng.rs @@ -0,0 +1,93 @@ +use async_trait::async_trait; +use serde::Deserialize; + +use crate::search_engines::{EngineError, ResultEntry, SearchEngine}; + +pub struct SearXng { + pub instance: String, + pub url_tpl: String, +} + +impl SearXng { + pub fn from_config(config: &toml::Table) -> Option<Box<dyn SearchEngine>> { + let instance = config.get("instance")?.as_str()?.to_string(); + let url_tpl = format!("{}/search?format=json&q={{}}", instance.trim_end_matches('/')); + Some(Box::new(SearXng { instance, url_tpl })) + } +} + +#[derive(Deserialize)] +struct SearXngResult { + title: Option<String>, + url: Option<String>, + content: Option<String>, + engine: Option<String>, +} + +#[derive(Deserialize)] +struct SearXngResponse { + results: Vec<SearXngResult>, +} + +#[async_trait] +impl SearchEngine for SearXng { + fn id(&self) -> &str { + "searxng" + } + + fn name(&self) -> &str { + "SearXNG" + } + + fn url_template(&self) -> &str { + &self.url_tpl + } + + fn selector(&self) -> &str { + "" + } + + async fn fetch_results( + &self, + query: &str, + client: &reqwest::Client, + ) -> Result<Vec<ResultEntry>, EngineError> { + let url = self.search_url(query); + let body = client + .get(&url) + .header("Accept", "application/json") + .send() + .await + .map_err(|e| EngineError(format!("searxng fetch failed: {e}")))? 
+ .text() + .await + .map_err(|e| EngineError(format!("searxng read body failed: {e}")))?; + + let resp: SearXngResponse = serde_json::from_str(&body) + .map_err(|e| EngineError(format!("searxng JSON parse failed: {e}")))?; + + let results: Vec<ResultEntry> = resp + .results + .into_iter() + .filter_map(|r| { + let title = r.title.unwrap_or_default(); + let url = r.url?; + if title.is_empty() { + return None; + } + Some(ResultEntry { + title, + url, + description: r.content, + engine: r.engine.unwrap_or_else(|| "searxng".into()), + }) + }) + .collect(); + + if results.is_empty() { + Err(EngineError("no results found".into())) + } else { + Ok(results) + } + } +} -
modified src/storage.rs
diff --git a/src/storage.rs b/src/storage.rs index cac00e5..52daf51 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -3,6 +3,22 @@ use crate::models::Bookmark; use chrono::Utc; use std::path::PathBuf; +/// Return the platform-appropriate path for the bookmark database. +/// +/// On Linux this is `~/.local/share/search_hub/bookmarks.db`. The parent +/// directory is created if it does not exist. +/// +/// # Example +/// +/// ```ignore +/// let path = search_hub::storage::default_db_path(); +/// println!("{}", path.display()); +/// ``` +/// +/// # Panics +/// +/// Panics if the platform has no valid data directory or the directory +/// cannot be created. pub fn default_db_path() -> PathBuf { let dirs = directories::ProjectDirs::from("com", "search_hub", "search_hub") .expect("no valid data directory"); @@ -11,35 +27,54 @@ pub fn default_db_path() -> PathBuf { data_dir.join("bookmarks.db") } +/// Open (or create) the database at `path` and ensure the FTS5 table and +/// all required columns exist, running migrations if needed. +/// +/// # Example +/// +/// ```ignore +/// let conn = search_hub::storage::init_db("/tmp/test.db") +/// .expect("database init"); +/// ``` +/// +/// # Parameters +/// +/// * `path` - Filesystem path to the SQLite database file. +/// +/// # Returns +/// +/// A rusqlite `Connection` with the FTS5 table ready. +/// +/// # Panics +/// +/// May panic if the database cannot be opened or migrations fail. pub fn init_db(path: &str) -> Result<Connection> { let conn = Connection::open(path)?; conn.execute_batch( "CREATE VIRTUAL TABLE IF NOT EXISTS bookmarks USING fts5( - id UNINDEXED, title, url, description, content, created_at UNINDEXED + id UNINDEXED, title, url, description, source, content, tags, created_at UNINDEXED );", )?; // Migration: add content column to databases created before the column existed. 
- let has_content = conn + let cols = conn .prepare("PRAGMA table_info(bookmarks)") .and_then(|mut stmt| { let cols: Vec<String> = stmt .query_map([], |row| row.get::<_, String>(1))? .collect::<Result<Vec<_>>>()?; - Ok(cols.iter().any(|c| c == "content")) + Ok(cols) }) - .unwrap_or(false); + .unwrap_or_default(); - if !has_content { - // FTS5 supports ALTER TABLE ADD COLUMN since SQLite 3.25.0. + if !cols.iter().any(|c| c == "content") { if conn.execute("ALTER TABLE bookmarks ADD COLUMN content TEXT", []).is_err() { - // Fallback: rebuild the table entirely. conn.execute_batch( "CREATE TABLE bookmarks_backup AS SELECT rowid, id, title, url, description, created_at FROM bookmarks; DROP TABLE IF EXISTS bookmarks; CREATE VIRTUAL TABLE bookmarks USING fts5( - id UNINDEXED, title, url, description, content, created_at UNINDEXED + id UNINDEXED, title, url, description, source, content, tags, created_at UNINDEXED ); INSERT INTO bookmarks (rowid, id, title, url, description, created_at) SELECT rowid, id, title, url, description, created_at FROM bookmarks_backup; @@ -48,34 +83,150 @@ pub fn init_db(path: &str) -> Result<Connection> { } } + if !cols.iter().any(|c| c == "tags") { + if conn.execute("ALTER TABLE bookmarks ADD COLUMN tags TEXT", []).is_err() { + let _existing_cols = cols.join(", "); + conn.execute_batch( + &format!("CREATE TABLE bookmarks_backup AS SELECT rowid, id, title, url, description, {}, created_at FROM bookmarks; + DROP TABLE IF EXISTS bookmarks; + CREATE VIRTUAL TABLE bookmarks USING fts5( + id UNINDEXED, title, url, description, source, content, tags, created_at UNINDEXED + ); + INSERT INTO bookmarks (rowid, id, title, url, description, {}, created_at) + SELECT rowid, id, title, url, description, {}, created_at FROM bookmarks_backup; + DROP TABLE bookmarks_backup;", + if cols.iter().any(|c| c == "content") { "content" } else { "NULL as content" }, + if cols.iter().any(|c| c == "content") { "content, tags" } else { "tags" }, + if cols.iter().any(|c| c 
== "content") { "content, NULL as tags" } else { "NULL as content" }, + ), + )?; + } + } + + if !cols.iter().any(|c| c == "source") { + if conn.execute("ALTER TABLE bookmarks ADD COLUMN source TEXT", []).is_err() { + conn.execute_batch( + "CREATE TABLE bookmarks_backup AS SELECT rowid, id, title, url, description, content, tags, created_at FROM bookmarks; + DROP TABLE IF EXISTS bookmarks; + CREATE VIRTUAL TABLE bookmarks USING fts5( + id UNINDEXED, title, url, description, source, content, tags, created_at UNINDEXED + ); + INSERT INTO bookmarks (rowid, id, title, url, description, content, tags, created_at) + SELECT rowid, id, title, url, description, content, tags, created_at FROM bookmarks_backup; + DROP TABLE bookmarks_backup;", + )?; + } + } + Ok(conn) } +/// Check whether a bookmark with the given URL already exists. +/// +/// # Example +/// +/// ```ignore +/// let exists = search_hub::storage::bookmark_exists_by_url(&conn, "https://example.com") +/// .expect("query failed"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `url` - The URL to check. +/// +/// # Returns +/// +/// `true` if at least one row with that URL exists. pub fn bookmark_exists_by_url(conn: &Connection, url: &str) -> Result<bool> { let mut stmt = conn.prepare("SELECT COUNT(*) FROM bookmarks WHERE url = ?")?; let count: i64 = stmt.query_row(params![url], |row| row.get(0))?; Ok(count > 0) } -pub fn insert_bookmark(conn: &Connection, bookmark: &Bookmark) -> Result<()> { +/// Insert a new bookmark row and return its FTS5 rowid. +/// +/// The `created_at` field on the passed struct is **ignored**; the current +/// timestamp is always used. +/// +/// # Example +/// +/// ```ignore +/// let rowid = search_hub::storage::insert_bookmark(&conn, &my_bookmark) +/// .expect("insert"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `bookmark` - The bookmark data to insert. 
+/// +/// # Returns +/// +/// The newly-assigned FTS5 rowid. +pub fn insert_bookmark(conn: &Connection, bookmark: &Bookmark) -> Result<i64> { let now = Utc::now(); conn.execute( - "INSERT INTO bookmarks (title, url, description, content, created_at) VALUES (?, ?, ?, ?, ?)", - params![bookmark.title, bookmark.url, bookmark.description, bookmark.content, now], + "INSERT INTO bookmarks (title, url, description, source, content, tags, created_at) VALUES (?, ?, ?, ?, ?, ?, ?)", + params![bookmark.title, bookmark.url, bookmark.description, bookmark.source, bookmark.content, bookmark.tags, now], + )?; + Ok(conn.last_insert_rowid()) +} + +/// Update the `content` and `tags` columns for an existing bookmark. +/// +/// # Example +/// +/// ```ignore +/// search_hub::storage::update_bookmark_content_tags(&conn, rowid, Some(md), Some("web, rust")) +/// .expect("update"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `rowid` - The FTS5 rowid of the bookmark to update. +/// * `content` - New Markdown content (or `None` to clear). +/// * `tags` - New comma-separated tags (or `None` to clear). +pub fn update_bookmark_content_tags(conn: &Connection, rowid: i64, content: Option<&str>, tags: Option<&str>) -> Result<()> { + conn.execute( + "UPDATE bookmarks SET content = ?, tags = ? WHERE rowid = ?", + params![content, tags, rowid], )?; Ok(()) } -pub fn list_bookmarks(conn: &Connection) -> Result<Vec<Bookmark>> { - let mut stmt = conn.prepare("SELECT rowid, title, url, description, content, created_at FROM bookmarks")?; - let book_iter = stmt.query_map([], |row| { +/// Return bookmarks from the database with pagination. +/// +/// # Examples +/// +/// ```ignore +/// let all = search_hub::storage::list_bookmarks(&conn, 1, 20) +/// .expect("list"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `page` - 1-indexed page number. +/// * `page_size` - Results per page. 
+/// +/// # Returns +/// +/// A `Vec<Bookmark>` with the page of rows. +pub fn list_bookmarks(conn: &Connection, page: usize, page_size: usize) -> Result<Vec<Bookmark>> { + let offset = (page.saturating_sub(1)) * page_size; + let mut stmt = conn.prepare("SELECT rowid, title, url, description, source, content, tags, created_at FROM bookmarks ORDER BY rowid LIMIT ? OFFSET ?")?; + let book_iter = stmt.query_map(params![page_size as i64, offset as i64], |row| { Ok(Bookmark { id: row.get(0)?, title: row.get(1)?, url: row.get(2)?, description: row.get(3)?, - content: row.get(4)?, - created_at: row.get(5)?, + source: row.get(4)?, + content: row.get(5)?, + tags: row.get(6)?, + created_at: row.get(7)?, }) })?; @@ -86,16 +237,38 @@ pub fn list_bookmarks(conn: &Connection) -> Result<Vec<Bookmark>> { Ok(bookmarks) } -pub fn search_bookmarks(conn: &Connection, query: &str) -> Result<Vec<Bookmark>> { - let mut stmt = conn.prepare("SELECT rowid, title, url, description, content, created_at FROM bookmarks WHERE bookmarks MATCH ?")?; - let book_iter = stmt.query_map(params![query], |row| { +/// Full-text search the bookmarks table using the FTS5 MATCH syntax. +/// +/// # Example +/// +/// ```ignore +/// let results = search_hub::storage::search_bookmarks(&conn, "rust tutorial", 1, 20) +/// .expect("search"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `query` - An FTS5 search query string. +/// * `page` - 1-indexed page number. +/// * `page_size` - Results per page. +/// +/// # Returns +/// +/// A `Vec<Bookmark>` matching the query (empty if none match). +pub fn search_bookmarks(conn: &Connection, query: &str, page: usize, page_size: usize) -> Result<Vec<Bookmark>> { + let offset = (page.saturating_sub(1)) * page_size; + let mut stmt = conn.prepare("SELECT rowid, title, url, description, source, content, tags, created_at FROM bookmarks WHERE bookmarks MATCH ? ORDER BY rank LIMIT ? 
OFFSET ?")?; + let book_iter = stmt.query_map(params![query, page_size as i64, offset as i64], |row| { Ok(Bookmark { id: row.get(0)?, title: row.get(1)?, url: row.get(2)?, description: row.get(3)?, - content: row.get(4)?, - created_at: row.get(5)?, + source: row.get(4)?, + content: row.get(5)?, + tags: row.get(6)?, + created_at: row.get(7)?, }) })?; @@ -106,7 +279,208 @@ pub fn search_bookmarks(conn: &Connection, query: &str) -> Result<Vec<Bookmark>> Ok(bookmarks) } +/// Count bookmarks matching an FTS5 query. +pub fn count_search_bookmarks(conn: &Connection, query: &str) -> Result<usize> { + let mut stmt = conn.prepare("SELECT COUNT(*) FROM bookmarks WHERE bookmarks MATCH ?")?; + let count: i64 = stmt.query_row(params![query], |row| row.get(0))?; + Ok(count as usize) +} + +/// Fetch a single bookmark by its FTS5 rowid. +/// +/// # Example +/// +/// ```ignore +/// if let Some(bm) = search_hub::storage::get_bookmark(&conn, 1).expect("query") { +/// println!("{}", bm.title); +/// } +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `rowid` - The FTS5 rowid to look up. +/// +/// # Returns +/// +/// `Some(Bookmark)` if found, or `None` if no row matches. +pub fn get_bookmark(conn: &Connection, rowid: i64) -> Result<Option<Bookmark>> { + let mut stmt = conn.prepare("SELECT rowid, title, url, description, source, content, tags, created_at FROM bookmarks WHERE rowid = ?")?; + let mut rows = stmt.query_map(params![rowid], |row| { + Ok(Bookmark { + id: row.get(0)?, + title: row.get(1)?, + url: row.get(2)?, + description: row.get(3)?, + source: row.get(4)?, + content: row.get(5)?, + tags: row.get(6)?, + created_at: row.get(7)?, + }) + })?; + match rows.next() { + Some(Ok(b)) => Ok(Some(b)), + Some(Err(e)) => Err(e), + None => Ok(None), + } +} + +/// Update the `tags` column for an existing bookmark (leaving content unchanged). 
+/// +/// # Example +/// +/// ```ignore +/// search_hub::storage::update_bookmark_tags(&conn, rowid, Some("web, rust")) +/// .expect("update"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `rowid` - The FTS5 rowid of the bookmark to update. +/// * `tags` - New comma-separated tags (or `None` to clear). +pub fn update_bookmark_tags(conn: &Connection, rowid: i64, tags: Option<&str>) -> Result<()> { + conn.execute( + "UPDATE bookmarks SET tags = ? WHERE rowid = ?", + params![tags, rowid], + )?; + Ok(()) +} + +/// Count all bookmarks in the database. +pub fn count_bookmarks(conn: &Connection) -> Result<usize> { + let mut stmt = conn.prepare("SELECT COUNT(*) FROM bookmarks")?; + let count: i64 = stmt.query_row([], |row| row.get(0))?; + Ok(count as usize) +} + +/// Delete a bookmark by its FTS5 rowid. +/// +/// # Example +/// +/// ```ignore +/// search_hub::storage::delete_bookmark(&conn, 1) +/// .expect("delete"); +/// ``` +/// +/// # Parameters +/// +/// * `conn` - An open database connection. +/// * `id` - The FTS5 rowid to delete. 
pub fn delete_bookmark(conn: &Connection, id: i32) -> Result<()> { conn.execute("DELETE FROM bookmarks WHERE rowid = ?", params![id])?; Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + fn test_db() -> Connection { + init_db(":memory:").expect("init") + } + + fn sample_bm() -> Bookmark { + Bookmark { + id: 0, + title: "Test Page".into(), + url: "https://example.com".into(), + description: Some("A test".into()), + source: "bookmark".into(), + content: Some("some markdown content".into()), + tags: Some("rust, test".into()), + created_at: Utc::now(), + } + } + + #[test] + fn insert_and_list() { + let conn = test_db(); + insert_bookmark(&conn, &sample_bm()).expect("insert"); + let all = list_bookmarks(&conn, 1, 100).expect("list"); + assert_eq!(all.len(), 1); + assert_eq!(all[0].title, "Test Page"); + assert_eq!(all[0].url, "https://example.com"); + assert_eq!(all[0].tags.as_deref(), Some("rust, test")); + } + + #[test] + fn insert_and_search() { + let conn = test_db(); + insert_bookmark(&conn, &sample_bm()).expect("insert"); + let results = search_bookmarks(&conn, "Test", 1, 100).expect("search"); + assert_eq!(results.len(), 1); + assert_eq!(results[0].title, "Test Page"); + } + + #[test] + fn search_no_match() { + let conn = test_db(); + insert_bookmark(&conn, &sample_bm()).expect("insert"); + let results = search_bookmarks(&conn, "zzznotfound", 1, 100).expect("search"); + assert!(results.is_empty()); + } + + #[test] + fn insert_then_get() { + let conn = test_db(); + let rowid = insert_bookmark(&conn, &sample_bm()).expect("insert"); + let bm = get_bookmark(&conn, rowid).expect("get"); + assert!(bm.is_some()); + assert_eq!(bm.unwrap().url, "https://example.com"); + } + + #[test] + fn get_nonexistent() { + let conn = test_db(); + let bm = get_bookmark(&conn, 999).expect("get"); + assert!(bm.is_none()); + } + + #[test] + fn insert_then_delete() { + let conn = test_db(); + let rowid = insert_bookmark(&conn, &sample_bm()).expect("insert"); + delete_bookmark(&conn, 
rowid as i32).expect("delete"); + let all = list_bookmarks(&conn, 1, 100).expect("list"); + assert!(all.is_empty()); + } + + #[test] + fn update_content_and_tags() { + let conn = test_db(); + let rowid = insert_bookmark(&conn, &sample_bm()).expect("insert"); + update_bookmark_content_tags(&conn, rowid, Some("new content"), Some("web, updated")).expect("update"); + let bm = get_bookmark(&conn, rowid).expect("get").unwrap(); + assert_eq!(bm.content.as_deref(), Some("new content")); + assert_eq!(bm.tags.as_deref(), Some("web, updated")); + } + + #[test] + fn update_tags_only() { + let conn = test_db(); + let rowid = insert_bookmark(&conn, &sample_bm()).expect("insert"); + update_bookmark_tags(&conn, rowid, Some("only-tags")).expect("update"); + let bm = get_bookmark(&conn, rowid).expect("get").unwrap(); + assert_eq!(bm.tags.as_deref(), Some("only-tags")); + assert_eq!(bm.content.as_deref(), Some("some markdown content")); + } + + #[test] + fn exists_by_url() { + let conn = test_db(); + insert_bookmark(&conn, &sample_bm()).expect("insert"); + assert!(bookmark_exists_by_url(&conn, "https://example.com").expect("exists")); + assert!(!bookmark_exists_by_url(&conn, "https://other.com").expect("exists")); + } + + #[test] + fn list_multiple_bookmarks() { + let conn = test_db(); + let b1 = Bookmark { title: "First".into(), url: "https://a.com".into(), ..sample_bm() }; + let b2 = Bookmark { title: "Second".into(), url: "https://b.com".into(), ..sample_bm() }; + insert_bookmark(&conn, &b1).expect("insert"); + insert_bookmark(&conn, &b2).expect("insert"); + assert_eq!(list_bookmarks(&conn, 1, 100).expect("list").len(), 2); + } +} -
added src/tagging.rs
diff --git a/src/tagging.rs b/src/tagging.rs new file mode 100644 index 0000000..ecb4e5d --- /dev/null +++ b/src/tagging.rs @@ -0,0 +1,360 @@ +use fastembed::{EmbeddingModel, TextEmbedding, TextInitOptions}; +use fastembed::similarity::cosine_similarity; +use serde::Deserialize; + +/// A named tag with example texts used for semantic similarity scoring. +/// +/// # Example +/// +/// ```rust +/// use search_hub::tagging::TagDef; +/// +/// let tag = TagDef { +/// name: "rust".into(), +/// examples: vec!["Rust ownership".into(), "cargo build system".into()], +/// }; +/// assert_eq!(tag.name, "rust"); +/// ``` +#[derive(Debug, Clone, Deserialize)] +pub struct TagDef { + /// The tag label (e.g. "rust", "web"). + pub name: String, + /// Example phrases that exemplify this tag for embedding comparison. + pub examples: Vec<String>, +} + +/// Return the hardcoded default set of 25 tags with 3 example texts each. +/// +/// # Example +/// +/// ```rust +/// use search_hub::tagging::default_tags; +/// +/// let tags = default_tags(); +/// assert_eq!(tags.len(), 25); +/// assert_eq!(tags[0].name, "rust"); +/// ``` +pub fn default_tags() -> Vec<TagDef> { + vec![ + TagDef { name: "rust".into(), examples: vec![ + "Rust ownership and borrow checker enforcing memory safety at compile time".into(), + "pattern matching with enums and the Result type for error handling".into(), + "cargo build system, crates.io ecosystem, and procedural macros".into(), + ]}, + TagDef { name: "python".into(), examples: vec![ + "Python indentation-based syntax, list comprehensions, and generator expressions".into(), + "dynamic typing, duck typing, and Python's data model protocols".into(), + "pip packaging, virtual environments, and Python import system".into(), + ]}, + TagDef { name: "web".into(), examples: vec![ + "HTML semantic markup, accessibility attributes, and document structure".into(), + "CSS layout with flexbox and grid, responsive design with media queries".into(), + "DOM manipulation, event 
bubbling, and Web API interfaces in the browser".into(), + ]}, + TagDef { name: "audio".into(), examples: vec![ + "music streaming, albums, playlists, and artist discovery".into(), + "podcast episodes, RSS feeds, and audio content distribution".into(), + "radio stations, live broadcasts, and audio programming".into(), + ]}, + TagDef { name: "backend".into(), examples: vec![ + "HTTP server routing, request handling, and response middleware chains".into(), + "connection pooling, ORM patterns, and server-side template rendering".into(), + "backend service architecture, message queues, and inter-service communication".into(), + ]}, + TagDef { name: "devops".into(), examples: vec![ + "container images, Dockerfiles, and Kubernetes pod orchestration".into(), + "infrastructure provisioning with Terraform and configuration management".into(), + "CI/CD build pipelines, artifact management, and deployment strategies".into(), + ]}, + TagDef { name: "data".into(), examples: vec![ + "data frame operations, statistical analysis, and numerical computing".into(), + "data visualization with plotting libraries and charting techniques".into(), + "ETL workflows, data cleaning, and batch processing pipelines".into(), + ]}, + TagDef { name: "ai".into(), examples: vec![ + "transformer attention mechanisms, tokenization, and embedding layers".into(), + "gradient descent, backpropagation, and neural network loss functions".into(), + "model quantization, fine-tuning strategies, and inference optimization".into(), + ]}, + TagDef { name: "linux".into(), examples: vec![ + "file permission bits, process management, and signal handling".into(), + "piping stdout, redirecting file descriptors, and shell expansion rules".into(), + "package managers, init systems, and systemd unit files".into(), + ]}, + TagDef { name: "security".into(), examples: vec![ + "authentication tokens, OAuth flows, and JWT session handling".into(), + "input sanitization, parameterized queries, and XSS/CSRF 
prevention".into(), + "certificate authorities, TLS handshakes, and mTLS configurations".into(), + ]}, + TagDef { name: "design".into(), examples: vec![ + "design tokens, component libraries, and design system consistency".into(), + "typographic scale, whitespace rhythm, and visual hierarchy principles".into(), + "color contrast, WCAG accessibility ratios, and responsive breakpoints".into(), + ]}, + TagDef { name: "mobile".into(), examples: vec![ + "touch gesture handling, viewport sizing, and responsive mobile layouts".into(), + "app lifecycle, push notifications, and background task management".into(), + "native platform APIs, mobile sensors, and cross-platform mobile frameworks".into(), + ]}, + TagDef { name: "gaming".into(), examples: vec![ + "game loop architecture, frame-rate independence, and delta time".into(), + "physics simulation, collision detection, and spatial partitioning".into(), + "shader programs, GPU rendering pipeline, and 3D transformations".into(), + ]}, + TagDef { name: "tutorial".into(), examples: vec![ + "beginner-friendly walkthroughs with code examples and expected output".into(), + "learning objectives, prerequisite knowledge, and progressive skill building".into(), + "interactive code playgrounds, exercises, and quiz-based reinforcement".into(), + ]}, + TagDef { name: "news".into(), examples: vec![ + "version bumps, deprecation timelines, and migration announcements".into(), + "community announcements, conference talks, and ecosystem updates".into(), + "release notes, changelogs, and feature release highlights".into(), + ]}, + TagDef { name: "video".into(), examples: vec![ + "video streaming platforms, channels, and content creation".into(), + "video editing, encoding formats, and transcoding workflows".into(), + "live streaming, video on demand, and media playback".into(), + ]}, + TagDef { name: "tools".into(), examples: vec![ + "text editor configuration, IDE plugins, and developer workflow tooling".into(), + "version control 
workflows, git branching strategies, and merge patterns".into(), + "debugger breakpoints, profiling tools, and performance tracing utilities".into(), + ]}, + TagDef { name: "database".into(), examples: vec![ + "SQL table schemas, foreign key relationships, and constraint design".into(), + "index structures, query plan analysis, and query performance tuning".into(), + "ACID transactions, isolation levels, and connection pool configuration".into(), + ]}, + TagDef { name: "cli".into(), examples: vec![ + "command argument parsing, subcommand patterns, and flag conventions".into(), + "terminal output formatting, colored logging, and progress indicators".into(), + "stdin/stdout pipes, exit codes, and shell completion scripts".into(), + ]}, + TagDef { name: "social".into(), examples: vec![ + "social media platforms, feeds, and community discussions".into(), + "user profiles, followers, and content sharing features".into(), + "messaging systems, real-time chat, and social networking APIs".into(), + ]}, + TagDef { name: "testing".into(), examples: vec![ + "unit test assertions, test fixtures, and parametrized test cases".into(), + "mocking external dependencies, test doubles, and fake implementations".into(), + "integration tests, end-to-end testing, and continuous testing in CI".into(), + ]}, + TagDef { name: "javascript".into(), examples: vec![ + "JavaScript closures, prototypal inheritance, and the event loop".into(), + "async/await patterns, Promise chaining, and callback conventions".into(), + "ES modules, npm packages, and JavaScript bundler tooling".into(), + ]}, + TagDef { name: "api".into(), examples: vec![ + "RESTful resource design, URL patterns, and HTTP method semantics".into(), + "request validation, error response formatting, and status code conventions".into(), + "API versioning, rate limiting, and OpenAPI specification documents".into(), + ]}, + TagDef { name: "documentation".into(), examples: vec![ + "API reference docs, docstrings, and inline code 
annotations".into(), + "architecture decision records and design documentation practices".into(), + "README writing, project wikis, and onboarding guides for contributors".into(), + ]}, + TagDef { name: "productivity".into(), examples: vec![ + "habit tracking, time management, and personal workflow optimization".into(), + "note-taking systems, knowledge base management, and personal wikis".into(), + "task organization, prioritization frameworks, and automation of repetitive work".into(), + ]}, + ] +} + +/// Engine that embeds content and scores it against tag prototypes using cosine similarity. +/// +/// # Example +/// +/// ```ignore +/// let tags = search_hub::tagging::default_tags(); +/// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40) +/// .expect("failed to init tagging engine"); +/// let matched = engine.tags_for("the rust programming language borrow checker", 3) +/// .expect("tagging failed"); +/// assert!(matched.contains(&"rust".to_string())); +/// ``` +pub struct TaggingEngine { + model: TextEmbedding, + tag_examples: Vec<(String, Vec<Vec<f32>>)>, + threshold: f32, +} + +impl TaggingEngine { + /// Create a new tagging engine from the given tag definitions. + /// + /// Downloads the ONNX embedding model on first run (cached afterwards). + /// + /// # Parameters + /// + /// * `tags` - Slice of `TagDef` entries (from config or `default_tags()`). + /// * `threshold` - Minimum cosine-similarity score (0.0 to 1.0) for a tag + /// to be assigned. Stored and used by `tags_for()`; can + /// be overridden per-call with `tags_for_with_threshold()`. + /// + /// # Returns + /// + /// A `TaggingEngine` ready to score content. + /// + /// # Errors + /// + /// Returns an error if the embedding model cannot be loaded or the + /// tag examples fail to embed. 
+ /// + /// # Example + /// + /// ```ignore + /// let tags = search_hub::tagging::default_tags(); + /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.60) + /// .expect("model init"); + /// ``` + pub fn new(tags: &[TagDef], threshold: f32) -> anyhow::Result<Self> { + let mut model = TextEmbedding::try_new( + TextInitOptions::new(EmbeddingModel::BGESmallENV15) + .with_show_download_progress(true), + )?; + + let mut all_examples: Vec<String> = Vec::new(); + let mut tag_indices: Vec<(usize, &str)> = Vec::new(); + + for (ti, tag) in tags.iter().enumerate() { + for example in &tag.examples { + tag_indices.push((ti, &tag.name)); + all_examples.push(format!("passage: {}", example)); + } + } + + let embeddings = model.embed(all_examples, None)?; + + let mut tag_examples: Vec<(String, Vec<Vec<f32>>)> = tags + .iter() + .map(|t| (t.name.clone(), Vec::new())) + .collect(); + + for ((ti, _name), emb) in tag_indices.iter().zip(embeddings.iter()) { + tag_examples[*ti].1.push(emb.clone()); + } + + Ok(Self { model, tag_examples, threshold }) + } + + fn truncate(content: &str, max_chars: usize) -> &str { + let end = content.char_indices() + .take(max_chars) + .last() + .map(|(i, c)| i + c.len_utf8()) + .unwrap_or(content.len()); + &content[..end.min(content.len())] + } + + fn score_content(&mut self, content: &str) -> anyhow::Result<Vec<(String, f32)>> { + let truncated = Self::truncate(content, 2000); + let emb = self.model.embed( + vec![format!("passage: {}", truncated)], + None, + )?; + if emb.is_empty() { + return Ok(Vec::new()); + } + let query_emb = &emb[0]; + + let mut scores: Vec<(usize, f32)> = self.tag_examples + .iter() + .enumerate() + .map(|(i, (_, examples))| { + let max_sim = examples + .iter() + .map(|proto| cosine_similarity(query_emb, proto)) + .fold(f32::NEG_INFINITY, f32::max); + (i, max_sim) + }) + .collect(); + + scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + + Ok(scores + .into_iter() + .map(|(i, 
score)| (self.tag_examples[i].0.clone(), score)) + .collect()) + } + + /// Score `content` against all tag prototypes and return tags above the + /// configured threshold. + /// + /// # Parameters + /// + /// * `content` - The text to tag (e.g. page body converted to Markdown). + /// * `max_tags` - Maximum number of tags to return. + /// + /// # Returns + /// + /// A `Vec<String>` of tag names matching the content, sorted by score + /// descending. + /// + /// # Errors + /// + /// Returns an error if the embedding model fails to process the content. + /// + /// # Example + /// + /// ```ignore + /// let tags = search_hub::tagging::default_tags(); + /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40) + /// .expect("model init"); + /// let matched = engine.tags_for("the rust programming language", 3) + /// .expect("tagging failed"); + /// println!("{:?}", matched); + /// ``` + pub fn tags_for(&mut self, content: &str, max_tags: usize) -> anyhow::Result<Vec<String>> { + Ok(self + .tags_for_with_threshold(content, max_tags, self.threshold)? + .into_iter() + .map(|(tag, _)| tag) + .collect()) + } + + /// Score `content` and return tag-score pairs above a custom threshold. + /// + /// # Parameters + /// + /// * `content` - The text to tag. + /// * `max_tags` - Maximum number of tags to return. + /// * `threshold` - Minimum cosine-similarity score (0.0 to 1.0). + /// + /// # Returns + /// + /// A `Vec<(String, f32)>` of (tag_name, score) matching the content, + /// sorted by score descending. + /// + /// # Errors + /// + /// Returns an error if the embedding model fails to process the content. 
+ /// + /// # Example + /// + /// ```ignore + /// let tags = search_hub::tagging::default_tags(); + /// let mut engine = search_hub::tagging::TaggingEngine::new(&tags, 0.40) + /// .expect("model init"); + /// let matched = engine.tags_for_with_threshold("rust programming", 5, 0.30) + /// .expect("tagging failed"); + /// for (tag, score) in &matched { + /// println!("{}: {:.3}", tag, score); + /// } + /// ``` + pub fn tags_for_with_threshold( + &mut self, + content: &str, + max_tags: usize, + threshold: f32, + ) -> anyhow::Result<Vec<(String, f32)>> { + let scored = self.score_content(content)?; + Ok(scored + .into_iter() + .filter(|(_, score)| *score >= threshold) + .take(max_tags) + .collect()) + } +} -
modified src/web.rs
diff --git a/src/web.rs b/src/web.rs index 8043194..4898c0e 100644 --- a/src/web.rs +++ b/src/web.rs @@ -1,9 +1,13 @@ +use crate::search_engines::{ResultEntry, SearchEngine}; use crate::storage; -use actix_web::{get, web, App, HttpResponse, HttpServer, Responder}; +use actix_web::{get, web, App, HttpRequest, HttpResponse, HttpServer, Responder}; use rusqlite::Connection; use std::sync::Mutex; +use std::time::Instant; use tera::Tera; -use tracing::error; +use tracing::{error, info}; + +const USER_AGENT: &str = concat!("search_hub/", env!("CARGO_PKG_VERSION")); pub struct DbPool(Mutex<Connection>); @@ -18,12 +22,19 @@ impl DbPool { } } -const VERSION: &str = env!("CARGO_PKG_VERSION"); +const VERSION: &str = concat!( + env!("CARGO_PKG_VERSION"), + " (", + env!("SEARCH_HUB_GIT_HASH"), + ")", +); #[get("/")] -async fn index(templates: web::Data<Tera>) -> impl Responder { +async fn index(templates: web::Data<Tera>, port: web::Data<Port>) -> impl Responder { + info!("serving index page"); let mut ctx = tera::Context::new(); ctx.insert("version", VERSION); + ctx.insert("port", &(**port).0); match templates.render("index.html", &ctx) { Ok(rendered) => HttpResponse::Ok().content_type("text/html").body(rendered), Err(e) => { @@ -33,23 +44,125 @@ async fn index(templates: web::Data<Tera>) -> impl Responder { } } +#[get("/opensearch.xml")] +async fn opensearch(templates: web::Data<Tera>, port: web::Data<Port>) -> impl Responder { + let mut ctx = tera::Context::new(); + ctx.insert("port", &(**port).0); + match templates.render("opensearch.xml", &ctx) { + Ok(xml) => HttpResponse::Ok().content_type("application/opensearchdescription+xml").body(xml), + Err(e) => { + error!("Template error: {}", e); + HttpResponse::InternalServerError().finish() + } + } +} + +struct Port(u16); + #[get("/search")] async fn search( + req: HttpRequest, query: web::Query<SearchQuery>, templates: web::Data<Tera>, db_pool: web::Data<DbPool>, + engines: web::Data<Vec<Box<dyn SearchEngine>>>, ) -> impl 
Responder { + let start = Instant::now(); let q = query.q.as_deref().unwrap_or(""); - let bookmarks = if !q.is_empty() { - storage::search_bookmarks(&db_pool.conn(), q).unwrap_or_default() + let page = query.page.unwrap_or(1).max(1); + let page_size: usize = 20; + let has_query = !q.is_empty(); + info!("search request: query=\"{}\" page={}", q, page); + + let total_results = if has_query { + storage::count_search_bookmarks(&db_pool.conn(), q).unwrap_or(0) + } else { + storage::count_bookmarks(&db_pool.conn()).unwrap_or(0) + }; + let total_pages = (total_results + page_size - 1) / page_size; + + let bookmarks = if has_query { + storage::search_bookmarks(&db_pool.conn(), q, page, page_size).unwrap_or_default() } else { - storage::list_bookmarks(&db_pool.conn()).unwrap_or_default() + storage::list_bookmarks(&db_pool.conn(), page, page_size).unwrap_or_default() }; + let user_agent = req + .headers() + .get("User-Agent") + .and_then(|v| v.to_str().ok()) + .unwrap_or(USER_AGENT); + + let mut external_results: Vec<ResultEntry> = Vec::new(); + let mut provider_count: usize = 0; + if has_query { + let client = reqwest::Client::builder() + .user_agent(user_agent) + .build() + .ok(); + if let Some(client) = client { + let mut handles = Vec::new(); + let engines = engines.clone(); + for i in 0..engines.len() { + let engine_name = engines[i].name().to_string(); + let q_owned = q.to_string(); + let client = client.clone(); + let engines = engines.clone(); + handles.push(tokio::spawn(async move { + let t0 = Instant::now(); + let result = engines[i].fetch_results(&q_owned, &client).await; + let elapsed = t0.elapsed(); + (engine_name, result, elapsed) + })); + } + for handle in handles { + if let Ok((name, result, elapsed)) = handle.await { + provider_count += 1; + match result { + Ok(mut results) => { + info!( + "external {} ({} results) [{:.2?}]", + name, + results.len(), + elapsed + ); + external_results.append(&mut results); + } + Err(e) => { + info!("external {} (error) 
[{:.2?}]: {}", name, elapsed, e); + } + } + } + } + } + } + + let page_elapsed = start.elapsed(); + let page_time_ms = format!("{:.1}", page_elapsed.as_secs_f64() * 1000.0); + info!( + "search completed: {} bookmark results, {} external providers [{:.2?}]", + bookmarks.len(), + provider_count, + page_elapsed + ); + + let mut external_engines: Vec<String> = external_results + .iter() + .map(|r| r.engine.clone()) + .collect(); + external_engines.sort(); + external_engines.dedup(); + let mut ctx = tera::Context::new(); ctx.insert("bookmarks", &bookmarks); ctx.insert("query", &q); + ctx.insert("page", &page); + ctx.insert("total_pages", &total_pages); + ctx.insert("total_results", &total_results); ctx.insert("version", VERSION); + ctx.insert("page_time_ms", &page_time_ms); + ctx.insert("external_results", &external_results); + ctx.insert("external_engines", &external_engines); match templates.render("index.html", &ctx) { Ok(rendered) => HttpResponse::Ok().content_type("text/html").body(rendered), @@ -63,6 +176,38 @@ async fn search( #[derive(serde::Deserialize)] pub struct SearchQuery { pub q: Option<String>, + pub page: Option<usize>, +} + +pub async fn run_server( + db_path: &str, + port: u16, + engines: Vec<Box<dyn SearchEngine>>, +) -> std::io::Result<()> { + let db_pool = web::Data::new(DbPool::new(db_path)); + let engines = web::Data::new(engines); + let port_data = web::Data::new(Port(port)); + let mut tera = Tera::default(); + tera.add_raw_template("index.html", include_str!("../templates/index.html")) + .expect("Failed to parse index template"); + tera.add_raw_template("opensearch.xml", include_str!("../templates/opensearch.xml")) + .expect("Failed to parse opensearch template"); + let tera = web::Data::new(tera); + + HttpServer::new(move || { + App::new() + .app_data(tera.clone()) + .app_data(db_pool.clone()) + .app_data(engines.clone()) + .app_data(port_data.clone()) + .service(index) + .service(search) + .service(opensearch) + }) + .workers(2) + 
.bind(("127.0.0.1", port))? + .run() + .await } #[cfg(test)] @@ -71,37 +216,44 @@ mod tests { use crate::models::Bookmark; use chrono::Utc; - #[test] - fn render_template_no_query() { + fn test_tera() -> Tera { let mut tera = Tera::default(); tera.add_raw_template("index.html", include_str!("../templates/index.html")) .expect("template parse"); + tera + } + + #[test] + fn render_template_no_query() { let mut ctx = tera::Context::new(); ctx.insert("version", &"0.0.0"); - let rendered = tera.render("index.html", &ctx).expect("render"); + let rendered = test_tera().render("index.html", &ctx).expect("render"); assert!(rendered.contains("Bookmark Search")); assert!(rendered.contains("enter a query")); } #[test] fn render_template_with_results() { - let mut tera = Tera::default(); - tera.add_raw_template("index.html", include_str!("../templates/index.html")) - .expect("template parse"); let mut ctx = tera::Context::new(); ctx.insert("query", &"rust"); + ctx.insert("page", &1usize); + ctx.insert("total_pages", &1usize); + ctx.insert("total_results", &1usize); ctx.insert("bookmarks", &vec![ Bookmark { id: 1, title: "Rust Lang".into(), url: "https://rust-lang.org".into(), description: Some("The Rust programming language".into()), + source: "bookmark".into(), content: None, + tags: None, created_at: Utc::now(), }, ]); ctx.insert("version", &"0.0.0"); - let rendered = tera.render("index.html", &ctx).expect("render"); + ctx.insert("external_results", &Vec::<ResultEntry>::new()); + let rendered = test_tera().render("index.html", &ctx).expect("render"); assert!(rendered.contains("Rust Lang")); assert!(rendered.contains("rust-lang.org")); assert!(rendered.contains("1 result")); @@ -109,35 +261,15 @@ mod tests { #[test] fn render_template_no_results() { - let mut tera = Tera::default(); - tera.add_raw_template("index.html", include_str!("../templates/index.html")) - .expect("template parse"); let mut ctx = tera::Context::new(); ctx.insert("query", &"zzznotfound"); + 
ctx.insert("page", &1usize); + ctx.insert("total_pages", &0usize); + ctx.insert("total_results", &0usize); ctx.insert("bookmarks", &Vec::<Bookmark>::new()); ctx.insert("version", &"0.0.0"); - let rendered = tera.render("index.html", &ctx).expect("render"); - assert!(rendered.contains("0 results")); + ctx.insert("external_results", &Vec::<ResultEntry>::new()); + let rendered = test_tera().render("index.html", &ctx).expect("render"); assert!(rendered.contains("no bookmarks found")); } } - -pub async fn run_server(db_path: &str, port: u16) -> std::io::Result<()> { - let db_pool = web::Data::new(DbPool::new(db_path)); - let mut tera = Tera::default(); - tera.add_raw_template("index.html", include_str!("../templates/index.html")) - .expect("Failed to parse embedded template"); - let tera = web::Data::new(tera); - - HttpServer::new(move || { - App::new() - .app_data(tera.clone()) - .app_data(db_pool.clone()) - .service(index) - .service(search) - }) - .workers(2) - .bind(("127.0.0.1", port))? - .run() - .await -} -
modified templates/index.html
diff --git a/templates/index.html b/templates/index.html index fda9dc4..da100ad 100644 --- a/templates/index.html +++ b/templates/index.html @@ -4,6 +4,7 @@ <meta charset="utf-8"> <meta name="viewport" content="width=device-width, initial-scale=1"> <title>Bookmark Search</title> + <link rel="search" type="application/opensearchdescription+xml" title="search_hub" href="/opensearch.xml"> <style> :root { --bg: #f5f0e8; @@ -97,7 +98,7 @@ font-family: "Courier New", Courier, monospace; background: var(--bg); color: var(--text); - max-width: 700px; + max-width: 960px; margin: 0 auto; padding: 1.5rem; transition: background 0.2s, color 0.2s; @@ -199,12 +200,149 @@ font-size: 0.85rem; color: var(--desc); } + .bookmark-tags { + margin-top: 0.3rem; + font-size: 0.7rem; + text-transform: uppercase; + letter-spacing: 0.08em; + color: var(--accent); + } + .external-engine { + margin-top: 0.25rem; + font-size: 0.7rem; + text-transform: uppercase; + letter-spacing: 0.08em; + color: var(--url); + } + .engine-filters { + display: flex; + flex-wrap: wrap; + gap: 0.35rem; + margin-bottom: 0.75rem; + } + .engine-badge { + font-family: inherit; + font-size: 0.6rem; + text-transform: uppercase; + letter-spacing: 0.08em; + padding: 0.2rem 0.45rem; + border: 1px solid var(--border); + background: transparent; + color: var(--meta); + cursor: pointer; + transition: background 0.15s, color 0.15s, border-color 0.15s; + } + .engine-badge.active { + background: var(--accent); + border-color: var(--accent); + color: var(--btn-text); + } + .engine-badge:hover { + border-color: var(--accent); + color: var(--text); + } + .bookmark.selected { + border-left: 3px solid var(--accent); + padding-left: 0.75rem; + } + .help-overlay { + display: none; + position: fixed; + top: 0; left: 0; right: 0; bottom: 0; + background: rgba(0,0,0,0.35); + z-index: 100; + justify-content: center; + align-items: center; + } + .help-overlay.show { + display: flex; + } + .help-box { + background: var(--bg); + border: 2px 
solid var(--border-strong); + padding: 1.5rem; + max-width: 360px; + width: 100%; + font-size: 0.8rem; + } + .help-box h2 { + font-size: 0.7rem; + text-transform: uppercase; + letter-spacing: 0.15em; + margin-bottom: 1rem; + color: var(--meta); + } + .help-row { + display: flex; + justify-content: space-between; + align-items: center; + gap: 2rem; + margin-bottom: 0.5rem; + } + .help-row:last-child { + margin-bottom: 0; + } + .help-key { + border: 1px solid var(--border); + padding: 0.05rem 0.35rem; + font-family: inherit; + font-size: 0.7rem; + color: var(--text); + } + .help-desc { + color: var(--meta); + font-size: 0.75rem; + } + .results-grid { + display: flex; + gap: 1.5rem; + align-items: flex-start; + } + .results-column { + flex: 1; + min-width: 0; + } + .column-header { + font-size: 0.75rem; + text-transform: uppercase; + letter-spacing: 0.15em; + color: var(--meta); + padding-bottom: 0.4rem; + margin-bottom: 0.75rem; + border-bottom: 2px solid var(--border-strong); + } + @media (max-width: 700px) { + .results-grid { + flex-direction: column; + } + } .empty { text-align: center; padding: 2rem 1rem; border: 2px dashed var(--empty-border); font-size: 0.85rem; } + .pagination { + display: flex; + gap: 0.75rem; + justify-content: center; + align-items: center; + margin-top: 1rem; + font-size: 0.8rem; + text-transform: uppercase; + letter-spacing: 0.08em; + } + .pagination a { + color: var(--link); + text-decoration: none; + border: 1px solid var(--border); + padding: 0.2rem 0.6rem; + transition: background 0.1s, color 0.1s; + } + .pagination a:hover { + background: var(--link-hover-bg); + color: var(--link-hover-text); + } .footer { margin-top: 2rem; padding-top: 0.5rem; @@ -214,43 +352,118 @@ letter-spacing: 0.1em; color: var(--footer); } + .help-hint { + background: none; + border: 1px solid var(--border); + font-family: inherit; + font-size: 0.7rem; + text-transform: uppercase; + letter-spacing: 0.1em; + padding: 0.2rem 0.5rem; + cursor: pointer; + color: 
var(--meta); + background: var(--input-bg); + transition: color 0.15s, border-color 0.15s; + } + .help-hint:hover { + border-color: var(--accent); + color: var(--accent); + } </style> </head> <body> <nav> - <span>search_hub // bookmarks</span> - <button class="theme-toggle" id="theme-toggle">[theme]</button> + <span>search_hub // personal web index</span> + <div style="display:flex;gap:0.5rem"> + <button class="help-hint" id="help-hint">[?]</button> + <button class="theme-toggle" id="theme-toggle">[theme]</button> + </div> </nav> <form class="search-form" action="/search" method="get"> - <input type="text" name="q" value="{{ query | default(value="") }}" placeholder="search bookmarks..." autofocus> + <input type="text" name="q" value="{{ query | default(value="") }}" placeholder="search bookmarks..." autofocus accesskey="s"> <button type="submit">search</button> </form> {% set q = query | default(value="") %} {% if q %} - {% set count = bookmarks | length %} - <div class="meta">{{ count }} result{% if count != 1 %}s{% endif %} for "{{ q }}"</div> - {% if bookmarks %} - <ul class="results"> - {% for bm in bookmarks %} - <li class="bookmark"> - <div class="bookmark-title"><a href="{{ bm.url }}">{{ bm.title }}</a></div> - <div class="bookmark-url">{{ bm.url }}</div> - {% if bm.description %} - <div class="bookmark-desc">{{ bm.description }}</div> - {% endif %} - </li> - {% endfor %} - </ul> - {% else %} - <div class="empty">no bookmarks found</div> + <div class="results-grid"> + <div class="results-column"> + <div class="column-header">local</div> + {% if total_results > 0 %} + <div class="meta"> + {% if total_pages > 1 %}Page {{ page }} of {{ total_pages }} - {% endif %} + {{ total_results }} result{% if total_results != 1 %}s{% endif %} for "{{ q }}" + </div> + <ul class="results"> + {% for bm in bookmarks %} + <li class="bookmark"> + <div class="bookmark-title"><a href="{{ bm.url }}">{{ bm.title }}</a></div> + <div class="bookmark-url">{{ bm.url }}</div> + {% if 
bm.description %} + <div class="bookmark-desc">{{ bm.description }}</div> + {% endif %} + {% if bm.tags %} + <div class="bookmark-tags">{{ bm.tags }}</div> + {% endif %} + </li> + {% endfor %} + </ul> + {% if total_pages > 1 %} + <div class="pagination"> + {% if page > 1 %}<a href="?q={{ q }}&page={{ page - 1 }}">[prev]</a>{% endif %} + <span>{{ page }} / {{ total_pages }}</span> + {% if page < total_pages %}<a href="?q={{ q }}&page={{ page + 1 }}">[next]</a>{% endif %} + </div> + {% endif %} + {% else %} + <div class="empty">no bookmarks found</div> + {% endif %} + </div> + {% if external_results %} + <div class="results-column"> + <div class="column-header">external</div> + {% if external_engines %} + <div class="engine-filters"> + {% for name in external_engines %} + <button class="engine-badge active" data-engine="{{ name }}">{{ name }}</button> + {% endfor %} + </div> + {% endif %} + <ul class="results"> + {% for r in external_results %} + <li class="bookmark" data-engine="{{ r.engine }}"> + <div class="bookmark-title"><a href="{{ r.url }}">{{ r.title }}</a></div> + <div class="bookmark-url">{{ r.url }}</div> + {% if r.description %} + <div class="bookmark-desc">{{ r.description }}</div> + {% endif %} + <div class="external-engine">{{ r.engine }}</div> + </li> + {% endfor %} + </ul> + </div> {% endif %} + </div> {% else %} <div class="empty" style="margin-top:2rem">enter a query to search your bookmarks</div> {% endif %} - <div class="footer">search_hub v{{ version }}</div> + <div class="footer"> + <span>search_hub v{{ version }}{% if page_time_ms %} / {{ page_time_ms }}ms{% endif %}</span> + </div> + + <div class="help-overlay" id="help-overlay"> + <div class="help-box"> + <h2>keyboard shortcuts</h2> + <div class="help-row"><span class="help-key">/</span><span class="help-desc">focus search</span></div> + <div class="help-row"><span class="help-key">j / ↓</span><span class="help-desc">next result</span></div> + <div class="help-row"><span class="help-key">k / 
↑</span><span class="help-desc">previous result</span></div> + <div class="help-row"><span class="help-key">o</span><span class="help-desc">open result</span></div> + <div class="help-row"><span class="help-key">?</span><span class="help-desc">toggle this help</span></div> + <div class="help-row"><span class="help-key">esc</span><span class="help-desc">close / blur</span></div> + </div> + </div> <script> (function() { @@ -273,6 +486,104 @@ btn.textContent = '[light]'; } }; + + var selectedIndex = -1; + var resultLinks = []; + + function collectResults() { + resultLinks = []; + var items = document.querySelectorAll('.results .bookmark'); + for (var i = 0; i < items.length; i++) { + if (items[i].style.display !== 'none') { + var link = items[i].querySelector('.bookmark-title a'); + if (link) resultLinks.push({ el: items[i], link: link }); + } + } + if (selectedIndex >= resultLinks.length) selectedIndex = -1; + if (selectedIndex >= 0) resultLinks[selectedIndex].el.classList.add('selected'); + } + + function selectResult(idx) { + if (resultLinks.length === 0) return; + if (selectedIndex >= 0 && selectedIndex < resultLinks.length) { + resultLinks[selectedIndex].el.classList.remove('selected'); + } + selectedIndex = idx; + if (idx >= 0 && idx < resultLinks.length) { + resultLinks[idx].el.classList.add('selected'); + resultLinks[idx].link.focus({ preventScroll: true }); + resultLinks[idx].el.scrollIntoView({ block: 'nearest' }); + } + } + + function selectNext() { + if (resultLinks.length === 0) return; + var next = selectedIndex < resultLinks.length - 1 ? selectedIndex + 1 : 0; + selectResult(next); + } + + function selectPrev() { + if (resultLinks.length === 0) return; + var prev = selectedIndex > 0 ? 
selectedIndex - 1 : resultLinks.length - 1; + selectResult(prev); + } + + function openSelected() { + if (selectedIndex >= 0 && selectedIndex < resultLinks.length) { + resultLinks[selectedIndex].link.click(); + } + } + + function toggleHelp() { + document.getElementById('help-overlay').classList.toggle('show'); + } + + function closeHelp() { + document.getElementById('help-overlay').classList.remove('show'); + } + + document.addEventListener('keydown', function(e) { + var tag = e.target.tagName; + if (tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT') { + if (e.key === 'Escape') { + e.target.blur(); + } + return; + } + switch (e.key) { + case '/': e.preventDefault(); document.querySelector('input[name="q"]').focus(); break; + case 'j': case 'ArrowDown': e.preventDefault(); selectNext(); break; + case 'k': case 'ArrowUp': e.preventDefault(); selectPrev(); break; + case 'o': case 'Enter': openSelected(); break; + case '?': e.preventDefault(); toggleHelp(); break; + case 'Escape': closeHelp(); break; + } + }); + + document.getElementById('help-overlay').onclick = function(e) { + if (e.target === this) closeHelp(); + }; + document.getElementById('help-hint').onclick = toggleHelp; + + var badges = document.querySelectorAll('.engine-badge'); + for (var i = 0; i < badges.length; i++) { + (function(badge) { + var engine = badge.getAttribute('data-engine'); + badge.onclick = function() { + this.classList.toggle('active'); + var items = document.querySelectorAll( + '[data-engine="' + engine + '"]' + ); + var show = this.classList.contains('active'); + for (var j = 0; j < items.length; j++) { + items[j].style.display = show ? '' : 'none'; + } + collectResults(); + }; + })(badges[i]); + } + + collectResults(); })(); </script> </body> -
added templates/opensearch.xml
diff --git a/templates/opensearch.xml b/templates/opensearch.xml new file mode 100644 index 0000000..5932a64 --- /dev/null +++ b/templates/opensearch.xml @@ -0,0 +1,7 @@ +<?xml version="1.0" encoding="UTF-8"?> +<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"> + <ShortName>search_hub</ShortName> + <Description>Local bookmark and history search</Description> + <Url type="text/html" template="http://localhost:{{port}}/search?q={searchTerms}"/> + <InputEncoding>UTF-8</InputEncoding> +</OpenSearchDescription> -
added tests/convert_strips_html.rs
diff --git a/tests/convert_strips_html.rs b/tests/convert_strips_html.rs new file mode 100644 index 0000000..0ca47f3 --- /dev/null +++ b/tests/convert_strips_html.rs @@ -0,0 +1,77 @@ +use std::sync::OnceLock; + +static RT: OnceLock<tokio::runtime::Runtime> = OnceLock::new(); + +fn rt() -> &'static tokio::runtime::Runtime { + RT.get_or_init(|| tokio::runtime::Runtime::new().unwrap()) +} + +fn fetch(url: &str) -> String { + rt().block_on(async { + let client = reqwest::Client::builder() + .user_agent("search_hub_test") + .build() + .unwrap(); + let resp = client.get(url).send().await.unwrap(); + resp.text().await.unwrap() + }) +} + +fn print_md(name: &str, html: &str, md: &str) { + println!(); + println!("=== {} ===", name); + println!("Raw HTML : {} bytes", html.len()); + println!("Markdown : {} bytes", md.len()); + println!("Ratio : {:.1}x smaller", html.len() as f64 / md.len().max(1) as f64); + println!(); + println!("--- Markdown output ---"); + println!("{}", md); + println!("--- end ---"); +} + +#[test] +fn strips_html_tags_and_preserves_text_example() { + let html = fetch("https://example.com"); + assert!(html.contains("<h1>"), "expected HTML to contain tags before conversion"); + + let md = htmd::convert(&html).expect("conversion should succeed"); + print_md("example.com", &html, &md); + + assert!(!md.contains("<h1>"), "no HTML heading tags"); + assert!(!md.contains("<a "), "no HTML anchor tags"); + assert!(!md.contains("<div"), "no HTML div tags"); + assert!(!md.contains("</"), "no closing HTML tags"); + + assert!(md.contains("Example Domain"), "visible heading text preserved"); + assert_eq!(md.lines().filter(|l| l.starts_with('#')).count(), 1, "exactly one H1 in Markdown"); +} + +#[test] +fn strips_html_tags_and_preserves_text_rustlang() { + let html = fetch("https://www.rust-lang.org"); + assert!(html.contains("<html") || html.contains("<!DOCTYPE"), "expected valid HTML"); + + let md = htmd::convert(&html).expect("conversion should succeed"); + 
print_md("rust-lang.org", &html, &md); + + assert!(!md.contains("<script"), "no script tags in output"); + assert!(!md.contains("<style"), "no style tags in output"); + assert!(!md.contains("class=\""), "no HTML attribute syntax in output"); + assert!(!md.contains("id=\""), "no HTML id attributes in output"); + + assert!(md.contains("Rust"), "page title preserved in Markdown"); + assert!(md.lines().any(|l| l.starts_with("# Rust")), "heading preserved as Markdown H1"); + assert!(md.len() < html.len(), "Markdown smaller than raw HTML ({} vs {})", md.len(), html.len()); +} + +#[test] +fn markdown_output_is_readable() { + let html = fetch("https://example.com"); + let md = htmd::convert(&html).expect("conversion should succeed"); + + let lines: Vec<&str> = md.lines().filter(|l| !l.trim().is_empty()).collect(); + assert!(lines.len() >= 3, "at least 3 non-empty lines of content"); + + let words: Vec<&str> = md.split_whitespace().collect(); + assert!(words.len() >= 20, "at least 20 readable words in output"); +} -
added tests/tagging_thresholds.rs
diff --git a/tests/tagging_thresholds.rs b/tests/tagging_thresholds.rs new file mode 100644 index 0000000..9fd93a8 --- /dev/null +++ b/tests/tagging_thresholds.rs @@ -0,0 +1,427 @@ +use search_hub::tagging::{default_tags, TaggingEngine}; + +struct Sample { + name: &'static str, + text: &'static str, +} + +const SAMPLES: &[Sample] = &[ + Sample { + name: "Rust backend API", + text: r#" +Building a Modern Web API with Rust + +In this tutorial we will build a RESTful API using the Actix-web framework +in Rust. The API will expose CRUD endpoints for managing a book collection +stored in a PostgreSQL database. We will use SQLx for async database access +with connection pooling, and serde for JSON serialization and deserialization +of our request and response types. + +To get started, create a new cargo project and add the required dependencies +to your Cargo.toml: actix-web, serde, serde_json, sqlx, tokio, and uuid. +Enable the runtime-tokio feature for sqlx so we can use async database +operations throughout the application. + +First define our data model with a Book struct containing id, title, author, +isbn, and published_year fields. Derive Serialize and Deserialize from serde +so actix-web can automatically convert between JSON and our Rust types. +Then create database migration files that define the books table schema with +appropriate indexes on the isbn and author columns. + +Next implement the HTTP handlers. The list handler queries all books and +returns them as a JSON array. The create handler validates the incoming JSON +body, inserts a new row, and returns the created book with a 201 status code. +The get, update, and delete handlers follow the same pattern using the book +id extracted from the URL path parameters. + +For error handling we define an ApiError enum that maps to appropriate HTTP +status codes. Use actix-web's ResponseError trait to automatically convert +our error types into JSON error responses. 
This keeps the handler code clean +and focused on business logic rather than HTTP plumbing. + +Add middleware for logging, CORS support, and request validation. Configure +the server to bind on 0.0.0.0:8080 with 4 worker threads. Finally write +integration tests using actix_web::test to verify each endpoint works +correctly with both valid and invalid inputs. + +Deploy the application using Docker with a multi-stage build for minimal +image size. Use docker-compose to run the API server alongside a PostgreSQL +container, with environment variables for configuration. Add health check +endpoints and structured logging for production monitoring. +"#, + }, + Sample { + name: "Python data science", + text: r#" +Exploratory Data Analysis with Python and Pandas + +Data analysis begins with loading your dataset into a pandas DataFrame. +Use the read_csv function to import CSV files and inspect the first few +rows with the head method. Check data types with dtypes and get summary +statistics using the describe method on numerical columns. + +Data cleaning is a critical step before any modeling. Handle missing values +by either dropping rows with dropna or filling them with fillna using the +mean or median of the column. Remove duplicate rows with drop_duplicates +and convert data types as needed using the astype method. + +For data visualization, matplotlib and seaborn are the standard libraries +in the Python ecosystem. Create scatter plots with plt.scatter to explore +relationships between numeric variables, histograms with plt.hist to +understand distributions, and box plots with seaborn.boxplot to detect +outliers in your data. Customize your plots with titles, axis labels, +and color palettes for publication-quality figures. + +Feature engineering transforms raw data into inputs suitable for machine +learning models. 
Create new columns from existing ones, encode categorical +variables using one-hot encoding with pandas.get_dummies, and scale numeric +features using scikit-learn's StandardScaler. Split your data into training +and test sets with train_test_split to evaluate model performance. + +Build a regression model using scikit-learn's LinearRegression or a +classification model using RandomForestClassifier. Fit the model on the +training data with the fit method, make predictions with predict, and +evaluate accuracy using metrics like mean_squared_error for regression +or accuracy_score for classification tasks. + +Use Jupyter notebooks for interactive development with inline plotting +and markdown annotations. Document your analysis steps clearly so others +can reproduce your results. Save your cleaned datasets with to_csv for +future use and export your models with joblib or pickle for deployment. +"#, + }, + Sample { + name: "Frontend web design", + text: r#" +Responsive Web Design with Modern CSS + +Building a responsive website starts with a solid CSS foundation using +Flexbox and CSS Grid for layout. Define a container with display: flex +to create horizontal or vertical layouts that adapt to screen size. Use +justify-content and align-items to position elements within the flex +container, and the flex-wrap property to allow items to flow onto +multiple lines on smaller screens. + +CSS Grid provides two-dimensional layout control with grid-template-columns +and grid-template-rows. Define named grid areas with grid-template-areas +and place items using the grid-area property. This makes it easy to create +complex page layouts that reflow naturally from desktop to tablet to mobile. + +Typography is the foundation of good design. Set a harmonious type scale +using clamp for fluid typography that scales between minimum and maximum +values. Use custom properties (CSS variables) to maintain consistency +across your design system. 
Define --color-primary, --font-heading, and +--spacing-unit variables that can be changed globally. + +Accessibility is not optional. Use semantic HTML elements like header, +nav, main, section, and footer. Add aria labels to interactive elements +and ensure color contrast ratios meet WCAG AA standards. Test your site +with keyboard navigation and screen readers to verify all functionality +is accessible to users with disabilities. + +Animations enhance user experience when used thoughtfully. Use CSS +transitions for hover effects on buttons and links, and keyframe +animations for loading states and page transitions. The prefers-reduced- +motion media query respects users who prefer less animation. + +Mobile-first design means starting with the smallest screen and adding +complexity with min-width media queries. This approach ensures your site +works well on all devices and loads efficiently on mobile connections. +Test regularly using browser dev tools in responsive design mode. +"#, + }, + Sample { + name: "Linux devops", + text: r#" +Linux Server Administration and Automation + +Managing Linux servers efficiently requires mastery of the command line +and automation tools. Start with the basics of process management using +ps, top, and htop to monitor running processes. Use kill and killall +to terminate unresponsive processes and systemctl to manage systemd +services. Check resource usage with free for memory, df for disk space, +and netstat or ss for network connections. + +Shell scripting is essential for automation. Write bash scripts using +variables, loops, conditionals, and functions. Use find with exec to +batch-process files, grep with regex for pattern matching in logs, and +awk or sed for text processing. Schedule recurring tasks with cron +and systemd timers for more complex scheduling needs. + +Containerization with Docker simplifies application deployment. 
Write +Dockerfiles that specify the base image, install dependencies, copy +application code, and define the startup command. Use docker-compose +to orchestrate multi-container applications with linked services, +networks, and persistent volumes. Tag and push images to a registry +for deployment across environments. + +Kubernetes orchestrates containers at scale. Define deployments with +replica counts, services for networking, and configmaps for environment +configuration. Use kubectl to inspect pods, view logs, and scale +applications horizontally. Implement health checks with liveness and +readiness probes to ensure your applications are running correctly. + +Configuration management with Ansible keeps your infrastructure +consistent. Write playbooks in YAML that define the desired state of +your servers. Use roles to organize tasks, handlers, and variables +into reusable components. Run ad-hoc commands with ansible to quickly +check server status across your entire infrastructure. + +Monitor your infrastructure with Prometheus for metrics collection and +Grafana for dashboards. Set up alerts for critical conditions like high +CPU usage, disk space running low, or services going offline. Centralize +logs using the ELK stack or Loki for troubleshooting and analysis. +"#, + }, + Sample { + name: "AI machine learning", + text: r#" +Training Deep Learning Models with PyTorch + +Deep learning has transformed how we approach complex pattern recognition +tasks. PyTorch provides a flexible framework for building and training +neural networks using tensor computations with automatic differentiation. +Define a model by subclassing nn.Module and implementing the forward +method that specifies how input data flows through the network layers. + +Data preparation is crucial for model performance. Use the DataLoader +class to efficiently batch and shuffle your dataset during training. 
+Apply data augmentation techniques like random cropping, flipping, and +color jitter to reduce overfitting and improve generalization. Normalize +input tensors to have zero mean and unit variance for stable training. + +The training loop iterates over epochs, processing batches of data +through the model, computing the loss with a criterion like cross-entropy +for classification or mean squared error for regression, and calling +backward to compute gradients. Use an optimizer like Adam or SGD with +learning rate scheduling to minimize the loss function over time. + +Convolutional neural networks excel at image recognition tasks. Stack +Conv2d layers with increasing channel depth, interleaved with ReLU +activations and max-pooling layers to reduce spatial dimensions. +Add batch normalization to stabilize training and dropout layers to +prevent overfitting. End with fully connected layers for classification. + +Transformer architectures dominate natural language processing. The +self-attention mechanism allows the model to weigh the importance of +different positions in the input sequence. Multi-head attention runs +multiple attention operations in parallel, capturing different types +of relationships between tokens. Positional encodings provide sequence +order information to the model. + +Transfer learning leverages pretrained models for new tasks. Load a +model pretrained on ImageNet, freeze the early layers, and replace the +final classification head with new layers for your specific dataset. +Fine-tune the model with a lower learning rate to adapt the pretrained +features to your domain while preserving the general visual knowledge. +"#, + }, + Sample { + name: "Mobile development", + text: r#" +Building Cross-Platform Mobile Apps with Flutter + +Flutter enables building native-quality mobile applications for both +iOS and Android from a single Dart codebase. 
The framework uses a +widget-based architecture where everything from a simple text label +to complex layouts is a widget. Compose widgets together using +child and children properties to build your user interface hierarchy. + +State management is a key concern in mobile app development. Use +setState for simple local state, or adopt Provider, Riverpod, or +Bloc for more complex application state that needs to be shared +across multiple screens. Keep your business logic separate from +your UI code by using ViewModels or Controllers that manage state +and expose it to widgets via streams or change notifiers. + +Navigation and routing handle moving between screens in your app. +Use the Navigator widget with named routes for simple apps, or +implement a router with GoRouter for more complex navigation +patterns including deep linking and nested navigation. Pass data +between screens using constructor arguments or route parameters. + +Platform-specific features require accessing native APIs through +platform channels. Implement features like camera access, location +services, biometric authentication, and push notifications by +writing platform-specific code in Kotlin or Swift and invoking it +from Dart through MethodChannel calls. Use community packages from +pub.dev for common native features. + +Performance optimization is critical for a smooth user experience. +Profile your app using the Flutter DevTools to identify widget +rebuilds and jank. Use const constructors where possible to reduce +rebuilds, implement lazy loading for lists with ListView.builder, +and cache images using cached_network_image. Reduce app size by +removing unused resources and using code shrinking. + +Testing mobile apps requires multiple approaches. Write unit tests +for your business logic and data models. Use widget tests to verify +individual widget behavior and integration tests for full user flows. 
+Run tests on both iOS and Android simulators to catch platform- +specific issues before releasing to app stores. +"#, + }, + Sample { + name: "Gaming graphics", + text: r#" +Real-Time 3D Rendering with Vulkan and GLSL + +Modern game engines leverage GPU compute capabilities to render +complex 3D scenes at interactive frame rates. The Vulkan API +provides low-level access to graphics hardware with explicit +control over memory management and command buffers. Set up a +Vulkan instance, select a physical device, create a logical +device, and configure graphics and present queues for rendering. + +The rendering pipeline transforms 3D geometry into 2D images. +Vertex shaders process individual vertices, applying model-view- +projection matrix transformations to place objects in clip space. +Fragment shaders determine the color of each pixel using lighting +calculations, texture sampling, and material properties defined +in GLSL shading language source code. + +A game loop runs at 60 frames per second, processing input events, +updating game state, and rendering each frame. Use delta time to +ensure consistent movement speeds regardless of frame rate. Implement +fixed time step for physics simulations to maintain stability. +Separate the update and render phases for better parallelism. + +Physics simulation brings game worlds to life. Use a physics engine +like PhysX or Bullet for rigid body dynamics including collision +detection, joint constraints, and force-based movement. Implement +broad phase and narrow phase collision detection to efficiently +find colliding pairs among thousands of objects in the scene. + +Spatial data structures accelerate rendering by culling objects +outside the camera view frustum. Use bounding volume hierarchies, +octrees, or binary space partitioning trees to organize scene +geometry. Implement occlusion culling to skip rendering objects +hidden behind other geometry, saving GPU processing time. 
+ +Post-processing effects enhance visual quality after the main +render pass. Apply bloom for glowing highlights, ambient occlusion +for realistic shadowing in corners, and tone mapping to convert +HDR values to displayable colors. Use compute shaders for GPU- +based particle systems and screen-space reflections. +"#, + }, + Sample { + name: "Audio production", + text: r#" +Digital Audio Production and Music Streaming + +Digital audio workstations have revolutionized music production by +providing powerful tools for recording, editing, and mixing audio. +Record multiple tracks simultaneously through audio interfaces with +low-latency monitoring. Edit waveform regions with cut, copy, paste, +and crossfade operations to arrange your recordings into a coherent +composition. + +Audio effects processing shapes the character of your sound. Use +equalizers to boost or cut specific frequency ranges, compressors +to control dynamic range by reducing loud peaks, and reverbs to +simulate acoustic spaces from small rooms to large concert halls. +Delay and chorus effects add depth and width to your mixes. + +Podcast production follows a different workflow focused on spoken +word clarity. Record with quality microphones in acoustically treated +spaces to minimize background noise and room reflections. Use noise +gates to silence pauses between speech, de-essers to reduce sibilance, +and compressors to smooth out volume variations across the episode. + +Music streaming platforms deliver audio content to millions of +listeners worldwide. Encode audio files using codecs like AAC or +Opus that balance sound quality with bandwidth efficiency. Generate +album artwork, metadata tags, and playlist descriptions to help +listeners discover your content through search and recommendations. + +Radio broadcasting combines live and pre-recorded content with +scheduling automation. Use playout software to manage playlists, +cues, and commercial breaks. 
Broadcast audio over internet radio +using Icecast or Shoutcast servers with streaming protocols like +HLS for adaptive bitrate delivery to listeners on various devices. + +Live sound reinforcement requires understanding of acoustics and +signal flow. Set up a mixing console with auxiliary sends for +stage monitors and effects returns. Use graphic equalizers to tune +the room response and feedback suppressors to prevent howling. +Balance the front-of-house mix so every instrument and voice is +clear and present in the audience area. +"#, + }, + Sample { + name: "Social community", + text: r#" +Building Online Communities and Social Platforms + +Social media platforms connect people around shared interests and +experiences. Designing a social platform requires careful consideration +of user profiles, content feeds, and interaction mechanisms. Users +create profiles with biographical information, profile pictures, and +privacy settings that control who can see their content and activity. + +Content feeds are the central feature of any social platform. Implement +algorithms that surface relevant posts based on recency, engagement +metrics, and user preferences. Support multiple content types including +text posts, image sharing, video uploads, and link previews with rich +metadata fetched from shared URLs. + +Real-time messaging enables direct communication between users. Build +chat systems using WebSocket connections for instant message delivery +with typing indicators, read receipts, and push notifications. Organize +conversations into private direct messages and group chats with support +for media attachments, emoji reactions, and message threading. + +Community management tools help moderators maintain healthy discussions. +Provide reporting mechanisms for inappropriate content, automated spam +detection using machine learning classifiers, and moderation queues +where flagged content is reviewed before being shown to the broader +community. 
Implement warning systems and temporary or permanent bans. + +Social features like likes, shares, comments, and follows create +engagement loops that keep users returning to the platform. Notify +users when someone interacts with their content through in-app +notifications and email digests. Show trending topics and popular +content in discovery sections to help users find new communities. + +Content moderation at scale requires both automated and human review. +Train natural language models to detect hate speech, harassment, and +misinformation. Establish clear community guidelines that define +acceptable behavior and content standards. Provide appeals processes +so users can challenge moderation decisions they disagree with. +"#, + }, +]; + +#[test] +fn explore_tagging_thresholds() { + let tags = default_tags(); + let mut engine = TaggingEngine::new(&tags, 0.40).expect("failed to init tagging engine"); + + let thresholds = [ + 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, + ]; + + for sample in SAMPLES { + println!(); + println!("=== {} ===", sample.name); + println!("Text length: {} chars", sample.text.len()); + println!(); + + for &threshold in &thresholds { + let matched = engine + .tags_for_with_threshold(sample.text, 5, threshold) + .expect("tagging failed"); + + if matched.is_empty() { + println!(" {:.2}: (none)", threshold); + } else { + let tags_repr: Vec<String> = matched + .iter() + .map(|(tag, score)| format!("{} ({:.3})", tag, score)) + .collect(); + println!(" {:.2}: {}", threshold, tags_repr.join(", ")); + } + } + } +}