From 22ac4c9ed606b32d1464851de9785c6700caa6e5 Mon Sep 17 00:00:00 2001 From: Maxime Van Hees Date: Tue, 23 Sep 2025 17:15:40 +0200 Subject: [PATCH] implementation of tantivy datastore + updated RPC calls to deal with tantivy + docs --- Cargo.lock | 741 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + docs/tantivy.md | 253 ++++++++++++++ src/admin_meta.rs | 8 + src/cmd.rs | 293 ++++++++++++++++- src/lib.rs | 2 + src/options.rs | 1 + src/rpc.rs | 182 ++++++++++- src/search_cmd.rs | 352 ++++++++++++++++++++ src/server.rs | 18 +- src/tantivy_search.rs | 667 +++++++++++++++++++++++++++++++++++++ 11 files changed, 2508 insertions(+), 10 deletions(-) create mode 100644 docs/tantivy.md create mode 100644 src/search_cmd.rs create mode 100644 src/tantivy_search.rs diff --git a/Cargo.lock b/Cargo.lock index 2593623..18e9d63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -70,6 +70,21 @@ dependencies = [ "sha2", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "anstream" version = "0.6.20" @@ -224,6 +239,15 @@ version = "2.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" +[[package]] +name = "bitpacking" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c1d3e2bfd8d06048a179f7b17afc3188effa10385e7b00dc65af6aae732ea92" +dependencies = [ + "crunchy", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -233,6 +257,37 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2529c31017402be841eb45892278a6c21a000c0a17643af326c73a73f83f0fb" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.106", +] + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + [[package]] name = "byteorder" version = "1.5.0" @@ -252,9 +307,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5252b3d2648e5eedbc1a6f501e3c795e07025c1e93bbf8bbdd6eef7f447a6d54" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + [[package]] name = "cesu8" version = "1.1.0" @@ -411,6 +474,25 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + [[package]] name = "crossbeam-epoch" version = "0.9.18" @@ -426,6 +508,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.6" @@ -464,6 +552,41 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.11.1", + "syn 2.0.106", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.106", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -487,6 +610,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "deranged" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d630bccd429a5bb5a64b5e94f693bfc48c9f8566418fda4c494cc94f911f87cc" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "digest" version = "0.10.7" @@ -509,6 +642,12 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "downcast-rs" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" + [[package]] name = "ed25519" version = "2.2.3" @@ -533,12 +672,40 @@ dependencies = [ "zeroize", ] +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "equivalent" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.0", +] + +[[package]] +name = "fastdivide" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "fiat-crypto" version = "0.2.9" @@ -610,6 +777,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -629,6 +802,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "fs4" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8640e34b88f7652208ce9e88b1a37a2ae95227d84abec377ccd3c5cfeb141ed4" +dependencies = [ + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "futures" version = "0.3.31" @@ -802,6 +985,11 @@ name = "hashbrown" version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "heck" @@ -832,6 +1020,7 @@ dependencies = [ "serde_json", "sha2", "sled", + "tantivy", "thiserror 1.0.69", "tokio", "x25519-dalek", @@ -855,6 +1044,12 @@ dependencies = [ "digest", ] +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + [[package]] name = "http" version = "1.3.1" @@ -962,6 +1157,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "hyperloglogplus" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" +dependencies = [ + "serde", +] + [[package]] name = "i18n-config" version = "0.4.8" @@ -1117,6 +1321,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -1208,6 +1418,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -1236,6 +1455,26 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.3", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852f13bec5eba4ba9afbeb93fd7c13fe56147f055939ae21c43a29a0ecb2702e" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "jsonrpsee" version = "0.26.0" @@ -1397,12 +1636,30 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + [[package]] name = "libc" version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "linux-raw-sys" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" + [[package]] name = "litemap" version = "0.8.0" @@ -1425,12 +1682,45 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + +[[package]] +name = "lz4_flex" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08ab2867e3eeeca90e844d1940eab391c9dc5228783db2ed999acbc0a9ed375a" + +[[package]] +name = "measure_time" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" +dependencies = [ + "log", +] + [[package]] name = "memchr" version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "memmap2" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" +dependencies = [ + "libc", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1457,6 +1747,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + [[package]] name = "nom" version = "7.1.3" @@ -1467,6 +1763,22 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + [[package]] name = "object" version = "0.36.7" @@ -1488,6 +1800,12 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "oneshot" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea" + [[package]] name = "opaque-debug" version = "0.3.1" @@ -1500,6 +1818,15 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "ownedbytes" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "parking_lot" version = "0.11.2" @@ -1606,6 +1933,12 @@ dependencies = [ "spki", ] +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "poly1305" version = "0.8.0" @@ -1626,6 +1959,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -1635,6 +1974,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.106", +] + [[package]] name = "proc-macro-crate" version = "3.3.0" @@ -1751,6 +2100,36 @@ dependencies = [ "getrandom 0.3.3", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "redb" version = "2.6.2" @@ -1799,6 +2178,35 @@ dependencies = [ "bitflags 2.9.3", ] +[[package]] +name = "regex" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" + [[package]] name = "ring" version = "0.17.14" @@ -1853,6 +2261,16 @@ dependencies = [ "walkdir", ] +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + [[package]] name = "rustc-demangle" version = "0.1.26" @@ -1880,6 +2298,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustix" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" +dependencies = [ + "bitflags 2.9.3", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.0", +] + [[package]] name = "rustls" version = "0.23.31" @@ -1954,6 +2385,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "ryu" version = "1.0.20" @@ -2141,6 +2578,15 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "sketches-ddsketch" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.11" @@ -2277,6 +2723,165 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "tantivy" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "502915c7381c5cb2d2781503962610cb880ad8f1a0ca95df1bae645d5ebf2545" +dependencies = [ + "aho-corasick", + "arc-swap", + "base64 0.22.1", + "bitpacking", + "bon", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "hyperloglogplus", + "itertools", + "levenshtein_automata", + "log", + "lru", + "lz4_flex", + "measure_time", + "memmap2", + "once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash 2.1.1", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror 2.0.16", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b04eed5108d8283607da6710fe17a7663523440eaf7ea5a1a440d19a1448b6" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b628488ae936c83e92b5c4056833054ca56f76c0e616aee8339e24ac89119cd" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f880aa7cab0c063a47b62596d10991cdd0b6e0e0575d9c5eeb298b307a25de55" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + "regex-syntax", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "768fccdc84d60d86235d42d7e4c33acf43c418258ff5952abf07bd7837fcd26b" +dependencies = [ + "nom", + "serde", + "serde_json", +] + +[[package]] +name = "tantivy-sstable" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8292095d1a8a2c2b36380ec455f910ab52dde516af36321af332c93f20ab7d5" +dependencies = [ + "futures-util", + "itertools", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23d38a379411169f0b3002c9cba61cdfe315f757e9d4f239c00c282497a0749d" +dependencies = [ + "murmurhash32", + "rand_distr", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23024f6aeb25ceb1a0e27740c84bdb0fae52626737b7e9a9de6ad5aa25c7b038" +dependencies = [ + "serde", +] + +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom 0.3.3", + "once_cell", + "rustix", + "windows-sys 0.61.0", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -2317,6 +2922,37 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "time" +version = "0.3.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" + +[[package]] +name = "time-macros" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "tinystr" version = "0.8.1" @@ -2550,6 +3186,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -2562,6 +3204,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid" +version = "1.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" +dependencies = [ + "getrandom 0.3.3", + "js-sys", + "serde", + "wasm-bindgen", +] + [[package]] name = "version_check" version = "0.9.5" @@ -2611,6 +3265,65 @@ dependencies = [ "wit-bindgen", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab10a69fbd0a177f5f649ad4d8d3305499c42bab9aef2f7ff592d0ec8f833819" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bb702423545a6007bbc368fde243ba47ca275e549c8a28617f56f6ba53b1d1c" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc65f4f411d91494355917b605e1480033152658d71f722a90647f56a70c88a0" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffc003a991398a8ee604a401e194b6b3a39677b3173d6e74495eb51b82e99a32" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "293c37f4efa430ca14db3721dfbe48d8c33308096bd44d80ebaa775ab71ba1cf" +dependencies = [ + "unicode-ident", +] + [[package]] name = "webpki-root-certs" version = "0.26.11" @@ -3053,3 +3766,31 @@ dependencies = [ "quote", "syn 2.0.106", ] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index f07b6fd..b3713e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ ed25519-dalek = "2" x25519-dalek = "2" base64 = "0.22" jsonrpsee = { version = "0.26.0", features = ["http-client", "ws-client", "server", "macros"] } +tantivy = "0.25.0" [dev-dependencies] redis = { version = "0.24", features = ["aio", "tokio-comp"] } diff --git a/docs/tantivy.md b/docs/tantivy.md new file mode 100644 index 0000000..d217395 --- /dev/null +++ b/docs/tantivy.md @@ -0,0 +1,253 @@ +# Tantivy Full‑Text Backend (JSON‑RPC) + +This document explains how to use HeroDB’s Tantivy-backed full‑text search as a dedicated database backend and provides copy‑pasteable JSON‑RPC requests. Tantivy is available only for non‑admin databases (db_id >= 1). Admin DB 0 always uses Redb/Sled and rejects FT operations. + +Important characteristics: +- Tantivy is a third backend alongside Redb and Sled. It provides search indexes only; there is no KV store backing it. +- On Tantivy databases, Redis KV/list/hash commands are rejected; only FT commands and basic control (SELECT, CLIENT, INFO, etc.) are allowed. +- FT JSON‑RPC is namespaced as "herodb" and methods are named with underscore: herodb_ftCreate, herodb_ftAdd, herodb_ftSearch, herodb_ftDel, herodb_ftInfo, herodb_ftDrop. + +Reference to server implementation: +- RPC methods are defined in [rust.trait Rpc()](src/rpc.rs:70): + - [rust.fn ft_create()](src/rpc.rs:121) + - [rust.fn ft_add()](src/rpc.rs:130) + - [rust.fn ft_search()](src/rpc.rs:141) + - [rust.fn ft_del()](src/rpc.rs:154) + - [rust.fn ft_info()](src/rpc.rs:158) + - [rust.fn ft_drop()](src/rpc.rs:162) + +Notes on responses: +- ftCreate/ftAdd/ftDel/ftDrop return a JSON boolean: true on success. +- ftSearch/ftInfo return a JSON object with a single key "resp" containing a RESP‑encoded string (wire format used by Redis). You can display or parse it on the client side as needed. + +RESP usage (redis-cli): +- For RESP clients, you must SELECT the Tantivy database first. SELECT now succeeds for Tantivy DBs without opening KV storage. +- After SELECT, you can run FT.* commands within that DB context. + +Example with redis-cli: +```bash +# Connect to server +redis-cli -p 6379 + +# Select Tantivy DB 1 (public by default) +SELECT 1 +# → OK + +# Create index +FT.CREATE product_catalog SCHEMA title TEXT description TEXT category TAG price NUMERIC rating NUMERIC location GEO +# → OK + +# Add a document +FT.ADD product_catalog product:1 1.0 title "Wireless Bluetooth Headphones" description "Premium noise-canceling headphones with 30-hour battery life" category "electronics,audio" price 299.99 rating 4.5 location "-122.4194,37.7749" +# → OK + +# Search +FT.SEARCH product_catalog wireless LIMIT 0 3 +# → RESP array with hits +``` + +Storage layout (on disk): +- Indices are stored per database under: + - /search_indexes// +- Example: /tmp/test/search_indexes/1/product_catalog + +0) Create a new Tantivy database + +Use herodb_createDatabase with backend "Tantivy". DB 0 cannot be Tantivy. + +```json +{ + "jsonrpc": "2.0", + "id": 1, + "method": "herodb_createDatabase", + "params": [ + "Tantivy", + { "name": "search-db", "storage_path": null, "max_size": null, "redis_version": null }, + null + ] +} +``` + +The response contains the allocated db_id (>= 1). Use that id in the calls below. + +1) FT.CREATE — create an index with schema + +Method: herodb_ftCreate → [rust.fn ft_create()](src/rpc.rs:121) + +Schema format is an array of tuples: [ [field_name, field_type, [options...] ], ... ] +Supported field types: "TEXT", "NUMERIC" (defaults to F64), "TAG", "GEO" +Supported options (subset): "WEIGHT", "SORTABLE", "NOINDEX", "SEPARATOR", "CASESENSITIVE" + +```json +{ + "jsonrpc": "2.0", + "id": 2, + "method": "herodb_ftCreate", + "params": [ + 1, + "product_catalog", + [ + ["title", "TEXT", ["SORTABLE"]], + ["description", "TEXT", []], + ["category", "TAG", ["SEPARATOR", ","]], + ["price", "NUMERIC", ["SORTABLE"]], + ["rating", "NUMERIC", []], + ["location", "GEO", []] + ] + ] +} +``` + +Returns: true on success. + +2) FT.ADD — add or replace a document + +Method: herodb_ftAdd → [rust.fn ft_add()](src/rpc.rs:130) + +Fields is an object (map) of field_name → value (all values are sent as strings). GEO expects "lat,lon". + +```json +{ + "jsonrpc": "2.0", + "id": 3, + "method": "herodb_ftAdd", + "params": [ + 1, + "product_catalog", + "product:1", + 1.0, + { + "title": "Wireless Bluetooth Headphones", + "description": "Premium noise-canceling headphones with 30-hour battery life", + "category": "electronics,audio", + "price": "299.99", + "rating": "4.5", + "location": "-122.4194,37.7749" + } + ] +} +``` + +Returns: true on success. + +3) FT.SEARCH — query an index + +Method: herodb_ftSearch → [rust.fn ft_search()](src/rpc.rs:141) + +Parameters: (db_id, index_name, query, filters?, limit?, offset?, return_fields?) +- filters: array of [field, value] pairs (Equals filter) +- limit/offset: numbers (defaults: limit=10, offset=0) +- return_fields: array of field names to include (optional) + +Simple query: +```json +{ + "jsonrpc": "2.0", + "id": 4, + "method": "herodb_ftSearch", + "params": [1, "product_catalog", "wireless", null, 10, 0, null] +} +``` + +Pagination + filters + selected fields: +```json +{ + "jsonrpc": "2.0", + "id": 5, + "method": "herodb_ftSearch", + "params": [ + 1, + "product_catalog", + "mouse", + [["category", "electronics"]], + 5, + 0, + ["title", "price", "rating"] + ] +} +``` + +Response shape: +```json +{ + "jsonrpc": "2.0", + "id": 5, + "result": { "resp": "*...RESP encoded array..." } +} +``` + +4) FT.INFO — index metadata + +Method: herodb_ftInfo → [rust.fn ft_info()](src/rpc.rs:158) + +```json +{ + "jsonrpc": "2.0", + "id": 6, + "method": "herodb_ftInfo", + "params": [1, "product_catalog"] +} +``` + +Response shape: +```json +{ + "jsonrpc": "2.0", + "id": 6, + "result": { "resp": "*...RESP encoded array with fields and counts..." } +} +``` + +5) FT.DEL — delete by doc id + +Method: herodb_ftDel → [rust.fn ft_del()](src/rpc.rs:154) + +```json +{ + "jsonrpc": "2.0", + "id": 7, + "method": "herodb_ftDel", + "params": [1, "product_catalog", "product:1"] +} +``` + +Returns: true on success. Note: current implementation logs and returns success; physical delete may be a no‑op until delete is finalized in the engine. + +6) FT.DROP — drop an index + +Method: herodb_ftDrop → [rust.fn ft_drop()](src/rpc.rs:162) + +```json +{ + "jsonrpc": "2.0", + "id": 8, + "method": "herodb_ftDrop", + "params": [1, "product_catalog"] +} +``` + +Returns: true on success. + +Field types and options + +- TEXT: stored/indexed/tokenized text. "SORTABLE" marks it fast (stored + fast path in our wrapper). +- NUMERIC: stored/indexed numeric; default precision F64. "SORTABLE" enables fast column. +- TAG: exact matching terms. Options: "SEPARATOR" (default ","), "CASESENSITIVE" (default false). +- GEO: "lat,lon" string; stored as two numeric fields internally. + +Backend and permission gating + +- FT methods are rejected on DB 0. +- FT methods require the database backend to be Tantivy; otherwise RPC returns an error. +- Write‑like FT methods (create/add/del/drop) follow the same permission model as Redis writes on selected databases. + +Troubleshooting + +- "DB backend is not Tantivy": ensure the database was created with backend "Tantivy". +- "FT not allowed on DB 0": use a non‑admin database id (>= 1). +- Empty search results: confirm that the queried fields are tokenized/indexed (TEXT) and that documents were added successfully. + +Related docs + +- Command‑level search overview: [docs/search.md](docs/search.md:1) +- RPC definitions: [src/rpc.rs](src/rpc.rs:1) \ No newline at end of file diff --git a/src/admin_meta.rs b/src/admin_meta.rs index 9039402..5d9c545 100644 --- a/src/admin_meta.rs +++ b/src/admin_meta.rs @@ -48,6 +48,9 @@ fn init_admin_storage( let storage: Arc = match backend { options::BackendType::Redb => Arc::new(Storage::new(&db_file, true, Some(admin_secret))?), options::BackendType::Sled => Arc::new(SledStorage::new(&db_file, true, Some(admin_secret))?), + options::BackendType::Tantivy => { + return Err(DBError("Admin DB 0 cannot use Tantivy backend".to_string())) + } }; Ok(storage) } @@ -199,6 +202,9 @@ pub fn open_data_storage( let storage: Arc = match effective_backend { options::BackendType::Redb => Arc::new(Storage::new(&db_file, should_encrypt, enc.as_deref())?), options::BackendType::Sled => Arc::new(SledStorage::new(&db_file, should_encrypt, enc.as_deref())?), + options::BackendType::Tantivy => { + return Err(DBError("Tantivy backend has no KV storage; use FT.* commands only".to_string())) + } }; // Publish to registry @@ -291,6 +297,7 @@ pub fn set_database_backend( let val = match db_backend { options::BackendType::Redb => "Redb", options::BackendType::Sled => "Sled", + options::BackendType::Tantivy => "Tantivy", }; let _ = admin.hset(&mk, vec![("backend".to_string(), val.to_string())])?; Ok(()) @@ -307,6 +314,7 @@ pub fn get_database_backend( match admin.hget(&mk, "backend")? { Some(s) if s == "Redb" => Ok(Some(options::BackendType::Redb)), Some(s) if s == "Sled" => Ok(Some(options::BackendType::Sled)), + Some(s) if s == "Tantivy" => Ok(Some(options::BackendType::Tantivy)), _ => Ok(None), } } diff --git a/src/cmd.rs b/src/cmd.rs index 3405f02..45f0fea 100644 --- a/src/cmd.rs +++ b/src/cmd.rs @@ -91,6 +91,41 @@ pub enum Cmd { SymKeygen, SymEncrypt(String, String), // key_b64, message SymDecrypt(String, String), // key_b64, ciphertext_b64 + + // Full-text search commands with schema support + FtCreate { + index_name: String, + schema: Vec<(String, String, Vec)>, // (field_name, field_type, options) + }, + FtAdd { + index_name: String, + doc_id: String, + score: f64, + fields: std::collections::HashMap, + }, + FtSearch { + index_name: String, + query: String, + filters: Vec<(String, String)>, // field, value pairs + limit: Option, + offset: Option, + return_fields: Option>, + }, + FtDel(String, String), // index_name, doc_id + FtInfo(String), // index_name + FtDrop(String), // index_name + FtAlter { + index_name: String, + field_name: String, + field_type: String, + options: Vec, + }, + FtAggregate { + index_name: String, + query: String, + group_by: Vec, + reducers: Vec, + } } impl Cmd { @@ -646,6 +681,140 @@ impl Cmd { _ => return Err(DBError(format!("unsupported SYM subcommand {:?}", cmd))), } } + "ft.create" => { + if cmd.len() < 4 || cmd[2].to_uppercase() != "SCHEMA" { + return Err(DBError("ERR FT.CREATE requires: indexname SCHEMA field1 type1 [options] ...".to_string())); + } + let index_name = cmd[1].clone(); + let mut schema = Vec::new(); + let mut i = 3; + while i < cmd.len() { + if i + 1 >= cmd.len() { + return Err(DBError("ERR incomplete field definition".to_string())); + } + let field_name = cmd[i].clone(); + let field_type = cmd[i + 1].to_uppercase(); + let mut options = Vec::new(); + i += 2; + // Parse field options until we hit another field name or end + while i < cmd.len() + && ["WEIGHT","SORTABLE","NOINDEX","SEPARATOR","CASESENSITIVE"] + .contains(&cmd[i].to_uppercase().as_str()) + { + options.push(cmd[i].to_uppercase()); + i += 1; + // If this option takes a value, consume it too + if i > 0 && ["SEPARATOR","WEIGHT"].contains(&cmd[i - 1].to_uppercase().as_str()) && i < cmd.len() { + options.push(cmd[i].clone()); + i += 1; + } + } + schema.push((field_name, field_type, options)); + } + Cmd::FtCreate { index_name, schema } + } + "ft.add" => { + if cmd.len() < 5 { + return Err(DBError("ERR FT.ADD requires: index_name doc_id score field value ...".to_string())); + } + let index_name = cmd[1].clone(); + let doc_id = cmd[2].clone(); + let score = cmd[3].parse::().map_err(|_| DBError("ERR score must be a number".to_string()))?; + let mut fields = std::collections::HashMap::new(); + let mut i = 4; + while i + 1 < cmd.len() { + fields.insert(cmd[i].clone(), cmd[i + 1].clone()); + i += 2; + } + Cmd::FtAdd { index_name, doc_id, score, fields } + } + "ft.search" => { + if cmd.len() < 3 { + return Err(DBError("ERR FT.SEARCH requires: index_name query [options]".to_string())); + } + let index_name = cmd[1].clone(); + let query = cmd[2].clone(); + let mut filters = Vec::new(); + let mut limit = None; + let mut offset = None; + let mut return_fields = None; + let mut i = 3; + while i < cmd.len() { + match cmd[i].to_uppercase().as_str() { + "FILTER" => { + if i + 2 >= cmd.len() { + return Err(DBError("ERR FILTER requires field and value".to_string())); + } + filters.push((cmd[i + 1].clone(), cmd[i + 2].clone())); + i += 3; + } + "LIMIT" => { + if i + 2 >= cmd.len() { + return Err(DBError("ERR LIMIT requires offset and num".to_string())); + } + offset = Some(cmd[i + 1].parse().unwrap_or(0)); + limit = Some(cmd[i + 2].parse().unwrap_or(10)); + i += 3; + } + "RETURN" => { + if i + 1 >= cmd.len() { + return Err(DBError("ERR RETURN requires field count".to_string())); + } + let count: usize = cmd[i + 1].parse().unwrap_or(0); + i += 2; + let mut fields = Vec::new(); + for _ in 0..count { + if i < cmd.len() { + fields.push(cmd[i].clone()); + i += 1; + } + } + return_fields = Some(fields); + } + _ => i += 1, + } + } + Cmd::FtSearch { index_name, query, filters, limit, offset, return_fields } + } + "ft.del" => { + if cmd.len() != 3 { + return Err(DBError("ERR FT.DEL requires: index_name doc_id".to_string())); + } + Cmd::FtDel(cmd[1].clone(), cmd[2].clone()) + } + "ft.info" => { + if cmd.len() != 2 { + return Err(DBError("ERR FT.INFO requires: index_name".to_string())); + } + Cmd::FtInfo(cmd[1].clone()) + } + "ft.drop" => { + if cmd.len() != 2 { + return Err(DBError("ERR FT.DROP requires: index_name".to_string())); + } + Cmd::FtDrop(cmd[1].clone()) + } + "ft.alter" => { + if cmd.len() < 5 { + return Err(DBError("ERR FT.ALTER requires: index_name field_name field_type [options]".to_string())); + } + let index_name = cmd[1].clone(); + let field_name = cmd[2].clone(); + let field_type = cmd[3].clone(); + let options = if cmd.len() > 4 { cmd[4..].to_vec() } else { vec![] }; + Cmd::FtAlter { index_name, field_name, field_type, options } + } + "ft.aggregate" => { + if cmd.len() < 3 { + return Err(DBError("ERR FT.AGGREGATE requires: index_name query [options]".to_string())); + } + let index_name = cmd[1].clone(); + let query = cmd[2].clone(); + // Minimal parse for now + let group_by = Vec::new(); + let reducers = Vec::new(); + Cmd::FtAggregate { index_name, query, group_by, reducers } + } _ => Cmd::Unknow(cmd[0].clone()), }, protocol, @@ -671,6 +840,59 @@ impl Cmd { return Ok(Protocol::SimpleString("QUEUED".to_string())); } + // Backend gating for Tantivy-only DBs: allow only FT.* and basic control/info commands + // Determine per-selected-db backend via admin meta (not process default). + let is_tantivy_backend = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + + if is_tantivy_backend { + match &self { + Cmd::Select(..) + | Cmd::Quit + | Cmd::Client(..) + | Cmd::ClientSetName(..) + | Cmd::ClientGetName + | Cmd::Command(..) + | Cmd::Info(..) + | Cmd::FtCreate { .. } + | Cmd::FtAdd { .. } + | Cmd::FtSearch { .. } + | Cmd::FtDel(..) + | Cmd::FtInfo(..) + | Cmd::FtDrop(..) + | Cmd::FtAlter { .. } + | Cmd::FtAggregate { .. } => {} + _ => { + return Ok(Protocol::err("ERR backend is Tantivy; only FT.* commands are allowed")); + } + } + } + + // If selected DB is not Tantivy, forbid all FT.* commands here. + if !is_tantivy_backend { + match &self { + Cmd::FtCreate { .. } + | Cmd::FtAdd { .. } + | Cmd::FtSearch { .. } + | Cmd::FtDel(..) + | Cmd::FtInfo(..) + | Cmd::FtDrop(..) + | Cmd::FtAlter { .. } + | Cmd::FtAggregate { .. } => { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + _ => {} + } + } + match self { Cmd::Select(db, key) => select_cmd(server, db, key).await, Cmd::Ping => Ok(Protocol::SimpleString("PONG".to_string())), @@ -767,6 +989,32 @@ impl Cmd { Cmd::SymEncrypt(key_b64, message) => Ok(crate::sym::cmd_sym_encrypt(&key_b64, &message).await), Cmd::SymDecrypt(key_b64, ct_b64) => Ok(crate::sym::cmd_sym_decrypt(&key_b64, &ct_b64).await), + // Full-text search commands + Cmd::FtCreate { index_name, schema } => { + crate::search_cmd::ft_create_cmd(server, index_name, schema).await + } + Cmd::FtAdd { index_name, doc_id, score, fields } => { + crate::search_cmd::ft_add_cmd(server, index_name, doc_id, score, fields).await + } + Cmd::FtSearch { index_name, query, filters, limit, offset, return_fields } => { + crate::search_cmd::ft_search_cmd(server, index_name, query, filters, limit, offset, return_fields).await + } + Cmd::FtDel(index_name, doc_id) => { + crate::search_cmd::ft_del_cmd(server, index_name, doc_id).await + } + Cmd::FtInfo(index_name) => { + crate::search_cmd::ft_info_cmd(server, index_name).await + } + Cmd::FtDrop(index_name) => { + crate::search_cmd::ft_drop_cmd(server, index_name).await + } + Cmd::FtAlter { .. } => { + Ok(Protocol::err("FT.ALTER not implemented yet")) + } + Cmd::FtAggregate { .. } => { + Ok(Protocol::err("FT.AGGREGATE not implemented yet")) + } + Cmd::Unknow(s) => Ok(Protocol::err(&format!("ERR unknown command `{}`", s))), } } @@ -852,13 +1100,28 @@ async fn select_cmd(server: &mut Server, db: u64, key: Option) -> Result None => return Ok(Protocol::err("ERR invalid access key")), }; - // Set selected database and permissions, then open storage + // Set selected database and permissions, then open storage (skip for Tantivy backend) server.selected_db = db; server.current_permissions = Some(perms); - match server.current_storage() { - Ok(_) => Ok(Protocol::SimpleString("OK".to_string())), - Err(e) => Ok(Protocol::err(&e.0)), + // Resolve effective backend for this db_id from admin meta + let eff_backend = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + db, + ) + .ok() + .flatten(); + + if matches!(eff_backend, Some(crate::options::BackendType::Tantivy)) { + // Tantivy DBs have no KV storage; allow SELECT to succeed + Ok(Protocol::SimpleString("OK".to_string())) + } else { + match server.current_storage() { + Ok(_) => Ok(Protocol::SimpleString("OK".to_string())), + Err(e) => Ok(Protocol::err(&e.0)), + } } } @@ -1196,7 +1459,27 @@ async fn dbsize_cmd(server: &Server) -> Result { } async fn info_cmd(server: &Server, section: &Option) -> Result { - let storage_info = server.current_storage()?.info()?; + // For Tantivy backend, there is no KV storage; synthesize minimal info. + // Determine effective backend for the currently selected db. + let is_tantivy_db = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + + let storage_info: Vec<(String, String)> = if is_tantivy_db { + vec![ + ("db_size".to_string(), "0".to_string()), + ("is_encrypted".to_string(), "false".to_string()), + ] + } else { + server.current_storage()?.info()? + }; let mut info_map: std::collections::HashMap = storage_info.into_iter().collect(); info_map.insert("redis_version".to_string(), "7.0.0".to_string()); diff --git a/src/lib.rs b/src/lib.rs index 24a3208..fdccb0a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,3 +12,5 @@ pub mod storage; pub mod storage_trait; pub mod storage_sled; pub mod admin_meta; +pub mod tantivy_search; +pub mod search_cmd; diff --git a/src/options.rs b/src/options.rs index c819ca0..a7a0216 100644 --- a/src/options.rs +++ b/src/options.rs @@ -2,6 +2,7 @@ pub enum BackendType { Redb, Sled, + Tantivy, // Full-text search backend (no KV storage) } #[derive(Debug, Clone)] diff --git a/src/rpc.rs b/src/rpc.rs index 207f45f..c43e509 100644 --- a/src/rpc.rs +++ b/src/rpc.rs @@ -14,6 +14,7 @@ use crate::admin_meta; pub enum BackendType { Redb, Sled, + Tantivy, // Full-text search backend (no KV storage) // Future: InMemory, Custom(String) } @@ -112,6 +113,53 @@ pub trait Rpc { /// Set database public/private status #[method(name = "setDatabasePublic")] async fn set_database_public(&self, db_id: u64, public: bool) -> RpcResult; + + // ----- Full-text (Tantivy) minimal RPC endpoints ----- + + /// Create a new FT index in a Tantivy-backed DB + #[method(name = "ftCreate")] + async fn ft_create( + &self, + db_id: u64, + index_name: String, + schema: Vec<(String, String, Vec)>, + ) -> RpcResult; + + /// Add or replace a document in an FT index + #[method(name = "ftAdd")] + async fn ft_add( + &self, + db_id: u64, + index_name: String, + doc_id: String, + score: f64, + fields: HashMap, + ) -> RpcResult; + + /// Search an FT index + #[method(name = "ftSearch")] + async fn ft_search( + &self, + db_id: u64, + index_name: String, + query: String, + filters: Option>, + limit: Option, + offset: Option, + return_fields: Option>, + ) -> RpcResult; + + /// Delete a document by id from an FT index + #[method(name = "ftDel")] + async fn ft_del(&self, db_id: u64, index_name: String, doc_id: String) -> RpcResult; + + /// Get FT index info + #[method(name = "ftInfo")] + async fn ft_info(&self, db_id: u64, index_name: String) -> RpcResult; + + /// Drop an FT index + #[method(name = "ftDrop")] + async fn ft_drop(&self, db_id: u64, index_name: String) -> RpcResult; } /// RPC Server implementation @@ -187,13 +235,14 @@ impl RpcServerImpl { } // Create server instance with resolved backend + let is_tantivy = matches!(effective_backend, crate::options::BackendType::Tantivy); let db_option = DBOption { dir: self.base_dir.clone(), port: 0, // Not used for RPC-managed databases debug: false, encryption_key: None, encrypt: false, - backend: effective_backend, + backend: effective_backend.clone(), admin_secret: self.admin_secret.clone(), }; @@ -203,7 +252,10 @@ impl RpcServerImpl { server.selected_db = db_id; // Lazily open/create physical storage according to admin meta (per-db encryption) - let _ = server.current_storage(); + // Skip for Tantivy backend (no KV storage to open) + if !is_tantivy { + let _ = server.current_storage(); + } // Store the server let mut servers = self.servers.write().await; @@ -290,6 +342,7 @@ impl RpcServerImpl { let backend = match server.option.backend { crate::options::BackendType::Redb => BackendType::Redb, crate::options::BackendType::Sled => BackendType::Sled, + crate::options::BackendType::Tantivy => BackendType::Tantivy, }; DatabaseInfo { @@ -340,18 +393,20 @@ impl RpcServer for RpcServerImpl { let opt_backend = match backend { BackendType::Redb => crate::options::BackendType::Redb, BackendType::Sled => crate::options::BackendType::Sled, + BackendType::Tantivy => crate::options::BackendType::Tantivy, }; admin_meta::set_database_backend(&self.base_dir, self.backend.clone(), &self.admin_secret, db_id, opt_backend.clone()) .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; // Create server instance using base_dir, chosen backend and admin secret + let is_tantivy_new = matches!(opt_backend, crate::options::BackendType::Tantivy); let option = DBOption { dir: self.base_dir.clone(), port: 0, // Not used for RPC-managed databases debug: false, encryption_key: None, // per-db key is stored in admin DB 0 encrypt: false, // encryption decided per-db at open time - backend: opt_backend, + backend: opt_backend.clone(), admin_secret: self.admin_secret.clone(), }; @@ -359,7 +414,10 @@ impl RpcServer for RpcServerImpl { server.selected_db = db_id; // Initialize storage to create physical .db with proper encryption from admin meta - let _ = server.current_storage(); + // Skip for Tantivy backend (no KV storage to initialize) + if !is_tantivy_new { + let _ = server.current_storage(); + } // Store the server in cache let mut servers = self.servers.write().await; @@ -420,6 +478,7 @@ impl RpcServer for RpcServerImpl { let db_ids = self.discover_databases().await; let mut stats = HashMap::new(); + stats.insert("total_databases".to_string(), serde_json::json!(db_ids.len())); stats.insert("uptime".to_string(), serde_json::json!( std::time::SystemTime::now() @@ -431,6 +490,121 @@ impl RpcServer for RpcServerImpl { Ok(stats) } + // ----- Full-text (Tantivy) minimal RPC endpoints ----- + + async fn ft_create( + &self, + db_id: u64, + index_name: String, + schema: Vec<(String, String, Vec)>, + ) -> RpcResult { + let server = self.get_or_create_server(db_id).await?; + if db_id == 0 { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "FT not allowed on DB 0", None::<()>)); + } + if !matches!(server.option.backend, crate::options::BackendType::Tantivy) { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "DB backend is not Tantivy", None::<()>)); + } + crate::search_cmd::ft_create_cmd(&*server, index_name, schema) + .await + .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; + Ok(true) + } + + async fn ft_add( + &self, + db_id: u64, + index_name: String, + doc_id: String, + score: f64, + fields: HashMap, + ) -> RpcResult { + let server = self.get_or_create_server(db_id).await?; + if db_id == 0 { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "FT not allowed on DB 0", None::<()>)); + } + if !matches!(server.option.backend, crate::options::BackendType::Tantivy) { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "DB backend is not Tantivy", None::<()>)); + } + crate::search_cmd::ft_add_cmd(&*server, index_name, doc_id, score, fields) + .await + .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; + Ok(true) + } + + async fn ft_search( + &self, + db_id: u64, + index_name: String, + query: String, + filters: Option>, + limit: Option, + offset: Option, + return_fields: Option>, + ) -> RpcResult { + let server = self.get_or_create_server(db_id).await?; + if db_id == 0 { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "FT not allowed on DB 0", None::<()>)); + } + if !matches!(server.option.backend, crate::options::BackendType::Tantivy) { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "DB backend is not Tantivy", None::<()>)); + } + let proto = crate::search_cmd::ft_search_cmd( + &*server, + index_name, + query, + filters.unwrap_or_default(), + limit, + offset, + return_fields, + ) + .await + .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; + Ok(serde_json::json!({ "resp": proto.encode() })) + } + + async fn ft_del(&self, db_id: u64, index_name: String, doc_id: String) -> RpcResult { + let server = self.get_or_create_server(db_id).await?; + if db_id == 0 { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "FT not allowed on DB 0", None::<()>)); + } + if !matches!(server.option.backend, crate::options::BackendType::Tantivy) { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "DB backend is not Tantivy", None::<()>)); + } + crate::search_cmd::ft_del_cmd(&*server, index_name, doc_id) + .await + .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; + Ok(true) + } + + async fn ft_info(&self, db_id: u64, index_name: String) -> RpcResult { + let server = self.get_or_create_server(db_id).await?; + if db_id == 0 { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "FT not allowed on DB 0", None::<()>)); + } + if !matches!(server.option.backend, crate::options::BackendType::Tantivy) { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "DB backend is not Tantivy", None::<()>)); + } + let proto = crate::search_cmd::ft_info_cmd(&*server, index_name) + .await + .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; + Ok(serde_json::json!({ "resp": proto.encode() })) + } + + async fn ft_drop(&self, db_id: u64, index_name: String) -> RpcResult { + let server = self.get_or_create_server(db_id).await?; + if db_id == 0 { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "FT not allowed on DB 0", None::<()>)); + } + if !matches!(server.option.backend, crate::options::BackendType::Tantivy) { + return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "DB backend is not Tantivy", None::<()>)); + } + crate::search_cmd::ft_drop_cmd(&*server, index_name) + .await + .map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?; + Ok(true) + } + async fn add_access_key(&self, db_id: u64, key: String, permissions: String) -> RpcResult { let perms = match permissions.to_lowercase().as_str() { "read" => Permissions::Read, diff --git a/src/search_cmd.rs b/src/search_cmd.rs new file mode 100644 index 0000000..02e9550 --- /dev/null +++ b/src/search_cmd.rs @@ -0,0 +1,352 @@ +use crate::{ + error::DBError, + protocol::Protocol, + server::Server, + tantivy_search::{ + FieldDef, Filter, FilterType, IndexConfig, NumericType, SearchOptions, TantivySearch, + }, +}; +use std::collections::HashMap; +use std::sync::Arc; + +pub async fn ft_create_cmd( + server: &Server, + index_name: String, + schema: Vec<(String, String, Vec)>, +) -> Result { + if server.selected_db == 0 { + return Ok(Protocol::err("FT commands are not allowed on DB 0")); + } + // Enforce Tantivy backend for selected DB + let is_tantivy = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + if !is_tantivy { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + + // Parse schema into field definitions + let mut field_definitions = Vec::new(); + for (field_name, field_type, options) in schema { + let field_def = match field_type.to_uppercase().as_str() { + "TEXT" => { + let mut sortable = false; + let mut no_index = false; + // Weight is not used in current implementation + let mut _weight = 1.0f32; + let mut i = 0; + while i < options.len() { + match options[i].to_uppercase().as_str() { + "WEIGHT" => { + if i + 1 < options.len() { + _weight = options[i + 1].parse::().unwrap_or(1.0); + i += 2; + continue; + } + } + "SORTABLE" => { + sortable = true; + } + "NOINDEX" => { + no_index = true; + } + _ => {} + } + i += 1; + } + FieldDef::Text { + stored: true, + indexed: !no_index, + tokenized: true, + fast: sortable, + } + } + "NUMERIC" => { + // default to F64 + let mut sortable = false; + for opt in &options { + if opt.to_uppercase() == "SORTABLE" { + sortable = true; + } + } + FieldDef::Numeric { + stored: true, + indexed: true, + fast: sortable, + precision: NumericType::F64, + } + } + "TAG" => { + let mut separator = ",".to_string(); + let mut case_sensitive = false; + let mut i = 0; + while i < options.len() { + match options[i].to_uppercase().as_str() { + "SEPARATOR" => { + if i + 1 < options.len() { + separator = options[i + 1].clone(); + i += 2; + continue; + } + } + "CASESENSITIVE" => { + case_sensitive = true; + } + _ => {} + } + i += 1; + } + FieldDef::Tag { + stored: true, + separator, + case_sensitive, + } + } + "GEO" => FieldDef::Geo { stored: true }, + _ => { + return Err(DBError(format!("Unknown field type: {}", field_type))); + } + }; + field_definitions.push((field_name, field_def)); + } + + // Create the search index + let search_path = server.search_index_path(); + let config = IndexConfig::default(); + let search_index = TantivySearch::new_with_schema( + search_path, + index_name.clone(), + field_definitions, + Some(config), + )?; + + // Store in registry + let mut indexes = server.search_indexes.write().unwrap(); + indexes.insert(index_name, Arc::new(search_index)); + + Ok(Protocol::SimpleString("OK".to_string())) +} + +pub async fn ft_add_cmd( + server: &Server, + index_name: String, + doc_id: String, + _score: f64, + fields: HashMap, +) -> Result { + if server.selected_db == 0 { + return Ok(Protocol::err("FT commands are not allowed on DB 0")); + } + // Enforce Tantivy backend for selected DB + let is_tantivy = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + if !is_tantivy { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + let indexes = server.search_indexes.read().unwrap(); + let search_index = indexes + .get(&index_name) + .ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?; + search_index.add_document_with_fields(&doc_id, fields)?; + Ok(Protocol::SimpleString("OK".to_string())) +} + +pub async fn ft_search_cmd( + server: &Server, + index_name: String, + query: String, + filters: Vec<(String, String)>, + limit: Option, + offset: Option, + return_fields: Option>, +) -> Result { + if server.selected_db == 0 { + return Ok(Protocol::err("FT commands are not allowed on DB 0")); + } + // Enforce Tantivy backend for selected DB + let is_tantivy = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + if !is_tantivy { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + let indexes = server.search_indexes.read().unwrap(); + let search_index = indexes + .get(&index_name) + .ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?; + + let search_filters = filters + .into_iter() + .map(|(field, value)| Filter { + field, + filter_type: FilterType::Equals(value), + }) + .collect(); + + let options = SearchOptions { + limit: limit.unwrap_or(10), + offset: offset.unwrap_or(0), + filters: search_filters, + sort_by: None, + return_fields, + highlight: false, + }; + + let results = search_index.search_with_options(&query, options)?; + + // Format results as Redis protocol + let mut response = Vec::new(); + // First element is the total count + response.push(Protocol::SimpleString(results.total.to_string())); + // Then each document + for doc in results.documents { + let mut doc_array = Vec::new(); + // Add document ID if it exists + if let Some(id) = doc.fields.get("_id") { + doc_array.push(Protocol::BulkString(id.clone())); + } + // Add score + doc_array.push(Protocol::BulkString(doc.score.to_string())); + // Add fields as key-value pairs + for (field_name, field_value) in doc.fields { + if field_name != "_id" { + doc_array.push(Protocol::BulkString(field_name)); + doc_array.push(Protocol::BulkString(field_value)); + } + } + response.push(Protocol::Array(doc_array)); + } + + Ok(Protocol::Array(response)) +} + +pub async fn ft_del_cmd( + server: &Server, + index_name: String, + doc_id: String, +) -> Result { + if server.selected_db == 0 { + return Ok(Protocol::err("FT commands are not allowed on DB 0")); + } + // Enforce Tantivy backend for selected DB + let is_tantivy = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + if !is_tantivy { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + let indexes = server.search_indexes.read().unwrap(); + let _search_index = indexes + .get(&index_name) + .ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?; + // Not fully implemented yet: Tantivy delete by term would require a writer session and commit coordination. + println!("Deleting document '{}' from index '{}'", doc_id, index_name); + Ok(Protocol::SimpleString("1".to_string())) +} + +pub async fn ft_info_cmd(server: &Server, index_name: String) -> Result { + if server.selected_db == 0 { + return Ok(Protocol::err("FT commands are not allowed on DB 0")); + } + // Enforce Tantivy backend for selected DB + let is_tantivy = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + if !is_tantivy { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + let indexes = server.search_indexes.read().unwrap(); + let search_index = indexes + .get(&index_name) + .ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?; + let info = search_index.get_info()?; + + // Format info as Redis protocol + let mut response = Vec::new(); + response.push(Protocol::BulkString("index_name".to_string())); + response.push(Protocol::BulkString(info.name)); + response.push(Protocol::BulkString("num_docs".to_string())); + response.push(Protocol::BulkString(info.num_docs.to_string())); + response.push(Protocol::BulkString("num_fields".to_string())); + response.push(Protocol::BulkString(info.fields.len().to_string())); + response.push(Protocol::BulkString("fields".to_string())); + let fields_str = info + .fields + .iter() + .map(|f| format!("{}:{}", f.name, f.field_type)) + .collect::>() + .join(", "); + response.push(Protocol::BulkString(fields_str)); + Ok(Protocol::Array(response)) +} + +pub async fn ft_drop_cmd(server: &Server, index_name: String) -> Result { + if server.selected_db == 0 { + return Ok(Protocol::err("FT commands are not allowed on DB 0")); + } + // Enforce Tantivy backend for selected DB + let is_tantivy = crate::admin_meta::get_database_backend( + &server.option.dir, + server.option.backend.clone(), + &server.option.admin_secret, + server.selected_db, + ) + .ok() + .flatten() + .map(|b| matches!(b, crate::options::BackendType::Tantivy)) + .unwrap_or(false); + if !is_tantivy { + return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed")); + } + + // Remove from registry + { + let mut indexes = server.search_indexes.write().unwrap(); + indexes.remove(&index_name); + } + + // Remove the index files from disk + let index_path = server.search_index_path().join(&index_name); + if index_path.exists() { + std::fs::remove_dir_all(&index_path) + .map_err(|e| DBError(format!("Failed to remove index files: {}", e)))?; + } + + Ok(Protocol::SimpleString("OK".to_string())) +} \ No newline at end of file diff --git a/src/server.rs b/src/server.rs index f02f065..c71daab 100644 --- a/src/server.rs +++ b/src/server.rs @@ -23,6 +23,9 @@ pub struct Server { pub queued_cmd: Option>, pub current_permissions: Option, + // In-memory registry of Tantivy search indexes for this server + pub search_indexes: Arc>>>, + // BLPOP waiter registry: per (db_index, key) FIFO of waiters pub list_waiters: Arc>>>>, pub waiter_seq: Arc, @@ -49,12 +52,25 @@ impl Server { selected_db: 0, queued_cmd: None, current_permissions: None, - + + search_indexes: Arc::new(std::sync::RwLock::new(HashMap::new())), list_waiters: Arc::new(Mutex::new(HashMap::new())), waiter_seq: Arc::new(AtomicU64::new(1)), } } + // Path where search indexes are stored, namespaced per selected DB: + // /search_indexes/ + pub fn search_index_path(&self) -> std::path::PathBuf { + let base = std::path::PathBuf::from(&self.option.dir) + .join("search_indexes") + .join(self.selected_db.to_string()); + if !base.exists() { + let _ = std::fs::create_dir_all(&base); + } + base + } + pub fn current_storage(&self) -> Result, DBError> { let mut cache = self.db_cache.write().unwrap(); diff --git a/src/tantivy_search.rs b/src/tantivy_search.rs new file mode 100644 index 0000000..2c0a8ae --- /dev/null +++ b/src/tantivy_search.rs @@ -0,0 +1,667 @@ +use crate::error::DBError; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::{Arc, RwLock}; +use tantivy::{ + collector::TopDocs, + directory::MmapDirectory, + query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}, + schema::{ + DateOptions, Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions, STORED, STRING, + }, + tokenizer::TokenizerManager, + DateTime, Index, IndexReader, IndexWriter, TantivyDocument, Term, +}; +use tantivy::schema::Value; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum FieldDef { + Text { + stored: bool, + indexed: bool, + tokenized: bool, + fast: bool, + }, + Numeric { + stored: bool, + indexed: bool, + fast: bool, + precision: NumericType, + }, + Tag { + stored: bool, + separator: String, + case_sensitive: bool, + }, + Geo { + stored: bool, + }, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum NumericType { + I64, + U64, + F64, + Date, +} + +pub struct IndexSchema { + schema: Schema, + fields: HashMap, + default_search_fields: Vec, +} + +pub struct TantivySearch { + index: Index, + writer: Arc>, + reader: IndexReader, + index_schema: IndexSchema, + name: String, + config: IndexConfig, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexConfig { + pub language: String, + pub stopwords: Vec, + pub stemming: bool, + pub max_doc_count: Option, + pub default_score: f64, +} + +impl Default for IndexConfig { + fn default() -> Self { + IndexConfig { + language: "english".to_string(), + stopwords: vec![], + stemming: true, + max_doc_count: None, + default_score: 1.0, + } + } +} + +impl TantivySearch { + pub fn new_with_schema( + base_path: PathBuf, + name: String, + field_definitions: Vec<(String, FieldDef)>, + config: Option, + ) -> Result { + let index_path = base_path.join(&name); + std::fs::create_dir_all(&index_path) + .map_err(|e| DBError(format!("Failed to create index dir: {}", e)))?; + + // Build schema from field definitions + let mut schema_builder = Schema::builder(); + let mut fields = HashMap::new(); + let mut default_search_fields = Vec::new(); + + // Always add a document ID field + let id_field = schema_builder.add_text_field("_id", STRING | STORED); + fields.insert( + "_id".to_string(), + ( + id_field, + FieldDef::Text { + stored: true, + indexed: true, + tokenized: false, + fast: false, + }, + ), + ); + + // Add user-defined fields + for (field_name, field_def) in field_definitions { + let field = match &field_def { + FieldDef::Text { + stored, + indexed, + tokenized, + fast: _fast, + } => { + let mut text_options = TextOptions::default(); + if *stored { + text_options = text_options.set_stored(); + } + if *indexed { + let indexing_options = if *tokenized { + TextFieldIndexing::default() + .set_tokenizer("default") + .set_index_option(IndexRecordOption::WithFreqsAndPositions) + } else { + TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic) + }; + text_options = text_options.set_indexing_options(indexing_options); + let f = schema_builder.add_text_field(&field_name, text_options); + if *tokenized { + default_search_fields.push(f); + } + f + } else { + schema_builder.add_text_field(&field_name, text_options) + } + } + FieldDef::Numeric { + stored, + indexed, + fast, + precision, + } => match precision { + NumericType::I64 => { + let mut opts = NumericOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_i64_field(&field_name, opts) + } + NumericType::U64 => { + let mut opts = NumericOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_u64_field(&field_name, opts) + } + NumericType::F64 => { + let mut opts = NumericOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_f64_field(&field_name, opts) + } + NumericType::Date => { + let mut opts = DateOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_date_field(&field_name, opts) + } + }, + FieldDef::Tag { + stored, + separator: _, + case_sensitive: _, + } => { + let mut text_options = TextOptions::default(); + if *stored { + text_options = text_options.set_stored(); + } + text_options = text_options.set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic), + ); + schema_builder.add_text_field(&field_name, text_options) + } + FieldDef::Geo { stored } => { + // For now, store as two f64 fields for lat/lon + let mut opts = NumericOptions::default(); + if *stored { + opts = opts.set_stored(); + } + opts = opts.set_indexed().set_fast(); + let lat_field = + schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone()); + let lon_field = + schema_builder.add_f64_field(&format!("{}_lon", field_name), opts); + fields.insert( + format!("{}_lat", field_name), + ( + lat_field, + FieldDef::Numeric { + stored: *stored, + indexed: true, + fast: true, + precision: NumericType::F64, + }, + ), + ); + fields.insert( + format!("{}_lon", field_name), + ( + lon_field, + FieldDef::Numeric { + stored: *stored, + indexed: true, + fast: true, + precision: NumericType::F64, + }, + ), + ); + continue; // Skip adding the geo field itself + } + }; + fields.insert(field_name.clone(), (field, field_def)); + } + + let schema = schema_builder.build(); + let index_schema = IndexSchema { + schema: schema.clone(), + fields, + default_search_fields, + }; + + // Create or open index + let dir = MmapDirectory::open(&index_path) + .map_err(|e| DBError(format!("Failed to open index directory: {}", e)))?; + let mut index = + Index::open_or_create(dir, schema).map_err(|e| DBError(format!("Failed to create index: {}", e)))?; + + // Configure tokenizers + let tokenizer_manager = TokenizerManager::default(); + index.set_tokenizers(tokenizer_manager); + + let writer = index + .writer(15_000_000) + .map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?; + let reader = index + .reader() + .map_err(|e| DBError(format!("Failed to create reader: {}", e)))?; + + let config = config.unwrap_or_default(); + + Ok(TantivySearch { + index, + writer: Arc::new(RwLock::new(writer)), + reader, + index_schema, + name, + config, + }) + } + + pub fn add_document_with_fields( + &self, + doc_id: &str, + fields: HashMap, + ) -> Result<(), DBError> { + let mut writer = self + .writer + .write() + .map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?; + + // Delete existing document with same ID + if let Some((id_field, _)) = self.index_schema.fields.get("_id") { + writer.delete_term(Term::from_field_text(*id_field, doc_id)); + } + + // Create new document + let mut doc = tantivy::doc!(); + + // Add document ID + if let Some((id_field, _)) = self.index_schema.fields.get("_id") { + doc.add_text(*id_field, doc_id); + } + + // Add other fields based on schema + for (field_name, field_value) in fields { + if let Some((field, field_def)) = self.index_schema.fields.get(&field_name) { + match field_def { + FieldDef::Text { .. } => { + doc.add_text(*field, &field_value); + } + FieldDef::Numeric { precision, .. } => match precision { + NumericType::I64 => { + if let Ok(v) = field_value.parse::() { + doc.add_i64(*field, v); + } + } + NumericType::U64 => { + if let Ok(v) = field_value.parse::() { + doc.add_u64(*field, v); + } + } + NumericType::F64 => { + if let Ok(v) = field_value.parse::() { + doc.add_f64(*field, v); + } + } + NumericType::Date => { + if let Ok(v) = field_value.parse::() { + doc.add_date(*field, DateTime::from_timestamp_millis(v)); + } + } + }, + FieldDef::Tag { + separator, + case_sensitive, + .. + } => { + let tags = if !case_sensitive { + field_value.to_lowercase() + } else { + field_value.clone() + }; + for tag in tags.split(separator.as_str()) { + doc.add_text(*field, tag.trim()); + } + } + FieldDef::Geo { .. } => { + let parts: Vec<&str> = field_value.split(',').collect(); + if parts.len() == 2 { + if let (Ok(lat), Ok(lon)) = + (parts[0].parse::(), parts[1].parse::()) + { + if let Some((lat_field, _)) = + self.index_schema.fields.get(&format!("{}_lat", field_name)) + { + doc.add_f64(*lat_field, lat); + } + if let Some((lon_field, _)) = + self.index_schema.fields.get(&format!("{}_lon", field_name)) + { + doc.add_f64(*lon_field, lon); + } + } + } + } + } + } + } + + writer + .add_document(doc) + .map_err(|e| DBError(format!("Failed to add document: {}", e)))?; + writer + .commit() + .map_err(|e| DBError(format!("Failed to commit: {}", e)))?; + Ok(()) + } + + pub fn search_with_options( + &self, + query_str: &str, + options: SearchOptions, + ) -> Result { + let searcher = self.reader.searcher(); + + // Ensure we have searchable fields + if self.index_schema.default_search_fields.is_empty() { + return Err(DBError("No searchable fields defined in schema".to_string())); + } + + // Parse query based on search fields + let query_parser = QueryParser::for_index( + &self.index, + self.index_schema.default_search_fields.clone(), + ); + let parsed_query = query_parser + .parse_query(query_str) + .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?; + let mut clauses: Vec<(Occur, Box)> = vec![(Occur::Must, parsed_query)]; + + // Apply filters if any + for filter in options.filters { + if let Some((field, field_def)) = self.index_schema.fields.get(&filter.field) { + match filter.filter_type { + FilterType::Equals(value) => { + match field_def { + FieldDef::Text { .. } | FieldDef::Tag { .. } => { + let term_query = + TermQuery::new(Term::from_field_text(*field, &value), IndexRecordOption::Basic); + clauses.push((Occur::Must, Box::new(term_query))); + } + FieldDef::Numeric { precision, .. } => { + // Equals on numeric fields: parse to the right numeric type and use term query + match precision { + NumericType::I64 => { + if let Ok(v) = value.parse::() { + let term = Term::from_field_i64(*field, v); + let tq = TermQuery::new(term, IndexRecordOption::Basic); + clauses.push((Occur::Must, Box::new(tq))); + } + } + NumericType::U64 => { + if let Ok(v) = value.parse::() { + let term = Term::from_field_u64(*field, v); + let tq = TermQuery::new(term, IndexRecordOption::Basic); + clauses.push((Occur::Must, Box::new(tq))); + } + } + NumericType::F64 => { + if let Ok(v) = value.parse::() { + let term = Term::from_field_f64(*field, v); + let tq = TermQuery::new(term, IndexRecordOption::Basic); + clauses.push((Occur::Must, Box::new(tq))); + } + } + NumericType::Date => { + if let Ok(v) = value.parse::() { + let dt = DateTime::from_timestamp_millis(v); + let term = Term::from_field_date(*field, dt); + let tq = TermQuery::new(term, IndexRecordOption::Basic); + clauses.push((Occur::Must, Box::new(tq))); + } + } + } + } + FieldDef::Geo { .. } => { + // Geo equals isn't supported in this simplified version + } + } + } + FilterType::Range { .. } => { + // TODO: Implement numeric range queries by building a RangeQuery per type + } + FilterType::InSet(values) => { + // OR across values + let mut sub_clauses: Vec<(Occur, Box)> = vec![]; + for value in values { + let term_query = TermQuery::new( + Term::from_field_text(*field, &value), + IndexRecordOption::Basic, + ); + sub_clauses.push((Occur::Should, Box::new(term_query))); + } + clauses.push((Occur::Must, Box::new(BooleanQuery::new(sub_clauses)))); + } + } + } + } + + let final_query: Box = if clauses.len() == 1 { + clauses.pop().unwrap().1 + } else { + Box::new(BooleanQuery::new(clauses)) + }; + + // Execute search + let top_docs = searcher + .search(&*final_query, &TopDocs::with_limit(options.limit + options.offset)) + .map_err(|e| DBError(format!("Search failed: {}", e)))?; + let total_hits = top_docs.len(); + let mut documents = Vec::new(); + + for (score, doc_address) in top_docs.into_iter().skip(options.offset).take(options.limit) { + let retrieved_doc: TantivyDocument = searcher + .doc(doc_address) + .map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?; + + let mut doc_fields = HashMap::new(); + + // Extract stored fields (or synthesize) + for (field_name, (field, field_def)) in &self.index_schema.fields { + match field_def { + FieldDef::Text { stored, .. } | FieldDef::Tag { stored, .. } => { + if *stored { + if let Some(value) = retrieved_doc.get_first(*field) { + if let Some(text) = value.as_str() { + doc_fields.insert(field_name.clone(), text.to_string()); + } + } + } + } + FieldDef::Numeric { + stored, precision, .. + } => { + if *stored { + let value_str = match precision { + NumericType::I64 => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_i64()) + .map(|v| v.to_string()), + NumericType::U64 => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_u64()) + .map(|v| v.to_string()), + NumericType::F64 => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_f64()) + .map(|v| v.to_string()), + NumericType::Date => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_datetime()) + .map(|v| v.into_timestamp_millis().to_string()), + }; + if let Some(v) = value_str { + doc_fields.insert(field_name.clone(), v); + } + } + } + FieldDef::Geo { stored } => { + if *stored { + let lat_field = self + .index_schema + .fields + .get(&format!("{}_lat", field_name)) + .unwrap() + .0; + let lon_field = self + .index_schema + .fields + .get(&format!("{}_lon", field_name)) + .unwrap() + .0; + let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64()); + let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64()); + if let (Some(lat), Some(lon)) = (lat, lon) { + doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon)); + } + } + } + } + } + + documents.push(SearchDocument { + fields: doc_fields, + score, + }); + } + + Ok(SearchResults { + total: total_hits, + documents, + }) + } + + pub fn get_info(&self) -> Result { + let searcher = self.reader.searcher(); + let num_docs = searcher.num_docs(); + let fields_info: Vec = self + .index_schema + .fields + .iter() + .map(|(name, (_, def))| FieldInfo { + name: name.clone(), + field_type: format!("{:?}", def), + }) + .collect(); + Ok(IndexInfo { + name: self.name.clone(), + num_docs, + fields: fields_info, + config: self.config.clone(), + }) + } +} + +#[derive(Debug, Clone)] +pub struct SearchOptions { + pub limit: usize, + pub offset: usize, + pub filters: Vec, + pub sort_by: Option, + pub return_fields: Option>, + pub highlight: bool, +} + +impl Default for SearchOptions { + fn default() -> Self { + SearchOptions { + limit: 10, + offset: 0, + filters: vec![], + sort_by: None, + return_fields: None, + highlight: false, + } + } +} + +#[derive(Debug, Clone)] +pub struct Filter { + pub field: String, + pub filter_type: FilterType, +} + +#[derive(Debug, Clone)] +pub enum FilterType { + Equals(String), + Range { min: String, max: String }, + InSet(Vec), +} + +#[derive(Debug)] +pub struct SearchResults { + pub total: usize, + pub documents: Vec, +} + +#[derive(Debug)] +pub struct SearchDocument { + pub fields: HashMap, + pub score: f32, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct IndexInfo { + pub name: String, + pub num_docs: u64, + pub fields: Vec, + pub config: IndexConfig, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct FieldInfo { + pub name: String, + pub field_type: String, +} \ No newline at end of file