diff --git a/Cargo.lock b/Cargo.lock index 6340222..7057448 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -235,40 +235,19 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" dependencies = [ - "arrow-arith 55.2.0", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-cast 55.2.0", - "arrow-csv 55.2.0", - "arrow-data 55.2.0", - "arrow-ipc 55.2.0", - "arrow-json 55.2.0", - "arrow-ord 55.2.0", - "arrow-row 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", - "arrow-string 55.2.0", -] - -[[package]] -name = "arrow" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5" -dependencies = [ - "arrow-arith 56.0.0", - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-cast 56.0.0", - "arrow-csv 56.0.0", - "arrow-data 56.0.0", - "arrow-ipc 56.0.0", - "arrow-json 56.0.0", - "arrow-ord 56.0.0", - "arrow-row 56.0.0", - "arrow-schema 56.0.0", - "arrow-select 56.0.0", - "arrow-string 56.0.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", ] [[package]] @@ -277,24 +256,10 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "chrono", - "num", -] - -[[package]] -name = "arrow-arith" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", + 
"arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "num", ] @@ -306,9 +271,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" dependencies = [ "ahash", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "chrono-tz", "half", @@ -316,22 +281,6 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-array" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461" -dependencies = [ - "ahash", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", - "chrono", - "half", - "hashbrown 0.15.5", - "num", -] - [[package]] name = "arrow-buffer" version = "55.2.0" @@ -343,28 +292,17 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-buffer" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d" -dependencies = [ - "bytes", - "half", - "num", -] - [[package]] name = "arrow-cast" version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "atoi", "base64 0.22.1", "chrono", @@ -375,50 +313,15 @@ dependencies = [ "ryu", ] -[[package]] -name = "arrow-cast" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - 
"arrow-schema 56.0.0", - "arrow-select 56.0.0", - "atoi", - "base64 0.22.1", - "chrono", - "half", - "lexical-core", - "num", - "ryu", -] - [[package]] name = "arrow-csv" version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" dependencies = [ - "arrow-array 55.2.0", - "arrow-cast 55.2.0", - "arrow-schema 55.2.0", - "chrono", - "csv", - "csv-core", - "regex", -] - -[[package]] -name = "arrow-csv" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" -dependencies = [ - "arrow-array 56.0.0", - "arrow-cast 56.0.0", - "arrow-schema 56.0.0", + "arrow-array", + "arrow-cast", + "arrow-schema", "chrono", "csv", "csv-core", @@ -431,20 +334,8 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" dependencies = [ - "arrow-buffer 55.2.0", - "arrow-schema 55.2.0", - "half", - "num", -] - -[[package]] -name = "arrow-data" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" -dependencies = [ - "arrow-buffer 56.0.0", - "arrow-schema 56.0.0", + "arrow-buffer", + "arrow-schema", "half", "num", ] @@ -455,61 +346,26 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "flatbuffers", "lz4_flex", "zstd", ] -[[package]] -name = "arrow-ipc" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", - "flatbuffers", -] - [[package]] name = "arrow-json" version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-cast 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "chrono", - "half", - "indexmap", - "lexical-core", - "memchr", - "num", - "serde", - "serde_json", - "simdutf8", -] - -[[package]] -name = "arrow-json" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-cast 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", "chrono", "half", "indexmap", @@ -527,24 +383,11 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", -] - -[[package]] -name = "arrow-ord" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", - "arrow-select 56.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] [[package]] @@ -553,23 +396,10 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "half", -] - -[[package]] -name = "arrow-row" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", ] @@ -584,12 +414,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "arrow-schema" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" - [[package]] name = "arrow-select" version = "55.2.0" @@ -597,24 +421,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" dependencies = [ "ahash", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "num", -] - -[[package]] -name = "arrow-select" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" -dependencies = [ - "ahash", - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num", ] @@ -624,28 +434,11 @@ version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", - "memchr", - "num", - "regex", - "regex-syntax 0.8.5", -] - 
-[[package]] -name = "arrow-string" -version = "56.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" -dependencies = [ - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-data 56.0.0", - "arrow-schema 56.0.0", - "arrow-select 56.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "memchr", "num", "regex", @@ -1863,9 +1656,9 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a11e19a7ccc5bb979c95c1dceef663eab39c9061b3bbf8d1937faf0f03bf41f" dependencies = [ - "arrow 55.2.0", - "arrow-ipc 55.2.0", - "arrow-schema 55.2.0", + "arrow", + "arrow-ipc", + "arrow-schema", "async-trait", "bytes", "chrono", @@ -1911,7 +1704,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94985e67cab97b1099db2a7af11f31a45008b282aba921c1e1d35327c212ec18" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "dashmap 6.1.0", "datafusion-common", @@ -1937,7 +1730,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e002df133bdb7b0b9b429d89a69aa77b35caeadee4498b2ce1c7c23a99516988" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "datafusion-catalog", "datafusion-common", @@ -1961,8 +1754,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e13242fc58fd753787b0a538e5ae77d356cb9d0656fa85a591a33c5f106267f6" dependencies = [ "ahash", - "arrow 55.2.0", - "arrow-ipc 55.2.0", + "arrow", + "arrow-ipc", "base64 0.22.1", "half", "hashbrown 0.14.5", @@ -1993,7 +1786,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2cf792579bc8bf07d1b2f68c2d5382f8a63679cce8fbebfd4ba95742b6e08864" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "bytes", "chrono", @@ -2021,7 +1814,7 @@ version = "48.0.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "cfc114f9a1415174f3e8d2719c371fc72092ef2195a7955404cfe6b2ba29a706" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "bytes", "datafusion-catalog", @@ -2046,7 +1839,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d88dd5e215c420a52362b9988ecd4cefd71081b730663d4f7d886f706111fc75" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "bytes", "datafusion-catalog", @@ -2077,7 +1870,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9609d83d52ff8315283c6dad3b97566e877d8f366fab4c3297742f33dcd636c7" dependencies = [ - "arrow 55.2.0", + "arrow", "dashmap 6.1.0", "datafusion-common", "datafusion-expr", @@ -2096,7 +1889,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e75230cd67f650ef0399eb00f54d4a073698f2c0262948298e5299fc7324da63" dependencies = [ - "arrow 55.2.0", + "arrow", "chrono", "datafusion-common", "datafusion-doc", @@ -2116,7 +1909,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70fafb3a045ed6c49cfca0cd090f62cf871ca6326cc3355cb0aaf1260fa760b6" dependencies = [ - "arrow 55.2.0", + "arrow", "datafusion-common", "indexmap", "itertools 0.14.0", @@ -2129,8 +1922,8 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdf9a9cf655265861a20453b1e58357147eab59bdc90ce7f2f68f1f35104d3bb" dependencies = [ - "arrow 55.2.0", - "arrow-buffer 55.2.0", + "arrow", + "arrow-buffer", "base64 0.22.1", "blake2", "blake3", @@ -2159,7 +1952,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f07e49733d847be0a05235e17b884d326a2fd402c97a89fe8bcf0bfba310005" dependencies = [ "ahash", - "arrow 55.2.0", + "arrow", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2180,7 +1973,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4512607e10d72b0b0a1dc08f42cb5bd5284cb8348b7fea49dc83409493e32b1b" dependencies = [ "ahash", - "arrow 55.2.0", + "arrow", "datafusion-common", "datafusion-expr-common", "datafusion-physical-expr-common", @@ -2192,8 +1985,8 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ab331806e34f5545e5f03396e4d5068077395b1665795d8f88c14ec4f1e0b7a" dependencies = [ - "arrow 55.2.0", - "arrow-ord 55.2.0", + "arrow", + "arrow-ord", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -2213,7 +2006,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4ac2c0be983a06950ef077e34e0174aa0cb9e346f3aeae459823158037ade37" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "datafusion-catalog", "datafusion-common", @@ -2229,7 +2022,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36f3d92731de384c90906941d36dcadf6a86d4128409a9c5cd916662baed5f53" dependencies = [ - "arrow 55.2.0", + "arrow", "datafusion-common", "datafusion-doc", "datafusion-expr", @@ -2268,7 +2061,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1594c7a97219ede334f25347ad8d57056621e7f4f35a0693c8da876e10dd6a53" dependencies = [ - "arrow 55.2.0", + "arrow", "chrono", "datafusion-common", "datafusion-expr", @@ -2287,7 +2080,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc6da0f2412088d23f6b01929dedd687b5aee63b19b674eb73d00c3eb3c883b7" dependencies = [ "ahash", - "arrow 55.2.0", + "arrow", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -2309,7 +2102,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dcb0dbd9213078a593c3fe28783beaa625a4e6c6a6c797856ee2ba234311fb96" dependencies = [ "ahash", - "arrow 55.2.0", + "arrow", "datafusion-common", 
"datafusion-expr-common", "hashbrown 0.14.5", @@ -2322,7 +2115,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d140854b2db3ef8ac611caad12bfb2e1e1de827077429322a6188f18fc0026a" dependencies = [ - "arrow 55.2.0", + "arrow", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -2341,9 +2134,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b46cbdf21a01206be76d467f325273b22c559c744a012ead5018dfe79597de08" dependencies = [ "ahash", - "arrow 55.2.0", - "arrow-ord 55.2.0", - "arrow-schema 55.2.0", + "arrow", + "arrow-ord", + "arrow-schema", "async-trait", "chrono", "datafusion-common", @@ -2370,7 +2163,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a72733766ddb5b41534910926e8da5836622316f6283307fd9fb7e19811a59c" dependencies = [ - "arrow 55.2.0", + "arrow", "async-trait", "dashmap 6.1.0", "datafusion-common", @@ -2394,7 +2187,7 @@ version = "48.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5162338cdec9cc7ea13a0e6015c361acad5ec1d88d83f7c86301f789473971f" dependencies = [ - "arrow 55.2.0", + "arrow", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -2803,7 +2596,7 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "548190a42654ce848835b410ae33f43b4d55cb24548fd0a885a289a1d5a95019" dependencies = [ - "arrow-array 55.2.0", + "arrow-array", "rand 0.9.2", ] @@ -3093,9 +2886,9 @@ version = "0.0.1" dependencies = [ "age", "anyhow", - "arrow 56.0.0", - "arrow-array 56.0.0", - "arrow-schema 56.0.0", + "arrow", + "arrow-array", + "arrow-schema", "base64 0.22.1", "bincode", "byteorder", @@ -3759,15 +3552,15 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94bafd9d9a9301c1eac48892ec8016d4d28204d4fc55f2ebebee9a7af465e152" dependencies = [ - "arrow 55.2.0", - "arrow-arith 55.2.0", - "arrow-array 
55.2.0", - "arrow-buffer 55.2.0", - "arrow-ipc 55.2.0", - "arrow-ord 55.2.0", - "arrow-row 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", "async-recursion", "async-trait", "async_cell", @@ -3822,12 +3615,12 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97ebcd8edc2b534e8ded20c97c8928e275160794af91ed803a3d48d8d2a88d8" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-cast 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", "bytes", "getrandom 0.2.16", "half", @@ -3841,9 +3634,9 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce5c1849d07985d6a5011aca9de43c7a42ec4c996d66ef3f2d9896c227cc934c" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-schema 55.2.0", + "arrow-array", + "arrow-buffer", + "arrow-schema", "async-trait", "byteorder", "bytes", @@ -3878,12 +3671,12 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d355c087bc66d85e36cfb428465f585b13971e1e13585dd2b6886a54d8a7d9a4" dependencies = [ - "arrow 55.2.0", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-ord 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "arrow-select", "async-trait", "datafusion", "datafusion-common", @@ -3908,10 +3701,10 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "110d4dedfe02e9cff8f11cfb64a261755da7ee9131845197efeec8b659cc5513" dependencies = [ - "arrow 55.2.0", - "arrow-array 55.2.0", - "arrow-cast 55.2.0", - "arrow-schema 55.2.0", + "arrow", + 
"arrow-array", + "arrow-cast", + "arrow-schema", "chrono", "futures", "hex", @@ -3927,14 +3720,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66750006299a2fb003091bc290eb1fe2a5933e35236d921934131f3e4629cd33" dependencies = [ "arrayref", - "arrow 55.2.0", - "arrow-arith 55.2.0", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-cast 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", "bytemuck", "byteorder", "bytes", @@ -3967,12 +3760,12 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c639062100610a075e01fd455173348b2fccea10cb0e89f70e38a3183c56022" dependencies = [ - "arrow-arith 55.2.0", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "async-recursion", "async-trait", "byteorder", @@ -4003,11 +3796,11 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ae67a048a51fb525d1bfde86d1b39118462277e7e7a7cd0e7ba866312873532" dependencies = [ - "arrow 55.2.0", - "arrow-array 55.2.0", - "arrow-ord 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow", + "arrow-array", + "arrow-ord", + "arrow-schema", + "arrow-select", "async-channel", "async-recursion", "async-trait", @@ -4058,14 +3851,14 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cc86c7307e2d3d895cfefa503f986edcbdd208eb0aa89ba2c75724ba04bce843" dependencies = [ - "arrow 55.2.0", - "arrow-arith 55.2.0", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-cast 55.2.0", - "arrow-data 55.2.0", - "arrow-schema 55.2.0", - "arrow-select 55.2.0", + "arrow", + "arrow-arith", + 
"arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", "async-priority-channel", "async-recursion", "async-trait", @@ -4100,10 +3893,10 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "769f910b6f2ad5eb4d1b3071c533b619351e61e0dfca74f13c98680a8e6476e9" dependencies = [ - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-ord 55.2.0", - "arrow-schema 55.2.0", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", "bitvec", "cc", "deepsize", @@ -4125,11 +3918,11 @@ version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffbeafa8a3e97b5b3a06f06d69b0cefe56e65c64a33f674c40c113b797328bd2" dependencies = [ - "arrow 55.2.0", - "arrow-array 55.2.0", - "arrow-buffer 55.2.0", - "arrow-ipc 55.2.0", - "arrow-schema 55.2.0", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", "async-trait", "byteorder", "bytes", @@ -4923,18 +4716,18 @@ dependencies = [ [[package]] name = "parquet" -version = "56.0.0" +version = "55.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" +checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" dependencies = [ "ahash", - "arrow-array 56.0.0", - "arrow-buffer 56.0.0", - "arrow-cast 56.0.0", - "arrow-data 56.0.0", - "arrow-ipc 56.0.0", - "arrow-schema 56.0.0", - "arrow-select 56.0.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", "base64 0.22.1", "brotli", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 9d57a84..df658a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,10 +28,11 @@ base64 = "0.22" lance = "0.33" lance-index = "0.33" lance-linalg = "0.33" -arrow = "56" -arrow-array = "56" -arrow-schema = "56" -parquet = "56" +# Use Arrow version compatible with Lance 0.33 +arrow = "55.2" 
+arrow-array = "55.2" +arrow-schema = "55.2" +parquet = "55.2" uuid = { version = "1.10", features = ["v4"] } reqwest = { version = "0.11", features = ["json"] } image = "0.25" diff --git a/docs/lance_vector_db.md b/docs/lance_vector_db.md new file mode 100644 index 0000000..fe44ae7 --- /dev/null +++ b/docs/lance_vector_db.md @@ -0,0 +1,454 @@ +# Lance Vector Database Operations + +HeroDB includes a powerful vector database integration using Lance, enabling high-performance vector storage, search, and multimodal data management. By default, it uses Ollama for local text embeddings, with support for custom external embedding services. + +## Overview + +The Lance vector database integration provides: + +- **High-performance vector storage** using Lance's columnar format +- **Local Ollama integration** for text embeddings (default, no external dependencies) +- **Custom embedding service support** for advanced use cases +- **Text embedding support** (images via custom services) +- **Vector similarity search** with configurable parameters +- **Scalable indexing** with IVF_PQ (Inverted File with Product Quantization) +- **Redis-compatible command interface** + +## Architecture + +``` +┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ +│ HeroDB │ │ External │ │ Lance │ +│ Redis Server │◄──►│ Embedding │ │ Vector Store │ +│ │ │ Service │ │ │ +└─────────────────┘ └──────────────────┘ └─────────────────┘ + │ │ │ + │ │ │ + Redis Protocol HTTP API Arrow/Parquet + Commands JSON Requests Columnar Storage +``` + +### Key Components + +1. **Lance Store**: High-performance columnar vector storage +2. **Ollama Integration**: Local embedding service (default) +3. **Custom Embedding Service**: Optional HTTP API for advanced use cases +4. **Redis Command Interface**: Familiar Redis-style commands +5. **Arrow Schema**: Flexible schema definition for metadata + +## Configuration + +### Default Setup (Ollama) + +HeroDB uses Ollama by default for text embeddings. 
No configuration is required if Ollama is running locally: + +```bash +# Install Ollama (if not already installed) +# Visit: https://ollama.ai + +# Pull the embedding model +ollama pull nomic-embed-text + +# Ollama automatically runs on localhost:11434 +# HeroDB will use this by default +``` + +**Default Configuration:** +- **URL**: `http://localhost:11434` +- **Model**: `nomic-embed-text` +- **Dimensions**: 768 (for nomic-embed-text) + +### Custom Embedding Service (Optional) + +To use a custom embedding service instead of Ollama: + +```bash +# Set custom embedding service URL +redis-cli HSET config:core:aiembed url "http://your-embedding-service:8080/embed" + +# Optional: Set authentication if required +redis-cli HSET config:core:aiembed token "your-api-token" +``` + +### Embedding Service API Contracts + +#### Ollama API (Default) +HeroDB calls Ollama using this format: + +```bash +POST http://localhost:11434/api/embeddings +Content-Type: application/json + +{ + "model": "nomic-embed-text", + "prompt": "Your text to embed" +} +``` + +Response: +```json +{ + "embedding": [0.1, 0.2, 0.3, ...] +} +``` + +#### Custom Service API +Your custom embedding service should accept POST requests with this JSON format: + +```json +{ + "texts": ["text1", "text2"], // Optional: array of texts + "images": ["base64_image1", "base64_image2"], // Optional: base64 encoded images + "model": "your-model-name" // Optional: model specification +} +``` + +And return responses in this format: + +```json +{ + "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]], // Array of embedding vectors + "model": "model-name", // Model used + "usage": { // Optional usage stats + "tokens": 100, + "requests": 2 + } +} +``` + +## Commands Reference + +### Dataset Management + +#### LANCE CREATE +Create a new vector dataset with specified dimensions and optional schema. + +```bash +LANCE CREATE <dataset> DIM <dimension> [SCHEMA field:type ...]
+``` + +**Parameters:** +- `dataset`: Name of the dataset +- `dimension`: Vector dimension (e.g., 384, 768, 1536) +- `field:type`: Optional metadata fields (string, int, float, bool) + +**Examples:** +```bash +# Create a simple dataset for 384-dimensional vectors +LANCE CREATE documents DIM 384 + +# Create dataset with metadata schema +LANCE CREATE products DIM 768 SCHEMA category:string price:float available:bool +``` + +#### LANCE LIST +List all available datasets. + +```bash +LANCE LIST +``` + +**Returns:** Array of dataset names + +#### LANCE INFO +Get information about a specific dataset. + +```bash +LANCE INFO <dataset> +``` + +**Returns:** Dataset metadata including name, version, row count, and schema + +#### LANCE DROP +Delete a dataset and all its data. + +```bash +LANCE DROP <dataset> +``` + +### Data Operations + +#### LANCE STORE +Store multimodal data (text/images) with automatic embedding generation. + +```bash +LANCE STORE <dataset> [TEXT <text>] [IMAGE <base64_image>] [key value ...] +``` + +**Parameters:** +- `dataset`: Target dataset name +- `TEXT`: Text content to embed +- `IMAGE`: Base64-encoded image to embed +- `key value`: Metadata key-value pairs + +**Examples:** +```bash +# Store text with metadata +LANCE STORE documents TEXT "Machine learning is transforming industries" category "AI" author "John Doe" + +# Store image with metadata +LANCE STORE images IMAGE "iVBORw0KGgoAAAANSUhEUgAA..." category "nature" tags "landscape,mountains" + +# Store both text and image +LANCE STORE multimodal TEXT "Beautiful sunset" IMAGE "base64data..." location "California" +``` + +**Returns:** Unique ID of the stored item + +### Search Operations + +#### LANCE SEARCH +Search using a raw vector.
+ +```bash +LANCE SEARCH <dataset> VECTOR <vector> K <k> [NPROBES <nprobes>] [REFINE <refine_factor>] +``` + +**Parameters:** +- `dataset`: Dataset to search +- `vector`: Comma-separated vector values (e.g., "0.1,0.2,0.3") +- `k`: Number of results to return +- `NPROBES`: Number of partitions to search (optional) +- `REFINE`: Refine factor for better accuracy (optional) + +**Example:** +```bash +LANCE SEARCH documents VECTOR "0.1,0.2,0.3,0.4" K 5 NPROBES 10 +``` + +#### LANCE SEARCH.TEXT +Search using text query (automatically embedded). + +```bash +LANCE SEARCH.TEXT <dataset> <query_text> K <k> [NPROBES <nprobes>] [REFINE <refine_factor>] +``` + +**Parameters:** +- `dataset`: Dataset to search +- `query_text`: Text query to search for +- `k`: Number of results to return +- `NPROBES`: Number of partitions to search (optional) +- `REFINE`: Refine factor for better accuracy (optional) + +**Example:** +```bash +LANCE SEARCH.TEXT documents "artificial intelligence applications" K 10 NPROBES 20 +``` + +**Returns:** Array of results with distance scores and metadata + +### Embedding Operations + +#### LANCE EMBED.TEXT +Generate embeddings for text without storing. + +```bash +LANCE EMBED.TEXT <text1> [text2] [text3] ... +``` + +**Example:** +```bash +LANCE EMBED.TEXT "Hello world" "Machine learning" "Vector database" +``` + +**Returns:** Array of embedding vectors + +### Index Management + +#### LANCE CREATE.INDEX +Create a vector index for faster search performance. + +```bash +LANCE CREATE.INDEX <dataset> <index_type> [PARTITIONS <num_partitions>] [SUBVECTORS <num_subvectors>] +``` + +**Parameters:** +- `dataset`: Dataset to index +- `index_type`: Index type (currently supports "IVF_PQ") +- `PARTITIONS`: Number of partitions (default: 256) +- `SUBVECTORS`: Number of sub-vectors for PQ (default: 16) + +**Example:** +```bash +LANCE CREATE.INDEX documents IVF_PQ PARTITIONS 512 SUBVECTORS 32 +``` + +## Usage Patterns + +### 1.
Document Search System + +```bash +# Setup +LANCE CREATE documents DIM 384 SCHEMA title:string content:string category:string + +# Store documents +LANCE STORE documents TEXT "Introduction to machine learning algorithms" title "ML Basics" category "education" +LANCE STORE documents TEXT "Deep learning neural networks explained" title "Deep Learning" category "education" +LANCE STORE documents TEXT "Building scalable web applications" title "Web Dev" category "programming" + +# Create index for better performance +LANCE CREATE.INDEX documents IVF_PQ PARTITIONS 256 + +# Search +LANCE SEARCH.TEXT documents "neural networks" K 5 +``` + +### 2. Image Similarity Search + +```bash +# Setup +LANCE CREATE images DIM 512 SCHEMA filename:string tags:string + +# Store images (base64 encoded) +LANCE STORE images IMAGE "iVBORw0KGgoAAAANSUhEUgAA..." filename "sunset.jpg" tags "nature,landscape" +LANCE STORE images IMAGE "iVBORw0KGgoAAAANSUhEUgBB..." filename "city.jpg" tags "urban,architecture" + +# Search by image +LANCE STORE temp_search IMAGE "query_image_base64..." +# Then use the returned ID to get embedding and search +``` + +### 3. Multimodal Content Management + +```bash +# Setup +LANCE CREATE content DIM 768 SCHEMA type:string source:string + +# Store mixed content +LANCE STORE content TEXT "Product description for smartphone" type "product" source "catalog" +LANCE STORE content IMAGE "product_image_base64..." 
type "product_image" source "catalog" + +# Search across all content types +LANCE SEARCH.TEXT content "smartphone features" K 10 +``` + +## Performance Considerations + +### Vector Dimensions +- **384**: Good for general text (e.g., sentence-transformers) +- **768**: Standard for BERT-like models +- **1536**: OpenAI text-embedding-ada-002 +- **Higher dimensions**: Better accuracy but slower search + +### Index Configuration +- **More partitions**: Better for larger datasets (>100K vectors) +- **More sub-vectors**: Better compression but slower search +- **NPROBES**: Higher values = better accuracy, slower search + +### Best Practices + +1. **Create indexes** for datasets with >1000 vectors +2. **Use appropriate dimensions** based on your embedding model +3. **Configure NPROBES** based on accuracy vs speed requirements +4. **Batch operations** when possible for better performance +5. **Monitor embedding service** response times and rate limits + +## Error Handling + +Common error scenarios and solutions: + +### Embedding Service Errors +```bash +# Error: Embedding service not configured +ERR Embedding service URL not configured. Set it with: HSET config:core:aiembed url + +# Error: Service unavailable +ERR Embedding service returned error 404 Not Found +``` + +**Solution:** Ensure embedding service is running and URL is correct. + +### Dataset Errors +```bash +# Error: Dataset doesn't exist +ERR Dataset 'mydata' does not exist + +# Error: Dimension mismatch +ERR Vector dimension mismatch: expected 384, got 768 +``` + +**Solution:** Create dataset first or check vector dimensions. + +### Search Errors +```bash +# Error: Invalid vector format +ERR Invalid vector format + +# Error: No index available +ERR No index available for fast search +``` + +**Solution:** Check vector format or create an index. 
+ +## Integration Examples + +### With Python +```python +import redis +import json + +r = redis.Redis(host='localhost', port=6379) + +# Create dataset +r.execute_command('LANCE', 'CREATE', 'docs', 'DIM', '384') + +# Store document +result = r.execute_command('LANCE', 'STORE', 'docs', + 'TEXT', 'Machine learning tutorial', + 'category', 'education') +print(f"Stored with ID: {result}") + +# Search +results = r.execute_command('LANCE', 'SEARCH.TEXT', 'docs', + 'machine learning', 'K', '5') +print(f"Search results: {results}") +``` + +### With Node.js +```javascript +const redis = require('redis'); +const client = redis.createClient(); + +// Create dataset +await client.sendCommand(['LANCE', 'CREATE', 'docs', 'DIM', '384']); + +// Store document +const id = await client.sendCommand(['LANCE', 'STORE', 'docs', + 'TEXT', 'Deep learning guide', + 'category', 'AI']); + +// Search +const results = await client.sendCommand(['LANCE', 'SEARCH.TEXT', 'docs', + 'deep learning', 'K', '10']); +``` + +## Monitoring and Maintenance + +### Health Checks +```bash +# Check if Lance store is available +LANCE LIST + +# Check dataset health +LANCE INFO mydataset + +# Test embedding service +LANCE EMBED.TEXT "test" +``` + +### Maintenance Operations +```bash +# Backup: Use standard Redis backup procedures +# The Lance data is stored separately in the data directory + +# Cleanup: Remove unused datasets +LANCE DROP old_dataset + +# Reindex: Drop and recreate indexes if needed +LANCE DROP dataset_name +LANCE CREATE dataset_name DIM 384 +# Re-import data +LANCE CREATE.INDEX dataset_name IVF_PQ +``` + +This integration provides a powerful foundation for building AI-powered applications with vector search capabilities while maintaining the familiar Redis interface. 
\ No newline at end of file diff --git a/examples/README.md b/examples/README.md index a36b993..7c8f0fa 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,6 +1,191 @@ -# HeroDB Tantivy Search Examples +# HeroDB Examples -This directory contains examples demonstrating HeroDB's full-text search capabilities powered by Tantivy. +This directory contains examples demonstrating HeroDB's capabilities including full-text search powered by Tantivy and vector database operations using Lance. + +## Available Examples + +1. **[Tantivy Search Demo](#tantivy-search-demo-bash-script)** - Full-text search capabilities +2. **[Lance Vector Database Demo](#lance-vector-database-demo-bash-script)** - Vector database and AI operations +3. **[AGE Encryption Demo](age_bash_demo.sh)** - Cryptographic operations +4. **[Simple Demo](simple_demo.sh)** - Basic Redis operations + +--- + +## Lance Vector Database Demo (Bash Script) + +### Overview +The `lance_vector_demo.sh` script provides a comprehensive demonstration of HeroDB's vector database capabilities using Lance. It showcases vector storage, similarity search, multimodal data handling, and AI-powered operations with external embedding services. + +### Prerequisites +1. **HeroDB Server**: The server must be running (default port 6379) +2. **Redis CLI**: The `redis-cli` tool must be installed and available in your PATH +3. **Embedding Service** (optional): For full functionality, set up an external embedding service + +### Running the Demo + +#### Step 1: Start HeroDB Server +```bash +# From the project root directory +cargo run -- --dir ./test_data --port 6379 +``` + +#### Step 2: Run the Demo (in a new terminal) +```bash +# From the project root directory +./examples/lance_vector_demo.sh +``` + +### What the Demo Covers + +The script demonstrates comprehensive vector database operations: + +1. 
**Dataset Management** + - Creating vector datasets with custom dimensions + - Defining schemas with metadata fields + - Listing and inspecting datasets + - Dataset information and statistics + +2. **Embedding Operations** + - Text embedding generation via external services + - Multimodal embedding support (text + images) + - Batch embedding operations + +3. **Data Storage** + - Storing text documents with automatic embedding + - Storing images with metadata + - Multimodal content storage + - Rich metadata support + +4. **Vector Search** + - Similarity search with raw vectors + - Text-based semantic search + - Configurable search parameters (K, NPROBES, REFINE) + - Cross-modal search capabilities + +5. **Index Management** + - Creating IVF_PQ indexes for performance + - Custom index parameters + - Performance optimization + +6. **Advanced Features** + - Error handling and recovery + - Performance testing concepts + - Monitoring and maintenance + - Cleanup operations + +### Key Lance Commands Demonstrated + +#### Dataset Management +```bash +# Create vector dataset +LANCE CREATE documents DIM 384 + +# Create dataset with schema +LANCE CREATE products DIM 768 SCHEMA category:string price:float available:bool + +# List datasets +LANCE LIST + +# Get dataset information +LANCE INFO documents +``` + +#### Data Operations +```bash +# Store text with metadata +LANCE STORE documents TEXT "Machine learning tutorial" category "education" author "John Doe" + +# Store image with metadata +LANCE STORE images IMAGE "base64_encoded_image..." filename "photo.jpg" tags "nature,landscape" + +# Store multimodal content +LANCE STORE content TEXT "Product description" IMAGE "base64_image..." 
type "product" +``` + +#### Search Operations +```bash +# Search with raw vector +LANCE SEARCH documents VECTOR "0.1,0.2,0.3,0.4" K 5 + +# Semantic text search +LANCE SEARCH.TEXT documents "artificial intelligence" K 10 NPROBES 20 + +# Generate embeddings +LANCE EMBED.TEXT "Hello world" "Machine learning" +``` + +#### Index Management +```bash +# Create performance index +LANCE CREATE.INDEX documents IVF_PQ PARTITIONS 256 SUBVECTORS 16 + +# Drop dataset +LANCE DROP old_dataset +``` + +### Configuration + +#### Setting Up Embedding Service +```bash +# Configure embedding service URL +redis-cli HSET config:core:aiembed url "http://your-embedding-service:8080/embed" + +# Optional: Set authentication token +redis-cli HSET config:core:aiembed token "your-api-token" +``` + +#### Embedding Service API +Your embedding service should accept POST requests: +```json +{ + "texts": ["text1", "text2"], + "images": ["base64_image1", "base64_image2"], + "model": "your-model-name" +} +``` + +And return responses: +```json +{ + "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]], + "model": "model-name", + "usage": {"tokens": 100, "requests": 2} +} +``` + +### Interactive Features + +The demo script includes: +- **Colored output** for better readability +- **Step-by-step execution** with explanations +- **Error handling** demonstrations +- **Automatic cleanup** options +- **Performance testing** concepts +- **Real-world usage** examples + +### Use Cases Demonstrated + +1. **Document Search System** + - Semantic document retrieval + - Metadata filtering + - Relevance ranking + +2. **Image Similarity Search** + - Visual content matching + - Tag-based filtering + - Multimodal queries + +3. **Product Recommendations** + - Feature-based similarity + - Category filtering + - Price range queries + +4. 
**Content Management** + - Mixed media storage + - Cross-modal search + - Rich metadata support + +--- ## Tantivy Search Demo (Bash Script) diff --git a/examples/lance_vector_demo.sh b/examples/lance_vector_demo.sh new file mode 100755 index 0000000..2fc9162 --- /dev/null +++ b/examples/lance_vector_demo.sh @@ -0,0 +1,426 @@ +#!/bin/bash + +# Lance Vector Database Demo Script +# This script demonstrates all Lance vector database operations in HeroDB + +set -e # Exit on any error + +# Configuration +REDIS_HOST="localhost" +REDIS_PORT="6379" +REDIS_CLI="redis-cli -h $REDIS_HOST -p $REDIS_PORT" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Helper functions +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +execute_command() { + local cmd="$1" + local description="$2" + + echo + log_info "Executing: $description" + echo "Command: $cmd" + + if result=$($cmd 2>&1); then + log_success "Result: $result" + else + log_error "Failed: $result" + return 1 + fi +} + +# Check if HeroDB is running +check_herodb() { + log_info "Checking if HeroDB is running..." + if ! $REDIS_CLI ping > /dev/null 2>&1; then + log_error "HeroDB is not running. Please start it first:" + echo " cargo run -- --dir ./test_data --port $REDIS_PORT" + exit 1 + fi + log_success "HeroDB is running" +} + +# Setup embedding service configuration +setup_embedding_service() { + log_info "Setting up embedding service configuration..." 
+ + # Note: This is a mock URL for demonstration + # In production, replace with your actual embedding service + execute_command \ + "$REDIS_CLI HSET config:core:aiembed url 'http://localhost:8080/embed'" \ + "Configure embedding service URL" + + # Optional: Set authentication token + # execute_command \ + # "$REDIS_CLI HSET config:core:aiembed token 'your-api-token'" \ + # "Configure embedding service token" + + log_warning "Note: Embedding service at http://localhost:8080/embed is not running." + log_warning "Some operations will fail, but this demonstrates the command structure." +} + +# Dataset Management Operations +demo_dataset_management() { + echo + echo "==========================================" + echo " DATASET MANAGEMENT DEMO" + echo "==========================================" + + # List datasets (should be empty initially) + execute_command \ + "$REDIS_CLI LANCE LIST" \ + "List all datasets (initially empty)" + + # Create a simple dataset + execute_command \ + "$REDIS_CLI LANCE CREATE documents DIM 384" \ + "Create a simple document dataset with 384 dimensions" + + # Create a dataset with schema + execute_command \ + "$REDIS_CLI LANCE CREATE products DIM 768 SCHEMA category:string price:float available:bool description:string" \ + "Create products dataset with custom schema" + + # Create an image dataset + execute_command \ + "$REDIS_CLI LANCE CREATE images DIM 512 SCHEMA filename:string tags:string width:int height:int" \ + "Create images dataset for multimodal content" + + # List datasets again + execute_command \ + "$REDIS_CLI LANCE LIST" \ + "List all datasets (should show 3 datasets)" + + # Get info about datasets + execute_command \ + "$REDIS_CLI LANCE INFO documents" \ + "Get information about documents dataset" + + execute_command \ + "$REDIS_CLI LANCE INFO products" \ + "Get information about products dataset" +} + +# Embedding Operations +demo_embedding_operations() { + echo + echo "==========================================" + echo " 
EMBEDDING OPERATIONS DEMO" + echo "==========================================" + + log_warning "The following operations will fail because no embedding service is running." + log_warning "This demonstrates the command structure and error handling." + + # Try to embed text (will fail without embedding service) + execute_command \ + "$REDIS_CLI LANCE EMBED.TEXT 'Hello world'" \ + "Generate embedding for single text" || true + + # Try to embed multiple texts + execute_command \ + "$REDIS_CLI LANCE EMBED.TEXT 'Machine learning' 'Artificial intelligence' 'Deep learning'" \ + "Generate embeddings for multiple texts" || true +} + +# Data Storage Operations +demo_data_storage() { + echo + echo "==========================================" + echo " DATA STORAGE DEMO" + echo "==========================================" + + log_warning "Storage operations will fail without embedding service, but show command structure." + + # Store text documents + execute_command \ + "$REDIS_CLI LANCE STORE documents TEXT 'Introduction to machine learning algorithms and their applications in modern AI systems' category 'education' author 'John Doe' difficulty 'beginner'" \ + "Store a document with text and metadata" || true + + execute_command \ + "$REDIS_CLI LANCE STORE documents TEXT 'Deep learning neural networks for computer vision tasks' category 'research' author 'Jane Smith' difficulty 'advanced'" \ + "Store another document" || true + + # Store product information + execute_command \ + "$REDIS_CLI LANCE STORE products TEXT 'High-performance laptop with 16GB RAM and SSD storage' category 'electronics' price '1299.99' available 'true'" \ + "Store product with text description" || true + + # Store image with metadata (using placeholder base64) + execute_command \ + "$REDIS_CLI LANCE STORE images IMAGE 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==' filename 'sample.png' tags 'test,demo' width '1' height '1'" \ + "Store image with 
metadata (1x1 pixel PNG)" || true + + # Store multimodal content + execute_command \ + "$REDIS_CLI LANCE STORE images TEXT 'Beautiful sunset over mountains' IMAGE 'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg==' filename 'sunset.png' tags 'nature,landscape' location 'California'" \ + "Store multimodal content (text + image)" || true +} + +# Search Operations +demo_search_operations() { + echo + echo "==========================================" + echo " SEARCH OPERATIONS DEMO" + echo "==========================================" + + log_warning "Search operations will fail without data, but show command structure." + + # Search with raw vector + execute_command \ + "$REDIS_CLI LANCE SEARCH documents VECTOR '0.1,0.2,0.3,0.4,0.5' K 5" \ + "Search with raw vector (5 results)" || true + + # Search with vector and parameters + execute_command \ + "$REDIS_CLI LANCE SEARCH documents VECTOR '0.1,0.2,0.3,0.4,0.5' K 10 NPROBES 20 REFINE 2" \ + "Search with vector and advanced parameters" || true + + # Text-based search + execute_command \ + "$REDIS_CLI LANCE SEARCH.TEXT documents 'machine learning algorithms' K 5" \ + "Search using text query" || true + + # Text search with parameters + execute_command \ + "$REDIS_CLI LANCE SEARCH.TEXT products 'laptop computer' K 3 NPROBES 10" \ + "Search products using text with parameters" || true + + # Search in image dataset + execute_command \ + "$REDIS_CLI LANCE SEARCH.TEXT images 'sunset landscape' K 5" \ + "Search images using text description" || true +} + +# Index Management Operations +demo_index_management() { + echo + echo "==========================================" + echo " INDEX MANAGEMENT DEMO" + echo "==========================================" + + # Create indexes for better search performance + execute_command \ + "$REDIS_CLI LANCE CREATE.INDEX documents IVF_PQ" \ + "Create default IVF_PQ index for documents" + + execute_command \ + "$REDIS_CLI LANCE CREATE.INDEX products 
IVF_PQ PARTITIONS 512 SUBVECTORS 32" \ + "Create IVF_PQ index with custom parameters for products" + + execute_command \ + "$REDIS_CLI LANCE CREATE.INDEX images IVF_PQ PARTITIONS 256 SUBVECTORS 16" \ + "Create IVF_PQ index for images dataset" + + log_success "Indexes created successfully" +} + +# Advanced Usage Examples +demo_advanced_usage() { + echo + echo "==========================================" + echo " ADVANCED USAGE EXAMPLES" + echo "==========================================" + + # Create a specialized dataset for semantic search + execute_command \ + "$REDIS_CLI LANCE CREATE semantic_search DIM 1536 SCHEMA title:string content:string url:string timestamp:string source:string" \ + "Create dataset for semantic search with rich metadata" + + # Demonstrate batch operations concept + log_info "Batch operations example (would store multiple items):" + echo " for doc in documents:" + echo " LANCE STORE semantic_search TEXT \"\$doc_content\" title \"\$title\" url \"\$url\"" + + # Show monitoring commands + log_info "Monitoring and maintenance commands:" + execute_command \ + "$REDIS_CLI LANCE LIST" \ + "List all datasets for monitoring" + + # Show dataset statistics + for dataset in documents products images semantic_search; do + execute_command \ + "$REDIS_CLI LANCE INFO $dataset" \ + "Get statistics for $dataset" || true + done +} + +# Cleanup Operations +demo_cleanup() { + echo + echo "==========================================" + echo " CLEANUP OPERATIONS DEMO" + echo "==========================================" + + log_info "Demonstrating cleanup operations..." + + # Drop individual datasets + execute_command \ + "$REDIS_CLI LANCE DROP semantic_search" \ + "Drop semantic_search dataset" + + # List remaining datasets + execute_command \ + "$REDIS_CLI LANCE LIST" \ + "List remaining datasets" + + # Ask user if they want to clean up all test data + echo + read -p "Do you want to clean up all test datasets? 
(y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + execute_command \ + "$REDIS_CLI LANCE DROP documents" \ + "Drop documents dataset" + + execute_command \ + "$REDIS_CLI LANCE DROP products" \ + "Drop products dataset" + + execute_command \ + "$REDIS_CLI LANCE DROP images" \ + "Drop images dataset" + + execute_command \ + "$REDIS_CLI LANCE LIST" \ + "Verify all datasets are cleaned up" + + log_success "All test datasets cleaned up" + else + log_info "Keeping test datasets for further experimentation" + fi +} + +# Error Handling Demo +demo_error_handling() { + echo + echo "==========================================" + echo " ERROR HANDLING DEMO" + echo "==========================================" + + log_info "Demonstrating various error conditions..." + + # Try to access non-existent dataset + execute_command \ + "$REDIS_CLI LANCE INFO nonexistent_dataset" \ + "Try to get info for non-existent dataset" || true + + # Try to search non-existent dataset + execute_command \ + "$REDIS_CLI LANCE SEARCH nonexistent_dataset VECTOR '0.1,0.2' K 5" \ + "Try to search non-existent dataset" || true + + # Try to drop non-existent dataset + execute_command \ + "$REDIS_CLI LANCE DROP nonexistent_dataset" \ + "Try to drop non-existent dataset" || true + + # Try invalid vector format + execute_command \ + "$REDIS_CLI LANCE SEARCH documents VECTOR 'invalid,vector,format' K 5" \ + "Try search with invalid vector format" || true + + log_info "Error handling demonstration complete" +} + +# Performance Testing Demo +demo_performance_testing() { + echo + echo "==========================================" + echo " PERFORMANCE TESTING DEMO" + echo "==========================================" + + log_info "Creating performance test dataset..." + execute_command \ + "$REDIS_CLI LANCE CREATE perf_test DIM 128 SCHEMA batch_id:string item_id:string" \ + "Create performance test dataset" + + log_info "Performance testing would involve:" + echo " 1. 
Bulk loading thousands of vectors" + echo " 2. Creating indexes with different parameters" + echo " 3. Measuring search latency with various K values" + echo " 4. Testing different NPROBES settings" + echo " 5. Monitoring memory usage" + + log_info "Example performance test commands:" + echo " # Test search speed with different parameters" + echo " time redis-cli LANCE SEARCH.TEXT perf_test 'query' K 10" + echo " time redis-cli LANCE SEARCH.TEXT perf_test 'query' K 10 NPROBES 50" + echo " time redis-cli LANCE SEARCH.TEXT perf_test 'query' K 100 NPROBES 100" + + # Clean up performance test dataset + execute_command \ + "$REDIS_CLI LANCE DROP perf_test" \ + "Clean up performance test dataset" +} + +# Main execution +main() { + echo "==========================================" + echo " LANCE VECTOR DATABASE DEMO SCRIPT" + echo "==========================================" + echo + echo "This script demonstrates all Lance vector database operations." + echo "Note: Some operations will fail without a running embedding service." + echo "This is expected and demonstrates error handling." + echo + + # Check prerequisites + check_herodb + + # Setup + setup_embedding_service + + # Run demos + demo_dataset_management + demo_embedding_operations + demo_data_storage + demo_search_operations + demo_index_management + demo_advanced_usage + demo_error_handling + demo_performance_testing + + # Cleanup + demo_cleanup + + echo + echo "==========================================" + echo " DEMO COMPLETE" + echo "==========================================" + echo + log_success "Lance vector database demo completed successfully!" + echo + echo "Next steps:" + echo "1. Set up a real embedding service (OpenAI, Hugging Face, etc.)" + echo "2. Update the embedding service URL configuration" + echo "3. Try storing and searching real data" + echo "4. Experiment with different vector dimensions and index parameters" + echo "5. Build your AI-powered application!" 
+ echo + echo "For more information, see docs/lance_vector_db.md" +} + +# Run the demo +main "$@" \ No newline at end of file diff --git a/src/cmd.rs b/src/cmd.rs index f78dc08..d79169b 100644 --- a/src/cmd.rs +++ b/src/cmd.rs @@ -1,6 +1,8 @@ use crate::{error::DBError, protocol::Protocol, server::Server}; use tokio::time::{timeout, Duration}; use futures::future::select_all; +use std::sync::Arc; +use base64::Engine; #[derive(Debug, Clone)] pub enum Cmd { @@ -1006,11 +1008,11 @@ impl Cmd { Cmd::AgeList => Ok(crate::age::cmd_age_list(server).await), // Lance vector database commands - Cmd::LanceCreate { dataset, dim, schema } => lance_create_cmd(server, &dataset, *dim, &schema).await, - Cmd::LanceStore { dataset, text, image_base64, metadata } => lance_store_cmd(server, &dataset, text.as_deref(), image_base64.as_deref(), metadata).await, - Cmd::LanceSearch { dataset, vector, k, nprobes, refine_factor } => lance_search_cmd(server, &dataset, vector, *k, nprobes, refine_factor).await, - Cmd::LanceSearchText { dataset, query_text, k, nprobes, refine_factor } => lance_search_text_cmd(server, &dataset, &query_text, *k, nprobes, refine_factor).await, - Cmd::LanceEmbedText { texts } => lance_embed_text_cmd(server, texts).await, + Cmd::LanceCreate { dataset, dim, schema } => lance_create_cmd(server, &dataset, dim, &schema).await, + Cmd::LanceStore { dataset, text, image_base64, metadata } => lance_store_cmd(server, &dataset, text.as_deref(), image_base64.as_deref(), &metadata).await, + Cmd::LanceSearch { dataset, vector, k, nprobes, refine_factor } => lance_search_cmd(server, &dataset, &vector, k, nprobes, refine_factor).await, + Cmd::LanceSearchText { dataset, query_text, k, nprobes, refine_factor } => lance_search_text_cmd(server, &dataset, &query_text, k, nprobes, refine_factor).await, + Cmd::LanceEmbedText { texts } => lance_embed_text_cmd(server, &texts).await, Cmd::LanceCreateIndex { dataset, index_type, num_partitions, num_sub_vectors } => 
lance_create_index_cmd(server, &dataset, &index_type, num_partitions, num_sub_vectors).await, Cmd::LanceList => lance_list_cmd(server).await, Cmd::LanceDrop { dataset } => lance_drop_cmd(server, &dataset).await, @@ -1800,6 +1802,36 @@ fn command_cmd(args: &[String]) -> Result { } } +// Helper function to create Arrow schema from field specifications +fn create_schema_from_fields(dim: usize, fields: &[(String, String)]) -> arrow::datatypes::Schema { + let mut schema_fields = Vec::new(); + + // Always add the vector field first + let vector_field = arrow::datatypes::Field::new( + "vector", + arrow::datatypes::DataType::FixedSizeList( + Arc::new(arrow::datatypes::Field::new("item", arrow::datatypes::DataType::Float32, true)), + dim as i32 + ), + false + ); + schema_fields.push(vector_field); + + // Add custom fields + for (name, field_type) in fields { + let data_type = match field_type.to_lowercase().as_str() { + "string" | "text" => arrow::datatypes::DataType::Utf8, + "int" | "integer" => arrow::datatypes::DataType::Int64, + "float" => arrow::datatypes::DataType::Float64, + "bool" | "boolean" => arrow::datatypes::DataType::Boolean, + _ => arrow::datatypes::DataType::Utf8, // Default to string + }; + schema_fields.push(arrow::datatypes::Field::new(name, data_type, true)); + } + + arrow::datatypes::Schema::new(schema_fields) +} + // Lance vector database command implementations async fn lance_create_cmd( server: &Server, @@ -1809,12 +1841,12 @@ async fn lance_create_cmd( ) -> Result { match server.lance_store() { Ok(lance_store) => { - match lance_store.create_dataset(dataset, dim, schema.to_vec()).await { + match lance_store.create_dataset(dataset, create_schema_from_fields(dim, schema)).await { Ok(_) => Ok(Protocol::SimpleString("OK".to_string())), - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + 
Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } @@ -1827,12 +1859,14 @@ async fn lance_store_cmd( ) -> Result { match server.lance_store() { Ok(lance_store) => { - match lance_store.store_data(dataset, text, image_base64, metadata.clone()).await { + match lance_store.store_multimodal(server, dataset, text.map(|s| s.to_string()), + image_base64.and_then(|s| base64::engine::general_purpose::STANDARD.decode(s).ok()), + metadata.clone()).await { Ok(id) => Ok(Protocol::BulkString(id)), - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } @@ -1846,24 +1880,14 @@ async fn lance_search_cmd( ) -> Result { match server.lance_store() { Ok(lance_store) => { - match lance_store.search_vector(dataset, vector, k, nprobes, refine_factor).await { + match lance_store.search_vectors(dataset, vector.to_vec(), k, nprobes, refine_factor).await { Ok(results) => { let mut response = Vec::new(); - for result in results { + for (distance, metadata) in results { let mut item = Vec::new(); - item.push(Protocol::BulkString("id".to_string())); - item.push(Protocol::BulkString(result.id)); - item.push(Protocol::BulkString("score".to_string())); - item.push(Protocol::BulkString(result.score.to_string())); - if let Some(text) = result.text { - item.push(Protocol::BulkString("text".to_string())); - item.push(Protocol::BulkString(text)); - } - if let Some(image) = result.image_base64 { - item.push(Protocol::BulkString("image".to_string())); - item.push(Protocol::BulkString(image)); - } - for (key, value) in result.metadata { + item.push(Protocol::BulkString("distance".to_string())); + item.push(Protocol::BulkString(distance.to_string())); + for (key, value) in metadata { 
item.push(Protocol::BulkString(key)); item.push(Protocol::BulkString(value)); } @@ -1871,10 +1895,10 @@ async fn lance_search_cmd( } Ok(Protocol::Array(response)) } - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } @@ -1888,24 +1912,14 @@ async fn lance_search_text_cmd( ) -> Result { match server.lance_store() { Ok(lance_store) => { - match lance_store.search_text(dataset, query_text, k, nprobes, refine_factor).await { + match lance_store.search_with_text(server, dataset, query_text.to_string(), k, nprobes, refine_factor).await { Ok(results) => { let mut response = Vec::new(); - for result in results { + for (distance, metadata) in results { let mut item = Vec::new(); - item.push(Protocol::BulkString("id".to_string())); - item.push(Protocol::BulkString(result.id)); - item.push(Protocol::BulkString("score".to_string())); - item.push(Protocol::BulkString(result.score.to_string())); - if let Some(text) = result.text { - item.push(Protocol::BulkString("text".to_string())); - item.push(Protocol::BulkString(text)); - } - if let Some(image) = result.image_base64 { - item.push(Protocol::BulkString("image".to_string())); - item.push(Protocol::BulkString(image)); - } - for (key, value) in result.metadata { + item.push(Protocol::BulkString("distance".to_string())); + item.push(Protocol::BulkString(distance.to_string())); + for (key, value) in metadata { item.push(Protocol::BulkString(key)); item.push(Protocol::BulkString(value)); } @@ -1913,10 +1927,26 @@ async fn lance_search_text_cmd( } Ok(Protocol::Array(response)) } - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance 
store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), + } +} + +// Helper function to sanitize error messages for Redis protocol +fn sanitize_error_message(msg: &str) -> String { + // Remove newlines, carriage returns, and limit length + let sanitized = msg + .replace('\n', " ") + .replace('\r', " ") + .replace('\t', " "); + + // Limit to 200 characters to avoid overly long error messages + if sanitized.len() > 200 { + format!("{}...", &sanitized[..197]) + } else { + sanitized } } @@ -1926,7 +1956,7 @@ async fn lance_embed_text_cmd( ) -> Result { match server.lance_store() { Ok(lance_store) => { - match lance_store.embed_texts(texts).await { + match lance_store.embed_text(server, texts.to_vec()).await { Ok(embeddings) => { let mut response = Vec::new(); for embedding in embeddings { @@ -1938,10 +1968,10 @@ async fn lance_embed_text_cmd( } Ok(Protocol::Array(response)) } - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } @@ -1956,10 +1986,10 @@ async fn lance_create_index_cmd( Ok(lance_store) => { match lance_store.create_index(dataset, index_type, num_partitions, num_sub_vectors).await { Ok(_) => Ok(Protocol::SimpleString("OK".to_string())), - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } @@ -1974,10 +2004,10 @@ async fn lance_list_cmd(server: &Server) -> Result { .collect(); Ok(Protocol::Array(response)) } - Err(e) => 
Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } @@ -1986,41 +2016,28 @@ async fn lance_drop_cmd(server: &Server, dataset: &str) -> Result { match lance_store.drop_dataset(dataset).await { Ok(_) => Ok(Protocol::SimpleString("OK".to_string())), - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } async fn lance_info_cmd(server: &Server, dataset: &str) -> Result { match server.lance_store() { Ok(lance_store) => { - match lance_store.dataset_info(dataset).await { + match lance_store.get_dataset_info(dataset).await { Ok(info) => { let mut response = Vec::new(); - response.push(Protocol::BulkString("name".to_string())); - response.push(Protocol::BulkString(info.name)); - response.push(Protocol::BulkString("dimension".to_string())); - response.push(Protocol::BulkString(info.dimension.to_string())); - response.push(Protocol::BulkString("num_rows".to_string())); - response.push(Protocol::BulkString(info.num_rows.to_string())); - response.push(Protocol::BulkString("schema".to_string())); - let schema_items: Vec = info.schema - .into_iter() - .map(|(field, field_type)| { - Protocol::Array(vec![ - Protocol::BulkString(field), - Protocol::BulkString(field_type), - ]) - }) - .collect(); - response.push(Protocol::Array(schema_items)); + for (key, value) in info { + response.push(Protocol::BulkString(key)); + response.push(Protocol::BulkString(value)); + } Ok(Protocol::Array(response)) } - Err(e) => Ok(Protocol::err(&format!("ERR {}", e))), + 
Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR {}", e)))), } } - Err(e) => Ok(Protocol::err(&format!("ERR Lance store not available: {}", e))), + Err(e) => Ok(Protocol::err(&sanitize_error_message(&format!("ERR Lance store not available: {}", e)))), } } diff --git a/src/lance_store.rs b/src/lance_store.rs index 504c82b..96a9986 100644 --- a/src/lance_store.rs +++ b/src/lance_store.rs @@ -3,9 +3,10 @@ use std::path::PathBuf; use std::sync::Arc; use tokio::sync::RwLock; -use arrow::array::{Float32Array, StringArray, ArrayRef, FixedSizeListArray}; -use arrow::datatypes::{DataType, Field, Schema, FieldRef}; -use arrow::record_batch::RecordBatch; +use arrow::array::{Float32Array, StringArray, ArrayRef, FixedSizeListArray, Array}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::record_batch::{RecordBatch, RecordBatchReader}; +use arrow::error::ArrowError; use lance::dataset::{Dataset, WriteParams, WriteMode}; use lance::index::vector::VectorIndexParams; use lance_index::vector::pq::PQBuildParams; @@ -13,10 +14,39 @@ use lance_index::vector::ivf::IvfBuildParams; use lance_index::DatasetIndexExt; use lance_linalg::distance::MetricType; use futures::TryStreamExt; +use base64::Engine; use serde::{Deserialize, Serialize}; use crate::error::DBError; -use crate::protocol::Protocol; + +// Simple RecordBatchReader implementation for Vec +struct VecRecordBatchReader { + batches: std::vec::IntoIter>, +} + +impl VecRecordBatchReader { + fn new(batches: Vec) -> Self { + let result_batches = batches.into_iter().map(Ok).collect::>(); + Self { + batches: result_batches.into_iter(), + } + } +} + +impl Iterator for VecRecordBatchReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.batches.next() + } +} + +impl RecordBatchReader for VecRecordBatchReader { + fn schema(&self) -> SchemaRef { + // This is a simplified implementation - in practice you'd want to store the schema + Arc::new(Schema::empty()) + } +} #[derive(Debug, 
Serialize, Deserialize)] struct EmbeddingRequest { @@ -32,6 +62,18 @@ struct EmbeddingResponse { usage: Option>, } +// Ollama-specific request/response structures +#[derive(Debug, Serialize, Deserialize)] +struct OllamaEmbeddingRequest { + model: String, + prompt: String, +} + +#[derive(Debug, Serialize, Deserialize)] +struct OllamaEmbeddingResponse { + embedding: Vec, +} + pub struct LanceStore { datasets: Arc>>>, data_dir: PathBuf, @@ -56,64 +98,104 @@ impl LanceStore { }) } - /// Get embedding service URL from Redis config + /// Get embedding service URL from Redis config, default to local Ollama async fn get_embedding_url(&self, server: &crate::server::Server) -> Result { - // Get the embedding URL from Redis config - let key = "config:core:aiembed:url"; - - // Use HGET to retrieve the URL from Redis hash - let cmd = crate::cmd::Cmd::HGet(key.to_string(), "url".to_string()); - - // Execute command to get the config - let result = cmd.run(&mut server.clone()).await?; - - match result { - Protocol::BulkString(url) => Ok(url), - Protocol::SimpleString(url) => Ok(url), - Protocol::Nil => Err(DBError( - "Embedding service URL not configured. Set it with: HSET config:core:aiembed:url url ".to_string() - )), - _ => Err(DBError("Invalid embedding URL configuration".to_string())), + // Get the embedding URL from Redis config directly from storage + let storage = server.current_storage()?; + match storage.hget("config:core:aiembed", "url")? 
{ + Some(url) => Ok(url), + None => Ok("http://localhost:11434".to_string()), // Default to local Ollama } } - /// Call external embedding service + /// Check if we're using Ollama (default) or custom embedding service + async fn is_ollama_service(&self, server: &crate::server::Server) -> Result { + let url = self.get_embedding_url(server).await?; + Ok(url.contains("localhost:11434") || url.contains("127.0.0.1:11434")) + } + + /// Call external embedding service (Ollama or custom) async fn call_embedding_service( &self, server: &crate::server::Server, texts: Option>, images: Option>, ) -> Result>, DBError> { - let url = self.get_embedding_url(server).await?; + let base_url = self.get_embedding_url(server).await?; + let is_ollama = self.is_ollama_service(server).await?; - let request = EmbeddingRequest { - texts, - images, - model: None, // Let the service use its default - }; - - let response = self.http_client - .post(&url) - .json(&request) - .send() - .await - .map_err(|e| DBError(format!("Failed to call embedding service: {}", e)))?; - - if !response.status().is_success() { - let status = response.status(); - let error_text = response.text().await.unwrap_or_default(); - return Err(DBError(format!( - "Embedding service returned error {}: {}", - status, error_text - ))); + if is_ollama { + // Use Ollama API format + if let Some(texts) = texts { + let mut embeddings = Vec::new(); + for text in texts { + let url = format!("{}/api/embeddings", base_url); + let request = OllamaEmbeddingRequest { + model: "nomic-embed-text".to_string(), + prompt: text, + }; + + let response = self.http_client + .post(&url) + .json(&request) + .send() + .await + .map_err(|e| DBError(format!("Failed to call Ollama embedding service: {}", e)))?; + + if !response.status().is_success() { + let status = response.status(); + let error_text = response.text().await.unwrap_or_default(); + return Err(DBError(format!( + "Ollama embedding service returned error {}: {}", + status, error_text + ))); 
+ } + + let ollama_response: OllamaEmbeddingResponse = response + .json() + .await + .map_err(|e| DBError(format!("Failed to parse Ollama embedding response: {}", e)))?; + + embeddings.push(ollama_response.embedding); + } + Ok(embeddings) + } else if let Some(_images) = images { + // Ollama doesn't support image embeddings with this API yet + Err(DBError("Image embeddings not supported with Ollama. Please configure a custom embedding service.".to_string())) + } else { + Err(DBError("No text or images provided for embedding".to_string())) + } + } else { + // Use custom embedding service API format + let request = EmbeddingRequest { + texts, + images, + model: None, // Let the service use its default + }; + + let response = self.http_client + .post(&base_url) + .json(&request) + .send() + .await + .map_err(|e| DBError(format!("Failed to call embedding service: {}", e)))?; + + if !response.status().is_success() { + let status = response.status(); + let error_text = response.text().await.unwrap_or_default(); + return Err(DBError(format!( + "Embedding service returned error {}: {}", + status, error_text + ))); + } + + let embedding_response: EmbeddingResponse = response + .json() + .await + .map_err(|e| DBError(format!("Failed to parse embedding response: {}", e)))?; + + Ok(embedding_response.embeddings) } - - let embedding_response: EmbeddingResponse = response - .json() - .await - .map_err(|e| DBError(format!("Failed to parse embedding response: {}", e)))?; - - Ok(embedding_response.embeddings) } pub async fn embed_text( @@ -162,10 +244,11 @@ impl LanceStore { // Create an empty RecordBatch with the schema let empty_batch = RecordBatch::new_empty(Arc::new(schema)); - let batches = vec![empty_batch]; + // Use RecordBatchReader for Lance 0.33 + let reader = VecRecordBatchReader::new(vec![empty_batch]); let dataset = Dataset::write( - batches, + reader, dataset_path.to_str().unwrap(), Some(write_params) ).await @@ -186,7 +269,7 @@ impl LanceStore { let dataset_path = 
self.data_dir.join(format!("{}.lance", dataset_name)); // Open or get cached dataset - let dataset = self.get_or_open_dataset(dataset_name).await?; + let _dataset = self.get_or_open_dataset(dataset_name).await?; // Build RecordBatch let num_vectors = vectors.len(); @@ -200,10 +283,13 @@ impl LanceStore { // Flatten vectors let flat_vectors: Vec = vectors.into_iter().flatten().collect(); - let vector_array = Float32Array::from(flat_vectors); - let vector_array = arrow::array::FixedSizeListArray::try_new_from_values( - vector_array, - dim as i32 + let values_array = Float32Array::from(flat_vectors); + let field = Arc::new(Field::new("item", DataType::Float32, true)); + let vector_array = FixedSizeListArray::try_new( + field, + dim as i32, + Arc::new(values_array), + None ).map_err(|e| DBError(format!("Failed to create vector array: {}", e)))?; let mut arrays: Vec = vec![Arc::new(vector_array)]; @@ -241,8 +327,9 @@ impl LanceStore { ..Default::default() }; + let reader = VecRecordBatchReader::new(vec![batch]); Dataset::write( - vec![batch], + reader, dataset_path.to_str().unwrap(), Some(write_params) ).await @@ -261,25 +348,27 @@ impl LanceStore { query_vector: Vec, k: usize, nprobes: Option, - refine_factor: Option, + _refine_factor: Option, ) -> Result)>, DBError> { let dataset = self.get_or_open_dataset(dataset_name).await?; // Build query + let query_array = Float32Array::from(query_vector.clone()); let mut query = dataset.scan(); - query = query.nearest( + query.nearest( "vector", - &query_vector, + &query_array, k, ).map_err(|e| DBError(format!("Failed to build search query: {}", e)))?; if let Some(nprobes) = nprobes { - query = query.nprobes(nprobes); + query.nprobs(nprobes); } - if let Some(refine) = refine_factor { - query = query.refine_factor(refine); - } + // Note: refine_factor might not be available in this Lance version + // if let Some(refine) = refine_factor { + // query.refine_factor(refine); + // } // Execute search let results = query @@ -399,33 
+488,41 @@ impl LanceStore { num_partitions: Option, num_sub_vectors: Option, ) -> Result<(), DBError> { - let dataset = self.get_or_open_dataset(dataset_name).await?; - - let mut params = VectorIndexParams::default(); + let _dataset = self.get_or_open_dataset(dataset_name).await?; match index_type.to_uppercase().as_str() { "IVF_PQ" => { - params.ivf = IvfBuildParams { + let ivf_params = IvfBuildParams { num_partitions: num_partitions.unwrap_or(256), ..Default::default() }; - params.pq = PQBuildParams { + let pq_params = PQBuildParams { num_sub_vectors: num_sub_vectors.unwrap_or(16), ..Default::default() }; + let params = VectorIndexParams::with_ivf_pq_params( + MetricType::L2, + ivf_params, + pq_params, + ); + + // Get a mutable reference to the dataset + let mut dataset_mut = Dataset::open(self.data_dir.join(format!("{}.lance", dataset_name)).to_str().unwrap()) + .await + .map_err(|e| DBError(format!("Failed to open dataset for indexing: {}", e)))?; + + dataset_mut.create_index( + &["vector"], + lance_index::IndexType::Vector, + None, + ¶ms, + true + ).await + .map_err(|e| DBError(format!("Failed to create index: {}", e)))?; } _ => return Err(DBError(format!("Unsupported index type: {}", index_type))), } - dataset.create_index( - &["vector"], - lance::index::IndexType::Vector, - None, - ¶ms, - true - ).await - .map_err(|e| DBError(format!("Failed to create index: {}", e)))?; - Ok(()) } @@ -496,14 +593,14 @@ impl LanceStore { let mut info = HashMap::new(); info.insert("name".to_string(), name.to_string()); - info.insert("version".to_string(), dataset.version().to_string()); - info.insert("num_rows".to_string(), dataset.count_rows().await?.to_string()); + info.insert("version".to_string(), dataset.version().version.to_string()); + info.insert("num_rows".to_string(), dataset.count_rows(None).await?.to_string()); // Get schema info let schema = dataset.schema(); - let fields: Vec = schema.fields() + let fields: Vec = schema.fields .iter() - .map(|f| format!("{}:{}", 
f.name(), f.data_type())) + .map(|f| format!("{}:{}", f.name, f.data_type())) .collect(); info.insert("schema".to_string(), fields.join(", "));