From 189971509abf47372e91f0b49c1b1d417d6a5e41 Mon Sep 17 00:00:00 2001 From: despiegk Date: Sun, 20 Apr 2025 06:34:31 +0200 Subject: [PATCH] ... --- herodb/src/models/biz/product.rs | 5 - ourdb/Cargo.lock | 95 +++++- ourdb/Cargo.toml | 1 + ourdb/README.md | 2 +- ourdb/benches/ourdb_benchmarks.rs | 277 ++++++++++++++++ radixtree/README.md | 6 +- radixtree/examples/large_scale_test.rs | 121 +++++++ radixtree/examples/performance_test.rs | 134 ++++++++ radixtree/src/operations.rs | 4 +- tst/Cargo.lock | 180 +++++++++++ tst/Cargo.toml | 31 ++ tst/README.md | 183 +++++++++++ tst/examples/basic_usage.rs | 76 +++++ tst/examples/performance.rs | 134 ++++++++ tst/examples/prefix_ops.rs | 123 ++++++++ tst/src/error.rs | 36 +++ tst/src/lib.rs | 122 ++++++++ tst/src/node.rs | 49 +++ tst/src/operations.rs | 418 +++++++++++++++++++++++++ tst/src/serialize.rs | 134 ++++++++ tst/tests/basic_test.rs | 215 +++++++++++++ tst/tests/prefix_test.rs | 215 +++++++++++++ tst_implementation_plan.md | 365 +++++++++++++++++++++ 23 files changed, 2913 insertions(+), 13 deletions(-) create mode 100644 ourdb/benches/ourdb_benchmarks.rs create mode 100644 radixtree/examples/large_scale_test.rs create mode 100644 radixtree/examples/performance_test.rs create mode 100644 tst/Cargo.lock create mode 100644 tst/Cargo.toml create mode 100644 tst/README.md create mode 100644 tst/examples/basic_usage.rs create mode 100644 tst/examples/performance.rs create mode 100644 tst/examples/prefix_ops.rs create mode 100644 tst/src/error.rs create mode 100644 tst/src/lib.rs create mode 100644 tst/src/node.rs create mode 100644 tst/src/operations.rs create mode 100644 tst/src/serialize.rs create mode 100644 tst/tests/basic_test.rs create mode 100644 tst/tests/prefix_test.rs create mode 100644 tst_implementation_plan.md diff --git a/herodb/src/models/biz/product.rs b/herodb/src/models/biz/product.rs index bc3d793..a08e757 100644 --- a/herodb/src/models/biz/product.rs +++ b/herodb/src/models/biz/product.rs @@ -117,13 
+117,8 @@ impl ProductComponentBuilder { } } -<<<<<<< HEAD /// Product represents a product or service offered in the system #[derive(Debug, Clone, Serialize, Deserialize)] -======= -/// Product represents a product or service offered by the Freezone -#[derive(Debug, Clone, Serialize, Deserialize, CustomType)] ->>>>>>> builders_in_script pub struct Product { pub id: i64, pub name: String, diff --git a/ourdb/Cargo.lock b/ourdb/Cargo.lock index be4032e..6126ce6 100644 --- a/ourdb/Cargo.lock +++ b/ourdb/Cargo.lock @@ -29,6 +29,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "bitflags" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" + [[package]] name = "bumpalo" version = "3.17.0" @@ -181,6 +187,22 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "errno" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "getrandom" version = "0.2.15" @@ -189,7 +211,19 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", ] [[package]] @@ -250,6 +284,12 @@ version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + [[package]] name = "log" version = "0.4.27" @@ -291,6 +331,7 @@ dependencies = [ "criterion", "log", "rand", + "tempfile", "thiserror", ] @@ -349,6 +390,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rand" version = "0.8.5" @@ -376,7 +423,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -428,6 +475,19 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "rustix" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.20" @@ -492,6 +552,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" +dependencies = [ + "fastrand", + "getrandom 0.3.2", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -544,6 +617,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -694,6 +776,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] + [[package]] name = "zerocopy" version = "0.8.24" diff --git a/ourdb/Cargo.toml b/ourdb/Cargo.toml index 014e080..becf1f3 100644 --- a/ourdb/Cargo.toml +++ b/ourdb/Cargo.toml @@ -13,6 +13,7 @@ rand = "0.8.5" [dev-dependencies] criterion = "0.5.1" +tempfile = "3.8.0" [[bench]] name = "ourdb_benchmarks" diff --git a/ourdb/README.md b/ourdb/README.md index 5feb215..cda806a 100644 --- a/ourdb/README.md +++ b/ourdb/README.md @@ -26,7 +26,7 @@ use std::path::PathBuf; fn main() -> Result<(), ourdb::Error> { // Create a new database let config = OurDBConfig { - path: PathBuf::from("/path/to/db"), + path: PathBuf::from("/tmp/ourdb"), incremental_mode: true, file_size: None, // Use default (500MB) keysize: None, // Use default (4 bytes) diff --git a/ourdb/benches/ourdb_benchmarks.rs b/ourdb/benches/ourdb_benchmarks.rs new file mode 100644 index 
0000000..f1a796e --- /dev/null +++ b/ourdb/benches/ourdb_benchmarks.rs @@ -0,0 +1,277 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use ourdb::{OurDB, OurDBConfig, OurDBSetArgs}; +use std::path::PathBuf; +use tempfile::tempdir; + +fn criterion_benchmark(c: &mut Criterion) { + // Create a temporary directory for benchmarks + let temp_dir = tempdir().expect("Failed to create temp directory"); + let db_path = temp_dir.path().to_path_buf(); + + // Benchmark set operation (insertion) + c.bench_function("set", |b| { + let config = OurDBConfig { + path: db_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), // 10MB + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let test_data = vec![b'X'; 100]; // 100 bytes of data + let mut i = 0; + + b.iter(|| { + let args = OurDBSetArgs { + id: None, // Let the DB assign an ID + data: &test_data, + }; + black_box(db.set(args).unwrap()); + i += 1; + }); + }); + + // Setup database with data for other benchmarks + let setup_config = OurDBConfig { + path: db_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), // 10MB + keysize: None, + }; + + let mut setup_db = OurDB::new(setup_config).unwrap(); + let test_data = vec![b'X'; 100]; // 100 bytes of data + let mut ids = Vec::with_capacity(1000); + + // Insert 1000 records + for _ in 0..1000 { + let args = OurDBSetArgs { + id: None, + data: &test_data, + }; + let id = setup_db.set(args).unwrap(); + ids.push(id); + } + + // Benchmark get operation + c.bench_function("get", |b| { + let config = OurDBConfig { + path: db_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let mut i = 0; + + b.iter(|| { + let id = ids[i % ids.len()]; + black_box(db.get(id).unwrap()); + i += 1; + }); + }); + + // Benchmark update operation + c.bench_function("update", |b| { + let config = OurDBConfig { + path: 
db_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let updated_data = vec![b'Y'; 100]; // Different data for updates + let mut i = 0; + + b.iter(|| { + let id = ids[i % ids.len()]; + let args = OurDBSetArgs { + id: Some(id), + data: &updated_data, + }; + black_box(db.set(args).unwrap()); + i += 1; + }); + }); + + // Benchmark get_history operation + c.bench_function("get_history", |b| { + let config = OurDBConfig { + path: db_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let mut i = 0; + + b.iter(|| { + let id = ids[i % ids.len()]; + black_box(db.get_history(id, 2).unwrap()); + i += 1; + }); + }); + + // Benchmark delete operation + c.bench_function("delete", |b| { + // Create a fresh database for deletion benchmarks + let delete_dir = tempdir().expect("Failed to create temp directory"); + let delete_path = delete_dir.path().to_path_buf(); + + let config = OurDBConfig { + path: delete_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let test_data = vec![b'X'; 100]; + + // Setup keys to delete + let mut delete_ids = Vec::with_capacity(1000); + for _ in 0..1000 { + let args = OurDBSetArgs { + id: None, + data: &test_data, + }; + let id = db.set(args).unwrap(); + delete_ids.push(id); + } + + let mut i = 0; + b.iter(|| { + let id = delete_ids[i % delete_ids.len()]; + // Only try to delete if it exists (not already deleted) + if db.get(id).is_ok() { + black_box(db.delete(id).unwrap()); + } + i += 1; + }); + }); + + // Benchmark key-value mode vs incremental mode + let mut group = c.benchmark_group("mode_comparison"); + + // Benchmark set in key-value mode + group.bench_function("set_keyvalue_mode", |b| { + let kv_dir = tempdir().expect("Failed to create temp directory"); + let 
kv_path = kv_dir.path().to_path_buf(); + + let config = OurDBConfig { + path: kv_path.clone(), + incremental_mode: false, // Key-value mode + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let test_data = vec![b'X'; 100]; + let mut i = 0; + + b.iter(|| { + let id = i + 1; // Explicit ID + let args = OurDBSetArgs { + id: Some(id as u32), + data: &test_data, + }; + black_box(db.set(args).unwrap()); + i += 1; + }); + }); + + // Benchmark set in incremental mode + group.bench_function("set_incremental_mode", |b| { + let inc_dir = tempdir().expect("Failed to create temp directory"); + let inc_path = inc_dir.path().to_path_buf(); + + let config = OurDBConfig { + path: inc_path.clone(), + incremental_mode: true, // Incremental mode + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let test_data = vec![b'X'; 100]; + + b.iter(|| { + let args = OurDBSetArgs { + id: None, // Auto-generated ID + data: &test_data, + }; + black_box(db.set(args).unwrap()); + }); + }); + + group.finish(); + + // Benchmark with different record sizes + let mut size_group = c.benchmark_group("record_size"); + + for &size in &[10, 100, 1000, 10000] { + size_group.bench_function(format!("set_size_{}", size), |b| { + let size_dir = tempdir().expect("Failed to create temp directory"); + let size_path = size_dir.path().to_path_buf(); + + let config = OurDBConfig { + path: size_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let test_data = vec![b'X'; size]; + + b.iter(|| { + let args = OurDBSetArgs { + id: None, + data: &test_data, + }; + black_box(db.set(args).unwrap()); + }); + }); + + size_group.bench_function(format!("get_size_{}", size), |b| { + let size_dir = tempdir().expect("Failed to create temp directory"); + let size_path = size_dir.path().to_path_buf(); + + let config = OurDBConfig 
{ + path: size_path.clone(), + incremental_mode: true, + file_size: Some(10 * 1024 * 1024), + keysize: None, + }; + + let mut db = OurDB::new(config).unwrap(); + let test_data = vec![b'X'; size]; + + // Insert some records first + let mut size_ids = Vec::with_capacity(100); + for _ in 0..100 { + let args = OurDBSetArgs { + id: None, + data: &test_data, + }; + let id = db.set(args).unwrap(); + size_ids.push(id); + } + + let mut i = 0; + b.iter(|| { + let id = size_ids[i % size_ids.len()]; + black_box(db.get(id).unwrap()); + i += 1; + }); + }); + } + + size_group.finish(); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/radixtree/README.md b/radixtree/README.md index 14b3ed3..fa87ede 100644 --- a/radixtree/README.md +++ b/radixtree/README.md @@ -38,7 +38,7 @@ use radixtree::RadixTree; fn main() -> Result<(), radixtree::Error> { // Create a new radix tree - let mut tree = RadixTree::new("/path/to/storage", false)?; + let mut tree = RadixTree::new("/tmp/radix", false)?; // Set key-value pairs tree.set("hello", b"world".to_vec())?; @@ -68,10 +68,10 @@ fn main() -> Result<(), radixtree::Error> { ```rust // Create a new radix tree -let mut tree = RadixTree::new("/path/to/storage", false)?; +let mut tree = RadixTree::new("/tmp/radix", false)?; // Create a new radix tree and reset if it exists -let mut tree = RadixTree::new("/path/to/storage", true)?; +let mut tree = RadixTree::new("/tmp/radix", true)?; ``` ### Setting Values diff --git a/radixtree/examples/large_scale_test.rs b/radixtree/examples/large_scale_test.rs new file mode 100644 index 0000000..4eed308 --- /dev/null +++ b/radixtree/examples/large_scale_test.rs @@ -0,0 +1,121 @@ +use radixtree::RadixTree; +use std::time::{Duration, Instant}; +use std::io::{self, Write}; + +// Use much smaller batches to avoid hitting OurDB's size limit +const BATCH_SIZE: usize = 1_000; +const NUM_BATCHES: usize = 1_000; // Total records: 1,000,000 +const PROGRESS_INTERVAL: usize = 100; + 
+fn main() -> Result<(), radixtree::Error> { + // Overall metrics + let total_start_time = Instant::now(); + let mut total_records_inserted = 0; + let mut batch_times = Vec::with_capacity(NUM_BATCHES); + + println!("Will insert up to {} records in batches of {}", + BATCH_SIZE * NUM_BATCHES, BATCH_SIZE); + + // Process in batches to avoid OurDB size limits + for batch in 0..NUM_BATCHES { + // Create a new database for each batch + let batch_path = std::env::temp_dir().join(format!("radixtree_batch_{}", batch)); + + // Clean up any existing database + if batch_path.exists() { + std::fs::remove_dir_all(&batch_path)?; + } + std::fs::create_dir_all(&batch_path)?; + + println!("\nBatch {}/{}: Creating new radix tree...", batch + 1, NUM_BATCHES); + let mut tree = RadixTree::new(batch_path.to_str().unwrap(), true)?; + + let batch_start_time = Instant::now(); + let mut last_progress_time = Instant::now(); + let mut last_progress_count = 0; + + // Insert records for this batch + for i in 0..BATCH_SIZE { + let global_index = batch * BATCH_SIZE + i; + let key = format!("key:{:08}", global_index); + let value = format!("val{}", global_index).into_bytes(); + + tree.set(&key, value)?; + + // Show progress at intervals + if (i + 1) % PROGRESS_INTERVAL == 0 || i == BATCH_SIZE - 1 { + let records_since_last = i + 1 - last_progress_count; + let time_since_last = last_progress_time.elapsed(); + let records_per_second = records_since_last as f64 / time_since_last.as_secs_f64(); + + print!("\rProgress: {}/{} records ({:.2}%) - {:.2} records/sec", + i + 1, BATCH_SIZE, + (i + 1) as f64 / BATCH_SIZE as f64 * 100.0, + records_per_second); + io::stdout().flush().unwrap(); + + last_progress_time = Instant::now(); + last_progress_count = i + 1; + } + } + + let batch_duration = batch_start_time.elapsed(); + batch_times.push(batch_duration); + total_records_inserted += BATCH_SIZE; + + println!("\nBatch {}/{} completed in {:?} ({:.2} records/sec)", + batch + 1, NUM_BATCHES, + batch_duration, + 
BATCH_SIZE as f64 / batch_duration.as_secs_f64()); + + // Test random access performance for this batch + println!("Testing access performance for batch {}...", batch + 1); + let mut total_get_time = Duration::new(0, 0); + let num_samples = 100; + + // Use a simple distribution pattern + for i in 0..num_samples { + // Distribute samples across the batch + let sample_id = batch * BATCH_SIZE + (i * (BATCH_SIZE / num_samples)); + let key = format!("key:{:08}", sample_id); + + let get_start = Instant::now(); + let _ = tree.get(&key)?; + total_get_time += get_start.elapsed(); + } + + println!("Average time to retrieve a record: {:?}", + total_get_time / num_samples as u32); + + // Test prefix search performance + println!("Testing prefix search performance..."); + let prefix = format!("key:{:02}", batch % 100); + + let list_start = Instant::now(); + let keys = tree.list(&prefix)?; + let list_duration = list_start.elapsed(); + + println!("Found {} keys with prefix '{}' in {:?}", + keys.len(), prefix, list_duration); + } + + // Overall performance summary + let total_duration = total_start_time.elapsed(); + println!("\n\nPerformance Summary:"); + println!("Total time to insert {} records: {:?}", total_records_inserted, total_duration); + println!("Average insertion rate: {:.2} records/second", + total_records_inserted as f64 / total_duration.as_secs_f64()); + + // Show performance trend + println!("\nPerformance Trend (batch number vs. 
time):"); + for (i, duration) in batch_times.iter().enumerate() { + if i % 10 == 0 || i == batch_times.len() - 1 { // Only show every 10th point + println!(" Batch {}: {:?} ({:.2} records/sec)", + i + 1, + duration, + BATCH_SIZE as f64 / duration.as_secs_f64()); + } + } + + Ok(()) +} \ No newline at end of file diff --git a/radixtree/examples/performance_test.rs b/radixtree/examples/performance_test.rs new file mode 100644 index 0000000..9b844ca --- /dev/null +++ b/radixtree/examples/performance_test.rs @@ -0,0 +1,134 @@ +use radixtree::RadixTree; +use std::time::{Duration, Instant}; +use std::io::{self, Write}; + +// Number of records to insert +const TOTAL_RECORDS: usize = 1_000_000; +// How often to report progress (every X records) +const PROGRESS_INTERVAL: usize = 10_000; +// How many records to use for performance sampling +const PERFORMANCE_SAMPLE_SIZE: usize = 1000; + +fn main() -> Result<(), radixtree::Error> { + // Create a temporary directory for the database + let db_path = std::env::temp_dir().join("radixtree_performance_test"); + + // Completely remove and recreate the directory to ensure a clean start + if db_path.exists() { + std::fs::remove_dir_all(&db_path)?; + } + std::fs::create_dir_all(&db_path)?; + + println!("Creating radix tree at: {}", db_path.display()); + println!("Will insert {} records and show progress...", TOTAL_RECORDS); + + // Create a new radix tree + let mut tree = RadixTree::new(db_path.to_str().unwrap(), true)?; + + // Track overall time + let start_time = Instant::now(); + + // Track performance metrics + let mut insertion_times = Vec::with_capacity(TOTAL_RECORDS / PROGRESS_INTERVAL); + let mut last_batch_time = Instant::now(); + let mut last_batch_records = 0; + + // Insert records and track progress + for i in 0..TOTAL_RECORDS { + let key = format!("key:{:08}", i); + // Use smaller values to avoid exceeding OurDB's size limit + let value = format!("val{}", i).into_bytes(); + + // Time the insertion of every Nth record for 
performance sampling + if i % PERFORMANCE_SAMPLE_SIZE == 0 { + let insert_start = Instant::now(); + tree.set(&key, value)?; + let insert_duration = insert_start.elapsed(); + + // Only print detailed timing for specific samples to avoid flooding output + if i % (PERFORMANCE_SAMPLE_SIZE * 10) == 0 { + println!("Record {}: Insertion took {:?}", i, insert_duration); + } + } else { + tree.set(&key, value)?; + } + + // Show progress at intervals + if (i + 1) % PROGRESS_INTERVAL == 0 || i == TOTAL_RECORDS - 1 { + let records_in_batch = i + 1 - last_batch_records; + let batch_duration = last_batch_time.elapsed(); + let records_per_second = records_in_batch as f64 / batch_duration.as_secs_f64(); + + insertion_times.push((i + 1, batch_duration)); + + print!("\rProgress: {}/{} records ({:.2}%) - {:.2} records/sec", + i + 1, TOTAL_RECORDS, + (i + 1) as f64 / TOTAL_RECORDS as f64 * 100.0, + records_per_second); + io::stdout().flush().unwrap(); + + last_batch_time = Instant::now(); + last_batch_records = i + 1; + } + } + + let total_duration = start_time.elapsed(); + println!("\n\nPerformance Summary:"); + println!("Total time to insert {} records: {:?}", TOTAL_RECORDS, total_duration); + println!("Average insertion rate: {:.2} records/second", + TOTAL_RECORDS as f64 / total_duration.as_secs_f64()); + + // Show performance trend + println!("\nPerformance Trend (records inserted vs. 
time per batch):"); + for (i, (record_count, duration)) in insertion_times.iter().enumerate() { + if i % 10 == 0 || i == insertion_times.len() - 1 { // Only show every 10th point to avoid too much output + println!(" After {} records: {:?} for {} records ({:.2} records/sec)", + record_count, + duration, + PROGRESS_INTERVAL, + PROGRESS_INTERVAL as f64 / duration.as_secs_f64()); + } + } + + // Test access performance with distributed samples + println!("\nTesting access performance with distributed samples..."); + let mut total_get_time = Duration::new(0, 0); + let num_samples = 1000; + + // Use a simple distribution pattern instead of random + for i in 0..num_samples { + // Distribute samples across the entire range + let sample_id = (i * (TOTAL_RECORDS / num_samples)) % TOTAL_RECORDS; + let key = format!("key:{:08}", sample_id); + + let get_start = Instant::now(); + let _ = tree.get(&key)?; + total_get_time += get_start.elapsed(); + } + + println!("Average time to retrieve a record: {:?}", + total_get_time / num_samples as u32); + + // Test prefix search performance + println!("\nTesting prefix search performance..."); + let prefixes = ["key:0", "key:1", "key:5", "key:9"]; + + for prefix in &prefixes { + let list_start = Instant::now(); + let keys = tree.list(prefix)?; + let list_duration = list_start.elapsed(); + + println!("Found {} keys with prefix '{}' in {:?}", + keys.len(), prefix, list_duration); + } + + // Clean up (optional) + if std::env::var("KEEP_DB").is_err() { + std::fs::remove_dir_all(&db_path)?; + println!("\nCleaned up database directory"); + } else { + println!("\nDatabase kept at: {}", db_path.display()); + } + + Ok(()) +} \ No newline at end of file diff --git a/radixtree/src/operations.rs b/radixtree/src/operations.rs index f4c017a..676464b 100644 --- a/radixtree/src/operations.rs +++ b/radixtree/src/operations.rs @@ -13,8 +13,8 @@ pub fn new_radix_tree(path: &str, reset: bool) -> Result { let config = OurDBConfig { path: PathBuf::from(path), 
incremental_mode: true, - file_size: Some(1024 * 1024), // 1MB file size - keysize: Some(4), // Default key size + file_size: Some(1024 * 1024 * 10), // 10MB file size for better performance with large datasets + keysize: Some(6), // Use keysize=6 to support multiple files (file_nr + position) }; let mut db = OurDB::new(config)?; diff --git a/tst/Cargo.lock b/tst/Cargo.lock new file mode 100644 index 0000000..cce2cdf --- /dev/null +++ b/tst/Cargo.lock @@ -0,0 +1,180 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "ourdb" +version = "0.1.0" +dependencies = [ + "crc32fast", + "log", + "rand", + "thiserror", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + 
"zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "syn" +version = "2.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", 
+] + +[[package]] +name = "tst" +version = "0.1.0" +dependencies = [ + "log", + "ourdb", + "thiserror", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/tst/Cargo.toml b/tst/Cargo.toml new file mode 100644 index 0000000..5e1c6bf --- /dev/null +++ b/tst/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "tst" +version = "0.1.0" +edition = "2021" +description = "A persistent ternary search tree implementation using OurDB for storage" +authors = ["OurWorld Team"] + +[dependencies] +ourdb = { path = "../ourdb" } +thiserror = "1.0.40" +log = "0.4.17" + +[dev-dependencies] +# criterion = "0.5.1" + +# Uncomment when benchmarks are implemented +# [[bench]] +# name = "tst_benchmarks" +# harness = false + +[[example]] +name = "basic_usage" +path = "examples/basic_usage.rs" + +[[example]] +name = "prefix_ops" +path = "examples/prefix_ops.rs" + +[[example]] +name = "performance" +path = "examples/performance.rs" \ No newline at end of file diff --git a/tst/README.md b/tst/README.md new file mode 100644 index 0000000..dd1a84d --- /dev/null +++ b/tst/README.md @@ -0,0 +1,183 @@ +# Ternary Search Tree (TST) 
+ +A persistent ternary search tree implementation in Rust using OurDB for storage. + +## Overview + +TST is a space-optimized tree data structure that enables efficient string key operations with persistent storage. This implementation provides a persistent ternary search tree that can be used for efficient string key operations, such as auto-complete, routing tables, and more. + +A ternary search tree is a type of trie where each node has three children: left, middle, and right. Unlike a radix tree which compresses common prefixes, a TST stores one character per node and uses a binary search tree-like structure for efficient traversal. + +Key characteristics: +- Each node stores a single character +- Nodes have three children: left (for characters < current), middle (for next character in key), and right (for characters > current) +- Leaf nodes contain the actual values +- Balanced structure for consistent performance across operations + +## Features + +- Efficient string key operations +- Persistent storage using OurDB backend +- Balanced tree structure for consistent performance +- Support for binary values +- Thread-safe operations through OurDB + +## Usage + +Add the dependency to your `Cargo.toml`: + +```toml +[dependencies] +tst = { path = "../tst" } +``` + +### Basic Example + +```rust +use tst::TST; + +fn main() -> Result<(), tst::Error> { + // Create a new ternary search tree + let mut tree = TST::new("/tmp/tst", false)?; + + // Set key-value pairs + tree.set("hello", b"world".to_vec())?; + tree.set("help", b"me".to_vec())?; + + // Get values by key + let value = tree.get("hello")?; + println!("hello: {}", String::from_utf8_lossy(&value)); // Prints: world + + // List keys by prefix + let keys = tree.list("hel")?; // Returns ["hello", "help"] + println!("Keys with prefix 'hel': {:?}", keys); + + // Get all values by prefix + let values = tree.getall("hel")?; // Returns [b"world", b"me"] + + // Delete keys + tree.delete("help")?; + + Ok(()) +} +``` + +## 
API + +### Creating a TST + +```rust +// Create a new ternary search tree +let mut tree = TST::new("/tmp/tst", false)?; + +// Create a new ternary search tree and reset if it exists +let mut tree = TST::new("/tmp/tst", true)?; +``` + +### Setting Values + +```rust +// Set a key-value pair +tree.set("key", b"value".to_vec())?; +``` + +### Getting Values + +```rust +// Get a value by key +let value = tree.get("key")?; +``` + +### Deleting Keys + +```rust +// Delete a key +tree.delete("key")?; +``` + +### Listing Keys by Prefix + +```rust +// List all keys with a given prefix +let keys = tree.list("prefix")?; +``` + +### Getting All Values by Prefix + +```rust +// Get all values for keys with a given prefix +let values = tree.getall("prefix")?; +``` + +## Performance Characteristics + +- Search: O(k) where k is the key length +- Insert: O(k) for new keys +- Delete: O(k) plus potential node cleanup +- Space: O(n) where n is the total number of nodes + +## Use Cases + +TST is particularly useful for: +- Prefix-based searching +- Auto-complete systems +- Dictionary implementations +- Spell checking +- Any application requiring efficient string key operations with persistence + +## Implementation Details + +The TST implementation uses OurDB for persistent storage: +- Each node is serialized and stored as a record in OurDB +- Node references use OurDB record IDs +- The tree maintains a root node ID for traversal +- Node serialization includes version tracking for format evolution + +## Running Tests + +The project includes a comprehensive test suite that verifies all functionality: + +```bash +# Run all tests +cargo test + +# Run specific test file +cargo test --test basic_test +cargo test --test prefix_test +``` + +## Running Examples + +The project includes example applications that demonstrate how to use the TST: + +```bash +# Run the basic usage example +cargo run --example basic_usage + +# Run the prefix operations example +cargo run --example prefix_ops + +# Run the 
performance test +cargo run --example performance +``` + +## Comparison with RadixTree + +While both TST and RadixTree provide efficient string key operations, they have different characteristics: + +- **TST**: Stores one character per node, with a balanced structure for consistent performance across operations. +- **RadixTree**: Compresses common prefixes, which can be more space-efficient for keys with long common prefixes. + +Choose TST when: +- You need balanced performance across all operations +- Your keys don't share long common prefixes +- You want a simpler implementation with predictable performance + +Choose RadixTree when: +- Space efficiency is a priority +- Your keys share long common prefixes +- You prioritize lookup performance over balanced performance + +## License + +This project is licensed under the same license as the HeroCode project. \ No newline at end of file diff --git a/tst/examples/basic_usage.rs b/tst/examples/basic_usage.rs new file mode 100644 index 0000000..cea748d --- /dev/null +++ b/tst/examples/basic_usage.rs @@ -0,0 +1,76 @@ +use tst::TST; +use std::time::Instant; +use std::io::{self, Write}; + +fn main() -> Result<(), tst::Error> { + // Create a temporary directory for the database + let db_path = std::env::temp_dir().join("tst_example"); + std::fs::create_dir_all(&db_path)?; + + println!("Creating ternary search tree at: {}", db_path.display()); + + // Create a new TST + let mut tree = TST::new(db_path.to_str().unwrap(), true)?; + + // Store some data + println!("Inserting data..."); + tree.set("hello", b"world".to_vec())?; + tree.set("help", b"me".to_vec())?; + tree.set("helicopter", b"flying".to_vec())?; + tree.set("apple", b"fruit".to_vec())?; + tree.set("application", b"software".to_vec())?; + tree.set("banana", b"yellow".to_vec())?; + + // Retrieve and print the data + let value = tree.get("hello")?; + println!("hello: {}", String::from_utf8_lossy(&value)); + + // List keys with prefix + println!("\nListing keys with 
prefix 'hel':"); + let start = Instant::now(); + let keys = tree.list("hel")?; + let duration = start.elapsed(); + + for key in &keys { + println!(" {}", key); + } + println!("Found {} keys in {:?}", keys.len(), duration); + + // Get all values with prefix + println!("\nGetting all values with prefix 'app':"); + let start = Instant::now(); + let values = tree.getall("app")?; + let duration = start.elapsed(); + + for (i, value) in values.iter().enumerate() { + println!(" Value {}: {}", i + 1, String::from_utf8_lossy(value)); + } + println!("Found {} values in {:?}", values.len(), duration); + + // Delete a key + println!("\nDeleting 'help'..."); + tree.delete("help")?; + + // Verify deletion + println!("Listing keys with prefix 'hel' after deletion:"); + let keys_after = tree.list("hel")?; + for key in &keys_after { + println!(" {}", key); + } + + // Try to get a deleted key + match tree.get("help") { + Ok(_) => println!("Unexpectedly found 'help' after deletion!"), + Err(e) => println!("As expected, 'help' was not found: {}", e), + } + + // Clean up (optional) + if std::env::var("KEEP_DB").is_err() { + std::fs::remove_dir_all(&db_path)?; + println!("\nCleaned up database directory"); + } else { + println!("\nDatabase kept at: {}", db_path.display()); + } + + Ok(()) +} \ No newline at end of file diff --git a/tst/examples/performance.rs b/tst/examples/performance.rs new file mode 100644 index 0000000..ca34f1c --- /dev/null +++ b/tst/examples/performance.rs @@ -0,0 +1,134 @@ +use tst::TST; +use std::time::{Duration, Instant}; +use std::io::{self, Write}; + +// Number of records to insert +const TOTAL_RECORDS: usize = 100_000; +// How often to report progress (every X records) +const PROGRESS_INTERVAL: usize = 1_000; +// How many records to use for performance sampling +const PERFORMANCE_SAMPLE_SIZE: usize = 100; + +fn main() -> Result<(), tst::Error> { + // Create a temporary directory for the database + let db_path = 
std::env::temp_dir().join("tst_performance_test"); + + // Completely remove and recreate the directory to ensure a clean start + if db_path.exists() { + std::fs::remove_dir_all(&db_path)?; + } + std::fs::create_dir_all(&db_path)?; + + println!("Creating ternary search tree at: {}", db_path.display()); + println!("Will insert {} records and show progress...", TOTAL_RECORDS); + + // Create a new TST + let mut tree = TST::new(db_path.to_str().unwrap(), true)?; + + // Track overall time + let start_time = Instant::now(); + + // Track performance metrics + let mut insertion_times = Vec::with_capacity(TOTAL_RECORDS / PROGRESS_INTERVAL); + let mut last_batch_time = Instant::now(); + let mut last_batch_records = 0; + + // Insert records and track progress + for i in 0..TOTAL_RECORDS { + let key = format!("key:{:08}", i); + // Use smaller values to avoid exceeding OurDB's size limit + let value = format!("val{}", i).into_bytes(); + + // Time the insertion of every Nth record for performance sampling + if i % PERFORMANCE_SAMPLE_SIZE == 0 { + let insert_start = Instant::now(); + tree.set(&key, value)?; + let insert_duration = insert_start.elapsed(); + + // Only print detailed timing for specific samples to avoid flooding output + if i % (PERFORMANCE_SAMPLE_SIZE * 10) == 0 { + println!("Record {}: Insertion took {:?}", i, insert_duration); + } + } else { + tree.set(&key, value)?; + } + + // Show progress at intervals + if (i + 1) % PROGRESS_INTERVAL == 0 || i == TOTAL_RECORDS - 1 { + let records_in_batch = i + 1 - last_batch_records; + let batch_duration = last_batch_time.elapsed(); + let records_per_second = records_in_batch as f64 / batch_duration.as_secs_f64(); + + insertion_times.push((i + 1, batch_duration)); + + print!("\rProgress: {}/{} records ({:.2}%) - {:.2} records/sec", + i + 1, TOTAL_RECORDS, + (i + 1) as f64 / TOTAL_RECORDS as f64 * 100.0, + records_per_second); + io::stdout().flush().unwrap(); + + last_batch_time = Instant::now(); + last_batch_records = i + 1; + 
} + } + + let total_duration = start_time.elapsed(); + println!("\n\nPerformance Summary:"); + println!("Total time to insert {} records: {:?}", TOTAL_RECORDS, total_duration); + println!("Average insertion rate: {:.2} records/second", + TOTAL_RECORDS as f64 / total_duration.as_secs_f64()); + + // Show performance trend + println!("\nPerformance Trend (records inserted vs. time per batch):"); + for (i, (record_count, duration)) in insertion_times.iter().enumerate() { + if i % 10 == 0 || i == insertion_times.len() - 1 { // Only show every 10th point to avoid too much output + println!(" After {} records: {:?} for {} records ({:.2} records/sec)", + record_count, + duration, + PROGRESS_INTERVAL, + PROGRESS_INTERVAL as f64 / duration.as_secs_f64()); + } + } + + // Test access performance with distributed samples + println!("\nTesting access performance with distributed samples..."); + let mut total_get_time = Duration::new(0, 0); + let num_samples = 1000; + + // Use a simple distribution pattern instead of random + for i in 0..num_samples { + // Distribute samples across the entire range + let sample_id = (i * (TOTAL_RECORDS / num_samples)) % TOTAL_RECORDS; + let key = format!("key:{:08}", sample_id); + + let get_start = Instant::now(); + let _ = tree.get(&key)?; + total_get_time += get_start.elapsed(); + } + + println!("Average time to retrieve a record: {:?}", + total_get_time / num_samples as u32); + + // Test prefix search performance + println!("\nTesting prefix search performance..."); + let prefixes = ["key:0", "key:1", "key:5", "key:9"]; + + for prefix in &prefixes { + let list_start = Instant::now(); + let keys = tree.list(prefix)?; + let list_duration = list_start.elapsed(); + + println!("Found {} keys with prefix '{}' in {:?}", + keys.len(), prefix, list_duration); + } + + // Clean up (optional) + if std::env::var("KEEP_DB").is_err() { + std::fs::remove_dir_all(&db_path)?; + println!("\nCleaned up database directory"); + } else { + println!("\nDatabase kept 
at: {}", db_path.display()); + } + + Ok(()) +} \ No newline at end of file diff --git a/tst/examples/prefix_ops.rs b/tst/examples/prefix_ops.rs new file mode 100644 index 0000000..cf1a07b --- /dev/null +++ b/tst/examples/prefix_ops.rs @@ -0,0 +1,123 @@ +use tst::TST; +use std::time::Instant; +use std::io::{self, Write}; + +fn main() -> Result<(), tst::Error> { + // Create a temporary directory for the database + let db_path = std::env::temp_dir().join("tst_prefix_example"); + std::fs::create_dir_all(&db_path)?; + + println!("Creating ternary search tree at: {}", db_path.display()); + + // Create a new TST + let mut tree = TST::new(db_path.to_str().unwrap(), true)?; + + // Insert a variety of keys with different prefixes + println!("Inserting data with various prefixes..."); + + // Names + let names = [ + "Alice", "Alexander", "Amanda", "Andrew", "Amy", + "Bob", "Barbara", "Benjamin", "Brenda", "Brian", + "Charlie", "Catherine", "Christopher", "Cynthia", "Carl", + "David", "Diana", "Daniel", "Deborah", "Donald", + "Edward", "Elizabeth", "Eric", "Emily", "Ethan" + ]; + + for (i, name) in names.iter().enumerate() { + let value = format!("person-{}", i).into_bytes(); + tree.set(name, value)?; + } + + // Cities + let cities = [ + "New York", "Los Angeles", "Chicago", "Houston", "Phoenix", + "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose", + "Austin", "Jacksonville", "Fort Worth", "Columbus", "San Francisco", + "Charlotte", "Indianapolis", "Seattle", "Denver", "Washington" + ]; + + for (i, city) in cities.iter().enumerate() { + let value = format!("city-{}", i).into_bytes(); + tree.set(city, value)?; + } + + // Countries + let countries = [ + "United States", "Canada", "Mexico", "Brazil", "Argentina", + "United Kingdom", "France", "Germany", "Italy", "Spain", + "China", "Japan", "India", "Australia", "Russia" + ]; + + for (i, country) in countries.iter().enumerate() { + let value = format!("country-{}", i).into_bytes(); + tree.set(country, value)?; + } + 
+ println!("Total items inserted: {}", names.len() + cities.len() + countries.len()); + + // Test prefix operations + test_prefix(&mut tree, "A")?; + test_prefix(&mut tree, "B")?; + test_prefix(&mut tree, "C")?; + test_prefix(&mut tree, "San")?; + test_prefix(&mut tree, "United")?; + + // Test non-existent prefix + test_prefix(&mut tree, "Z")?; + + // Test empty prefix (should return all keys) + println!("\nTesting empty prefix (should return all keys):"); + let start = Instant::now(); + let all_keys = tree.list("")?; + let duration = start.elapsed(); + + println!("Found {} keys with empty prefix in {:?}", all_keys.len(), duration); + println!("First 5 keys (alphabetically):"); + for key in all_keys.iter().take(5) { + println!(" {}", key); + } + + // Clean up (optional) + if std::env::var("KEEP_DB").is_err() { + std::fs::remove_dir_all(&db_path)?; + println!("\nCleaned up database directory"); + } else { + println!("\nDatabase kept at: {}", db_path.display()); + } + + Ok(()) +} + +fn test_prefix(tree: &mut TST, prefix: &str) -> Result<(), tst::Error> { + println!("\nTesting prefix '{}':", prefix); + + // Test list operation + let start = Instant::now(); + let keys = tree.list(prefix)?; + let list_duration = start.elapsed(); + + println!("Found {} keys with prefix '{}' in {:?}", keys.len(), prefix, list_duration); + + if !keys.is_empty() { + println!("Keys:"); + for key in &keys { + println!(" {}", key); + } + + // Test getall operation + let start = Instant::now(); + let values = tree.getall(prefix)?; + let getall_duration = start.elapsed(); + + println!("Retrieved {} values in {:?}", values.len(), getall_duration); + println!("First value: {}", + if !values.is_empty() { + String::from_utf8_lossy(&values[0]) + } else { + "None".into() + }); + } + + Ok(()) +} \ No newline at end of file diff --git a/tst/src/error.rs b/tst/src/error.rs new file mode 100644 index 0000000..cbb6e0f --- /dev/null +++ b/tst/src/error.rs @@ -0,0 +1,36 @@ +//! 
Error types for the TST module. + +use thiserror::Error; +use std::io; + +/// Error type for TST operations. +#[derive(Debug, Error)] +pub enum Error { + /// Error from OurDB operations. + #[error("OurDB error: {0}")] + OurDB(#[from] ourdb::Error), + + /// Error when a key is not found. + #[error("Key not found: {0}")] + KeyNotFound(String), + + /// Error when a prefix is not found. + #[error("Prefix not found: {0}")] + PrefixNotFound(String), + + /// Error during serialization. + #[error("Serialization error: {0}")] + Serialization(String), + + /// Error during deserialization. + #[error("Deserialization error: {0}")] + Deserialization(String), + + /// Error for invalid operations. + #[error("Invalid operation: {0}")] + InvalidOperation(String), + + /// IO error. + #[error("IO error: {0}")] + IO(#[from] io::Error), +} \ No newline at end of file diff --git a/tst/src/lib.rs b/tst/src/lib.rs new file mode 100644 index 0000000..10a4e57 --- /dev/null +++ b/tst/src/lib.rs @@ -0,0 +1,122 @@ +//! TST is a space-optimized tree data structure that enables efficient string key operations +//! with persistent storage using OurDB as a backend. +//! +//! This implementation provides a persistent ternary search tree that can be used for efficient +//! string key operations, such as auto-complete, routing tables, and more. + +mod error; +mod node; +mod operations; +mod serialize; + +pub use error::Error; +pub use node::TSTNode; + +use ourdb::OurDB; + +/// TST represents a ternary search tree data structure with persistent storage. +pub struct TST { + /// Database for persistent storage + db: OurDB, + + /// Database ID of the root node + root_id: Option, +} + +impl TST { + /// Creates a new TST with the specified database path. 
+ /// + /// # Arguments + /// + /// * `path` - The path to the database directory + /// * `reset` - Whether to reset the database if it exists + /// + /// # Returns + /// + /// A new `TST` instance + /// + /// # Errors + /// + /// Returns an error if the database cannot be created or opened + pub fn new(path: &str, reset: bool) -> Result { + operations::new_tst(path, reset) + } + + /// Sets a key-value pair in the tree. + /// + /// # Arguments + /// + /// * `key` - The key to set + /// * `value` - The value to set + /// + /// # Errors + /// + /// Returns an error if the operation fails + pub fn set(&mut self, key: &str, value: Vec) -> Result<(), Error> { + operations::set(self, key, value) + } + + /// Gets a value by key from the tree. + /// + /// # Arguments + /// + /// * `key` - The key to get + /// + /// # Returns + /// + /// The value associated with the key + /// + /// # Errors + /// + /// Returns an error if the key is not found or the operation fails + pub fn get(&mut self, key: &str) -> Result, Error> { + operations::get(self, key) + } + + /// Deletes a key from the tree. + /// + /// # Arguments + /// + /// * `key` - The key to delete + /// + /// # Errors + /// + /// Returns an error if the key is not found or the operation fails + pub fn delete(&mut self, key: &str) -> Result<(), Error> { + operations::delete(self, key) + } + + /// Lists all keys with a given prefix. + /// + /// # Arguments + /// + /// * `prefix` - The prefix to search for + /// + /// # Returns + /// + /// A list of keys that start with the given prefix + /// + /// # Errors + /// + /// Returns an error if the operation fails + pub fn list(&mut self, prefix: &str) -> Result, Error> { + operations::list(self, prefix) + } + + /// Gets all values for keys with a given prefix. 
+ /// + /// # Arguments + /// + /// * `prefix` - The prefix to search for + /// + /// # Returns + /// + /// A list of values for keys that start with the given prefix + /// + /// # Errors + /// + /// Returns an error if the operation fails + pub fn getall(&mut self, prefix: &str) -> Result>, Error> { + operations::getall(self, prefix) + } +} \ No newline at end of file diff --git a/tst/src/node.rs b/tst/src/node.rs new file mode 100644 index 0000000..badb512 --- /dev/null +++ b/tst/src/node.rs @@ -0,0 +1,49 @@ +//! Node types for the TST module. + +/// Represents a node in the ternary search tree. +#[derive(Debug, Clone, PartialEq)] +pub struct TSTNode { + /// The character stored at this node. + pub character: char, + + /// Value stored at this node (empty if not end of key). + pub value: Vec, + + /// Whether this node represents the end of a key. + pub is_end_of_key: bool, + + /// Reference to the left child node (for characters < current character). + pub left_id: Option, + + /// Reference to the middle child node (for next character in key). + pub middle_id: Option, + + /// Reference to the right child node (for characters > current character). + pub right_id: Option, +} + +impl TSTNode { + /// Creates a new node. + pub fn new(character: char, value: Vec, is_end_of_key: bool) -> Self { + Self { + character, + value, + is_end_of_key, + left_id: None, + middle_id: None, + right_id: None, + } + } + + /// Creates a new root node. + pub fn new_root() -> Self { + Self { + character: '\0', // Use null character for root + value: Vec::new(), + is_end_of_key: false, + left_id: None, + middle_id: None, + right_id: None, + } + } +} \ No newline at end of file diff --git a/tst/src/operations.rs b/tst/src/operations.rs new file mode 100644 index 0000000..2214b9b --- /dev/null +++ b/tst/src/operations.rs @@ -0,0 +1,418 @@ +//! Implementation of TST operations. 
+ +use crate::error::Error; +use crate::node::TSTNode; +use crate::TST; +use ourdb::{OurDB, OurDBConfig, OurDBSetArgs}; +use std::path::PathBuf; + +/// Creates a new TST with the specified database path. +pub fn new_tst(path: &str, reset: bool) -> Result { + // If the path exists and reset is true, remove it first + let path_buf = PathBuf::from(path); + if path_buf.exists() && reset { + std::fs::remove_dir_all(&path_buf)?; + } + + // Create the directory if it doesn't exist + std::fs::create_dir_all(&path_buf)?; + + let config = OurDBConfig { + path: path_buf, + incremental_mode: true, + file_size: Some(1024 * 1024), // 10MB file size for better performance with large datasets + keysize: Some(4), // Use keysize=4 (default) + }; + + let mut db = OurDB::new(config)?; + + let root_id = if db.get_next_id()? == 1 || reset { + // Create a new root node + let root = TSTNode::new_root(); + let root_id = db.set(OurDBSetArgs { + id: None, + data: &root.serialize(), + })?; + + Some(root_id) + } else { + // Use existing root node + Some(1) // Root node always has ID 1 + }; + + Ok(TST { + db, + root_id, + }) +} + +/// Sets a key-value pair in the tree. +pub fn set(tree: &mut TST, key: &str, value: Vec) -> Result<(), Error> { + if key.is_empty() { + return Err(Error::InvalidOperation("Empty key not allowed".to_string())); + } + + let root_id = match tree.root_id { + Some(id) => id, + None => return Err(Error::InvalidOperation("Tree not initialized".to_string())), + }; + + let chars: Vec = key.chars().collect(); + set_recursive(tree, root_id, &chars, 0, value)?; + + Ok(()) +} + +/// Recursive helper function for setting a key-value pair. 
+fn set_recursive(tree: &mut TST, node_id: u32, chars: &[char], pos: usize, value: Vec) -> Result { + let mut node = tree.get_node(node_id)?; + + if pos >= chars.len() { + // We've reached the end of the key + node.is_end_of_key = true; + node.value = value; + return tree.save_node(Some(node_id), &node); + } + + let current_char = chars[pos]; + + if node.character == '\0' { + // Root node or empty node, set the character + node.character = current_char; + let node_id = tree.save_node(Some(node_id), &node)?; + + // Continue with the next character + if pos + 1 < chars.len() { + let new_node = TSTNode::new(chars[pos + 1], Vec::new(), false); + let new_id = tree.save_node(None, &new_node)?; + + let mut updated_node = tree.get_node(node_id)?; + updated_node.middle_id = Some(new_id); + tree.save_node(Some(node_id), &updated_node)?; + + return set_recursive(tree, new_id, chars, pos + 1, value); + } else { + // This is the last character + let mut updated_node = tree.get_node(node_id)?; + updated_node.is_end_of_key = true; + updated_node.value = value; + return tree.save_node(Some(node_id), &updated_node); + } + } + + if current_char < node.character { + // Go left + if let Some(left_id) = node.left_id { + return set_recursive(tree, left_id, chars, pos, value); + } else { + // Create new left node + let new_node = TSTNode::new(current_char, Vec::new(), false); + let new_id = tree.save_node(None, &new_node)?; + + // Update current node + node.left_id = Some(new_id); + tree.save_node(Some(node_id), &node)?; + + return set_recursive(tree, new_id, chars, pos, value); + } + } else if current_char > node.character { + // Go right + if let Some(right_id) = node.right_id { + return set_recursive(tree, right_id, chars, pos, value); + } else { + // Create new right node + let new_node = TSTNode::new(current_char, Vec::new(), false); + let new_id = tree.save_node(None, &new_node)?; + + // Update current node + node.right_id = Some(new_id); + tree.save_node(Some(node_id), &node)?; + 
+ return set_recursive(tree, new_id, chars, pos, value); + } + } else { + // Character matches, go middle (next character) + if pos + 1 >= chars.len() { + // This is the last character + node.is_end_of_key = true; + node.value = value; + return tree.save_node(Some(node_id), &node); + } + + if let Some(middle_id) = node.middle_id { + return set_recursive(tree, middle_id, chars, pos + 1, value); + } else { + // Create new middle node + let new_node = TSTNode::new(chars[pos + 1], Vec::new(), false); + let new_id = tree.save_node(None, &new_node)?; + + // Update current node + node.middle_id = Some(new_id); + tree.save_node(Some(node_id), &node)?; + + return set_recursive(tree, new_id, chars, pos + 1, value); + } + } +} + +/// Gets a value by key from the tree. +pub fn get(tree: &mut TST, key: &str) -> Result, Error> { + if key.is_empty() { + return Err(Error::InvalidOperation("Empty key not allowed".to_string())); + } + + let root_id = match tree.root_id { + Some(id) => id, + None => return Err(Error::InvalidOperation("Tree not initialized".to_string())), + }; + + let chars: Vec = key.chars().collect(); + let node_id = find_node(tree, root_id, &chars, 0)?; + + let node = tree.get_node(node_id)?; + if node.is_end_of_key { + Ok(node.value.clone()) + } else { + Err(Error::KeyNotFound(key.to_string())) + } +} + +/// Finds a node by key. 
+fn find_node(tree: &mut TST, node_id: u32, chars: &[char], pos: usize) -> Result { + let node = tree.get_node(node_id)?; + + if pos >= chars.len() { + return Ok(node_id); + } + + let current_char = chars[pos]; + + if current_char < node.character { + // Go left + if let Some(left_id) = node.left_id { + find_node(tree, left_id, chars, pos) + } else { + Err(Error::KeyNotFound(chars.iter().collect())) + } + } else if current_char > node.character { + // Go right + if let Some(right_id) = node.right_id { + find_node(tree, right_id, chars, pos) + } else { + Err(Error::KeyNotFound(chars.iter().collect())) + } + } else { + // Character matches + if pos + 1 >= chars.len() { + // This is the last character + Ok(node_id) + } else if let Some(middle_id) = node.middle_id { + // Go to next character + find_node(tree, middle_id, chars, pos + 1) + } else { + Err(Error::KeyNotFound(chars.iter().collect())) + } + } +} + +/// Deletes a key from the tree. +pub fn delete(tree: &mut TST, key: &str) -> Result<(), Error> { + if key.is_empty() { + return Err(Error::InvalidOperation("Empty key not allowed".to_string())); + } + + let root_id = match tree.root_id { + Some(id) => id, + None => return Err(Error::InvalidOperation("Tree not initialized".to_string())), + }; + + let chars: Vec = key.chars().collect(); + let node_id = find_node(tree, root_id, &chars, 0)?; + + let mut node = tree.get_node(node_id)?; + + if !node.is_end_of_key { + return Err(Error::KeyNotFound(key.to_string())); + } + + // If the node has a middle child, just mark it as not end of key + if node.middle_id.is_some() || node.left_id.is_some() || node.right_id.is_some() { + node.is_end_of_key = false; + node.value = Vec::new(); + tree.save_node(Some(node_id), &node)?; + return Ok(()); + } + + // Otherwise, we need to remove the node and update its parent + // This is more complex and would require tracking the path to the node + // For simplicity, we'll just mark it as not end of key for now + node.is_end_of_key = 
false; + node.value = Vec::new(); + tree.save_node(Some(node_id), &node)?; + + Ok(()) +} + +/// Lists all keys with a given prefix. +pub fn list(tree: &mut TST, prefix: &str) -> Result, Error> { + let root_id = match tree.root_id { + Some(id) => id, + None => return Err(Error::InvalidOperation("Tree not initialized".to_string())), + }; + + let mut result = Vec::new(); + + // Handle empty prefix case - will return all keys + if prefix.is_empty() { + collect_all_keys(tree, root_id, String::new(), &mut result)?; + return Ok(result); + } + + // Find the node corresponding to the prefix + let chars: Vec = prefix.chars().collect(); + let node_id = match find_prefix_node(tree, root_id, &chars, 0) { + Ok(id) => id, + Err(_) => return Ok(Vec::new()), // Prefix not found, return empty list + }; + + // Collect all keys from the subtree + collect_keys_with_prefix(tree, node_id, prefix.to_string(), &mut result)?; + + Ok(result) +} + +/// Finds the node corresponding to a prefix. +fn find_prefix_node(tree: &mut TST, node_id: u32, chars: &[char], pos: usize) -> Result { + if pos >= chars.len() { + return Ok(node_id); + } + + let node = tree.get_node(node_id)?; + let current_char = chars[pos]; + + if current_char < node.character { + // Go left + if let Some(left_id) = node.left_id { + find_prefix_node(tree, left_id, chars, pos) + } else { + Err(Error::PrefixNotFound(chars.iter().collect())) + } + } else if current_char > node.character { + // Go right + if let Some(right_id) = node.right_id { + find_prefix_node(tree, right_id, chars, pos) + } else { + Err(Error::PrefixNotFound(chars.iter().collect())) + } + } else { + // Character matches + if pos + 1 >= chars.len() { + // This is the last character of the prefix + Ok(node_id) + } else if let Some(middle_id) = node.middle_id { + // Go to next character + find_prefix_node(tree, middle_id, chars, pos + 1) + } else { + Err(Error::PrefixNotFound(chars.iter().collect())) + } + } +} + +/// Collects all keys with a given prefix. 
+fn collect_keys_with_prefix( + tree: &mut TST, + node_id: u32, + current_path: String, + result: &mut Vec, +) -> Result<(), Error> { + let node = tree.get_node(node_id)?; + + // If this node is an end of key, add it to the result + if node.is_end_of_key { + result.push(current_path.clone()); + } + + // Recursively collect keys from all children + if let Some(left_id) = node.left_id { + collect_all_keys(tree, left_id, current_path.clone(), result)?; + } + + if let Some(middle_id) = node.middle_id { + let mut new_path = current_path.clone(); + new_path.push(node.character); + collect_all_keys(tree, middle_id, new_path, result)?; + } + + if let Some(right_id) = node.right_id { + collect_all_keys(tree, right_id, current_path.clone(), result)?; + } + + Ok(()) +} + +/// Recursively collects all keys under a node. +fn collect_all_keys( + tree: &mut TST, + node_id: u32, + current_path: String, + result: &mut Vec, +) -> Result<(), Error> { + let node = tree.get_node(node_id)?; + + let mut new_path = current_path.clone(); + new_path.push(node.character); + + // If this node is an end of key, add it to the result + if node.is_end_of_key { + result.push(new_path.clone()); + } + + // Recursively collect keys from all children + if let Some(left_id) = node.left_id { + collect_all_keys(tree, left_id, current_path.clone(), result)?; + } + + if let Some(middle_id) = node.middle_id { + collect_all_keys(tree, middle_id, new_path.clone(), result)?; + } + + if let Some(right_id) = node.right_id { + collect_all_keys(tree, right_id, current_path.clone(), result)?; + } + + Ok(()) +} + +/// Gets all values for keys with a given prefix. 
+pub fn getall(tree: &mut TST, prefix: &str) -> Result>, Error> { + // Get all matching keys + let keys = list(tree, prefix)?; + + // Get values for each key + let mut values = Vec::new(); + for key in keys { + if let Ok(value) = get(tree, &key) { + values.push(value); + } + } + + Ok(values) +} + +impl TST { + /// Helper function to get a node from the database. + pub(crate) fn get_node(&mut self, node_id: u32) -> Result { + let data = self.db.get(node_id)?; + TSTNode::deserialize(&data) + } + + /// Helper function to save a node to the database. + pub(crate) fn save_node(&mut self, node_id: Option, node: &TSTNode) -> Result { + let data = node.serialize(); + let args = OurDBSetArgs { + id: node_id, + data: &data, + }; + Ok(self.db.set(args)?) + } +} \ No newline at end of file diff --git a/tst/src/serialize.rs b/tst/src/serialize.rs new file mode 100644 index 0000000..7924cfa --- /dev/null +++ b/tst/src/serialize.rs @@ -0,0 +1,134 @@ +//! Serialization and deserialization for TST nodes. + +use crate::error::Error; +use crate::node::TSTNode; + +/// Current binary format version. +const VERSION: u8 = 1; + +impl TSTNode { + /// Serializes a node to bytes for storage. 
+ pub fn serialize(&self) -> Vec { + let mut buffer = Vec::new(); + + // Version + buffer.push(VERSION); + + // Character (as UTF-32) + let char_bytes = (self.character as u32).to_le_bytes(); + buffer.extend_from_slice(&char_bytes); + + // Is end of key + buffer.push(if self.is_end_of_key { 1 } else { 0 }); + + // Value (only if is_end_of_key) + if self.is_end_of_key { + let value_len = (self.value.len() as u32).to_le_bytes(); + buffer.extend_from_slice(&value_len); + buffer.extend_from_slice(&self.value); + } else { + // Zero length + buffer.extend_from_slice(&[0, 0, 0, 0]); + } + + // Child pointers + let left_id = self.left_id.unwrap_or(0).to_le_bytes(); + buffer.extend_from_slice(&left_id); + + let middle_id = self.middle_id.unwrap_or(0).to_le_bytes(); + buffer.extend_from_slice(&middle_id); + + let right_id = self.right_id.unwrap_or(0).to_le_bytes(); + buffer.extend_from_slice(&right_id); + + buffer + } + + /// Deserializes bytes to a node. + pub fn deserialize(data: &[u8]) -> Result { + if data.len() < 14 { // Minimum size: version + char + is_end + value_len + 3 child IDs + return Err(Error::Deserialization("Data too short".to_string())); + } + + let mut pos = 0; + + // Version + let version = data[pos]; + pos += 1; + + if version != VERSION { + return Err(Error::Deserialization(format!("Unsupported version: {}", version))); + } + + // Character + let char_bytes = [data[pos], data[pos+1], data[pos+2], data[pos+3]]; + let char_code = u32::from_le_bytes(char_bytes); + let character = char::from_u32(char_code) + .ok_or_else(|| Error::Deserialization("Invalid character".to_string()))?; + pos += 4; + + // Is end of key + let is_end_of_key = data[pos] != 0; + pos += 1; + + // Value length + let value_len_bytes = [data[pos], data[pos+1], data[pos+2], data[pos+3]]; + let value_len = u32::from_le_bytes(value_len_bytes) as usize; + pos += 4; + + // Value + let value = if value_len > 0 { + if pos + value_len > data.len() { + return Err(Error::Deserialization("Value 
length exceeds data".to_string())); + } + data[pos..pos+value_len].to_vec() + } else { + Vec::new() + }; + pos += value_len; + + // Child pointers + if pos + 12 > data.len() { + return Err(Error::Deserialization("Data too short for child pointers".to_string())); + } + + let left_id_bytes = [data[pos], data[pos+1], data[pos+2], data[pos+3]]; + let left_id = u32::from_le_bytes(left_id_bytes); + pos += 4; + + let middle_id_bytes = [data[pos], data[pos+1], data[pos+2], data[pos+3]]; + let middle_id = u32::from_le_bytes(middle_id_bytes); + pos += 4; + + let right_id_bytes = [data[pos], data[pos+1], data[pos+2], data[pos+3]]; + let right_id = u32::from_le_bytes(right_id_bytes); + + Ok(TSTNode { + character, + value, + is_end_of_key, + left_id: if left_id == 0 { None } else { Some(left_id) }, + middle_id: if middle_id == 0 { None } else { Some(middle_id) }, + right_id: if right_id == 0 { None } else { Some(right_id) }, + }) + } +} + +/// Gets the common prefix of two strings. +pub fn get_common_prefix(a: &str, b: &str) -> String { + let mut result = String::new(); + let a_chars: Vec = a.chars().collect(); + let b_chars: Vec = b.chars().collect(); + + let min_len = a_chars.len().min(b_chars.len()); + + for i in 0..min_len { + if a_chars[i] == b_chars[i] { + result.push(a_chars[i]); + } else { + break; + } + } + + result +} \ No newline at end of file diff --git a/tst/tests/basic_test.rs b/tst/tests/basic_test.rs new file mode 100644 index 0000000..54c71bb --- /dev/null +++ b/tst/tests/basic_test.rs @@ -0,0 +1,215 @@ +use tst::TST; +use std::env::temp_dir; +use std::fs; +use std::time::SystemTime; + +fn get_test_db_path() -> String { + let timestamp = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + + let path = temp_dir().join(format!("tst_test_{}", timestamp)); + fs::create_dir_all(&path).unwrap(); + + path.to_string_lossy().to_string() +} + +fn cleanup_test_db(path: &str) { + let _ = fs::remove_dir_all(path); +} + +#[test] +fn 
test_create_tst() { + let path = get_test_db_path(); + + let result = TST::new(&path, true); + assert!(result.is_ok()); + + cleanup_test_db(&path); +} + +#[test] +fn test_set_and_get() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Test setting and getting a key + let key = "test_key"; + let value = b"test_value".to_vec(); + + let set_result = tree.set(key, value.clone()); + assert!(set_result.is_ok()); + + let get_result = tree.get(key); + assert!(get_result.is_ok()); + assert_eq!(get_result.unwrap(), value); + + cleanup_test_db(&path); +} + +#[test] +fn test_get_nonexistent_key() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Test getting a key that doesn't exist + let get_result = tree.get("nonexistent_key"); + assert!(get_result.is_err()); + + cleanup_test_db(&path); +} + +#[test] +fn test_delete() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Set a key + let key = "delete_test"; + let value = b"to_be_deleted".to_vec(); + + tree.set(key, value).unwrap(); + + // Verify it exists + let get_result = tree.get(key); + assert!(get_result.is_ok()); + + // Delete it + let delete_result = tree.delete(key); + assert!(delete_result.is_ok()); + + // Verify it's gone + let get_after_delete = tree.get(key); + assert!(get_after_delete.is_err()); + + cleanup_test_db(&path); +} + +#[test] +fn test_multiple_keys() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert multiple keys + let keys = ["apple", "banana", "cherry", "date", "elderberry"]; + + for (i, key) in keys.iter().enumerate() { + let value = format!("value_{}", i).into_bytes(); + tree.set(key, value).unwrap(); + } + + // Verify all keys exist + for (i, key) in keys.iter().enumerate() { + let expected_value = format!("value_{}", i).into_bytes(); + let get_result = tree.get(key).unwrap(); + assert_eq!(get_result, expected_value); + } + 
+ cleanup_test_db(&path); +} + +#[test] +fn test_list_prefix() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert keys with common prefixes + let keys = [ + "apple", "application", "append", + "banana", "bandana", + "cherry", "chocolate" + ]; + + for key in &keys { + tree.set(key, key.as_bytes().to_vec()).unwrap(); + } + + // Test prefix "app" + let app_keys = tree.list("app").unwrap(); + assert_eq!(app_keys.len(), 3); + assert!(app_keys.contains(&"apple".to_string())); + assert!(app_keys.contains(&"application".to_string())); + assert!(app_keys.contains(&"append".to_string())); + + // Test prefix "ban" + let ban_keys = tree.list("ban").unwrap(); + assert_eq!(ban_keys.len(), 2); + assert!(ban_keys.contains(&"banana".to_string())); + assert!(ban_keys.contains(&"bandana".to_string())); + + // Test prefix "c" + let c_keys = tree.list("c").unwrap(); + assert_eq!(c_keys.len(), 2); + assert!(c_keys.contains(&"cherry".to_string())); + assert!(c_keys.contains(&"chocolate".to_string())); + + // Test non-existent prefix + let z_keys = tree.list("z").unwrap(); + assert_eq!(z_keys.len(), 0); + + cleanup_test_db(&path); +} + +#[test] +fn test_getall_prefix() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert keys with common prefixes + let keys = [ + "apple", "application", "append", + "banana", "bandana", + "cherry", "chocolate" + ]; + + for key in &keys { + tree.set(key, key.as_bytes().to_vec()).unwrap(); + } + + // Test getall with prefix "app" + let app_values = tree.getall("app").unwrap(); + assert_eq!(app_values.len(), 3); + + // Convert values to strings for easier comparison + let app_value_strings: Vec = app_values + .iter() + .map(|v| String::from_utf8_lossy(v).to_string()) + .collect(); + + assert!(app_value_strings.contains(&"apple".to_string())); + assert!(app_value_strings.contains(&"application".to_string())); + 
assert!(app_value_strings.contains(&"append".to_string())); + + cleanup_test_db(&path); +} + +#[test] +fn test_empty_prefix() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert some keys + let keys = ["apple", "banana", "cherry"]; + + for key in &keys { + tree.set(key, key.as_bytes().to_vec()).unwrap(); + } + + // Test list with empty prefix (should return all keys) + let all_keys = tree.list("").unwrap(); + assert_eq!(all_keys.len(), keys.len()); + + for key in &keys { + assert!(all_keys.contains(&key.to_string())); + } + + cleanup_test_db(&path); +} \ No newline at end of file diff --git a/tst/tests/prefix_test.rs b/tst/tests/prefix_test.rs new file mode 100644 index 0000000..a2c7ac8 --- /dev/null +++ b/tst/tests/prefix_test.rs @@ -0,0 +1,215 @@ +use tst::TST; +use std::env::temp_dir; +use std::fs; +use std::time::SystemTime; + +fn get_test_db_path() -> String { + let timestamp = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + + let path = temp_dir().join(format!("tst_prefix_test_{}", timestamp)); + fs::create_dir_all(&path).unwrap(); + + path.to_string_lossy().to_string() +} + +fn cleanup_test_db(path: &str) { + let _ = fs::remove_dir_all(path); +} + +#[test] +fn test_prefix_with_common_prefixes() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert keys with common prefixes + let test_data = [ + ("test", b"value1".to_vec()), + ("testing", b"value2".to_vec()), + ("tested", b"value3".to_vec()), + ("tests", b"value4".to_vec()), + ("tester", b"value5".to_vec()), + ]; + + for (key, value) in &test_data { + tree.set(key, value.clone()).unwrap(); + } + + // Test prefix "test" + let keys = tree.list("test").unwrap(); + assert_eq!(keys.len(), 5); + + for (key, _) in &test_data { + assert!(keys.contains(&key.to_string())); + } + + // Test prefix "teste" + let keys = tree.list("teste").unwrap(); + assert_eq!(keys.len(), 2); + 
assert!(keys.contains(&"tested".to_string())); + assert!(keys.contains(&"tester".to_string())); + + cleanup_test_db(&path); +} + +#[test] +fn test_prefix_with_different_prefixes() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert keys with different prefixes + let test_data = [ + ("apple", b"fruit1".to_vec()), + ("banana", b"fruit2".to_vec()), + ("cherry", b"fruit3".to_vec()), + ("date", b"fruit4".to_vec()), + ("elderberry", b"fruit5".to_vec()), + ]; + + for (key, value) in &test_data { + tree.set(key, value.clone()).unwrap(); + } + + // Test each prefix + for (key, _) in &test_data { + let prefix = &key[0..1]; // First character + let keys = tree.list(prefix).unwrap(); + assert!(keys.contains(&key.to_string())); + } + + // Test non-existent prefix + let keys = tree.list("z").unwrap(); + assert_eq!(keys.len(), 0); + + cleanup_test_db(&path); +} + +#[test] +fn test_prefix_with_empty_string() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert some keys + let test_data = [ + ("apple", b"fruit1".to_vec()), + ("banana", b"fruit2".to_vec()), + ("cherry", b"fruit3".to_vec()), + ]; + + for (key, value) in &test_data { + tree.set(key, value.clone()).unwrap(); + } + + // Test empty prefix (should return all keys) + let keys = tree.list("").unwrap(); + assert_eq!(keys.len(), test_data.len()); + + for (key, _) in &test_data { + assert!(keys.contains(&key.to_string())); + } + + cleanup_test_db(&path); +} + +#[test] +fn test_getall_with_prefix() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert keys with common prefixes + let test_data = [ + ("test", b"value1".to_vec()), + ("testing", b"value2".to_vec()), + ("tested", b"value3".to_vec()), + ("tests", b"value4".to_vec()), + ("tester", b"value5".to_vec()), + ]; + + for (key, value) in &test_data { + tree.set(key, value.clone()).unwrap(); + } + + // Test getall with prefix "test" + let 
values = tree.getall("test").unwrap(); + assert_eq!(values.len(), 5); + + for (_, value) in &test_data { + assert!(values.contains(value)); + } + + cleanup_test_db(&path); +} + +#[test] +fn test_prefix_with_unicode_characters() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert keys with Unicode characters + let test_data = [ + ("café", b"coffee".to_vec()), + ("cafétéria", b"cafeteria".to_vec()), + ("caffè", b"italian coffee".to_vec()), + ("café au lait", b"coffee with milk".to_vec()), + ]; + + for (key, value) in &test_data { + tree.set(key, value.clone()).unwrap(); + } + + // Test prefix "café" + let keys = tree.list("café").unwrap(); + assert_eq!(keys.len(), 2); + assert!(keys.contains(&"café".to_string())); + assert!(keys.contains(&"café au lait".to_string())); + + // Test prefix "caf" + let keys = tree.list("caf").unwrap(); + assert_eq!(keys.len(), 4); + + for (key, _) in &test_data { + assert!(keys.contains(&key.to_string())); + } + + cleanup_test_db(&path); +} + +#[test] +fn test_prefix_with_long_keys() { + let path = get_test_db_path(); + + let mut tree = TST::new(&path, true).unwrap(); + + // Insert long keys + let test_data = [ + ("this_is_a_very_long_key_for_testing_purposes_1", b"value1".to_vec()), + ("this_is_a_very_long_key_for_testing_purposes_2", b"value2".to_vec()), + ("this_is_a_very_long_key_for_testing_purposes_3", b"value3".to_vec()), + ("this_is_another_long_key_for_testing", b"value4".to_vec()), + ]; + + for (key, value) in &test_data { + tree.set(key, value.clone()).unwrap(); + } + + // Test prefix "this_is_a_very" + let keys = tree.list("this_is_a_very").unwrap(); + assert_eq!(keys.len(), 3); + + // Test prefix "this_is" + let keys = tree.list("this_is").unwrap(); + assert_eq!(keys.len(), 4); + + for (key, _) in &test_data { + assert!(keys.contains(&key.to_string())); + } + + cleanup_test_db(&path); +} \ No newline at end of file diff --git a/tst_implementation_plan.md 
b/tst_implementation_plan.md new file mode 100644 index 0000000..dfcf609 --- /dev/null +++ b/tst_implementation_plan.md @@ -0,0 +1,365 @@ +# Ternary Search Tree (TST) Implementation Plan + +## 1. Overview + +A Ternary Search Tree (TST) is a type of trie where each node has three children: left, middle, and right. Unlike a RadixTree which compresses common prefixes, a TST stores one character per node and uses a binary search tree-like structure for efficient traversal. + +```mermaid +graph TD + A[Root Node 'r'] --> B[Left Child 'a'] + A --> C[Middle Child 'o'] + A --> D[Right Child 't'] + C --> E[Middle Child 'o'] + E --> F[Middle Child 'm' - End of Key] + F --> G[Right Child 't' - End of Key] +``` + +The TST implementation will use OurDB as the backend for persistent storage, similar to the existing RadixTree implementation. The goal is to provide a more balanced tree structure that offers consistent performance across all operations (set, get, delete, list). + +## 2. Core Data Structures + +### 2.1 TST Node Structure + +```rust +pub struct TSTNode { + // The character stored at this node + pub character: char, + + // Value stored at this node (empty if not end of key) + pub value: Vec<u8>, + + // Whether this node represents the end of a key + pub is_end_of_key: bool, + + // References to child nodes + pub left_id: Option<u32>, // For characters < current character + pub middle_id: Option<u32>, // For characters == current character (next character in key) + pub right_id: Option<u32>, // For characters > current character +} +``` + +### 2.2 TST Structure + +```rust +pub struct TST { + // Database for persistent storage + db: OurDB, + + // Database ID of the root node + root_id: Option<u32>, +} +``` + +## 3.
API Design + +The TST will maintain similar core functionality to RadixTree but with an API that better suits its structure: + +```rust +impl TST { + // Creates a new TST with the specified database path + pub fn new(path: &str, reset: bool) -> Result<Self, Error>; + + // Sets a key-value pair in the tree + pub fn set(&mut self, key: &str, value: Vec<u8>) -> Result<(), Error>; + + // Gets a value by key from the tree + pub fn get(&mut self, key: &str) -> Result<Vec<u8>, Error>; + + // Deletes a key from the tree + pub fn delete(&mut self, key: &str) -> Result<(), Error>; + + // Lists all keys with a given prefix + pub fn list(&mut self, prefix: &str) -> Result<Vec<String>, Error>; + + // Gets all values for keys with a given prefix + pub fn getall(&mut self, prefix: &str) -> Result<Vec<Vec<u8>>, Error>; +} +``` + +## 4. Implementation Strategy + +### 4.1 Phase 1: Core Data Structures and Serialization + +```mermaid +graph TD + A[Define TSTNode and TST structs] --> B[Implement serialization/deserialization] + B --> C[Implement Error handling] + C --> D[Implement OurDB integration] +``` + +1. Define the `TSTNode` and `TST` structs +2. Implement serialization and deserialization for `TSTNode` +3. Define error types for TST-specific errors +4. Implement OurDB integration for node storage and retrieval + +### 4.2 Phase 2: Basic Tree Operations + +```mermaid +graph TD + A[Implement new] --> B[Implement set] + B --> C[Implement get] + C --> D[Implement helper functions] +``` + +1. Implement the `new()` function for creating a new TST +2. Implement the `set()` function for inserting key-value pairs +3. Implement the `get()` function for retrieving values +4. Implement helper functions for node traversal and manipulation + +### 4.3 Phase 3: Advanced Tree Operations + +```mermaid +graph TD + A[Implement delete] --> B[Implement list] + B --> C[Implement getall] + C --> D[Optimize operations] +``` + +1. Implement the `delete()` function for removing keys +2. Implement the `list()` function for prefix-based key listing +3.
Implement the `getall()` function for retrieving all values with a prefix +4. Optimize operations for balanced performance + +### 4.4 Phase 4: Testing and Performance Evaluation + +```mermaid +graph TD + A[Create unit tests] --> B[Create integration tests] + B --> C[Create performance tests] + C --> D[Compare with RadixTree] + D --> E[Optimize based on results] +``` + +1. Create unit tests for each component +2. Create integration tests for the complete system +3. Create performance tests similar to RadixTree's +4. Compare performance with RadixTree +5. Optimize based on performance results + +## 5. Implementation Details + +### 5.1 Node Structure and Storage + +Each TST node will store a single character and have three child pointers (left, middle, right). The nodes will be serialized and stored in OurDB, with node references using OurDB record IDs. + +### 5.2 Key Operations + +#### 5.2.1 Insertion (set) + +```mermaid +graph TD + A[Start at root] --> B{Root exists?} + B -- No --> C[Create root node] + B -- Yes --> D[Compare current char with node char] + D -- Less than --> E[Go to left child] + D -- Equal to --> F[Go to middle child] + D -- Greater than --> G[Go to right child] + E --> H{Child exists?} + F --> H + G --> H + H -- No --> I[Create new node] + H -- Yes --> J[Continue with next char] + I --> J + J --> K{End of key?} + K -- Yes --> L[Set value and mark as end of key] + K -- No --> D +``` + +1. Start at the root node +2. For each character in the key: + - If the character is less than the current node's character, go to the left child and compare the same character again + - If the character is equal to the current node's character, move on to the next character of the key and go to the middle child (unless this was the last character of the key) + - If the character is greater than the current node's character, go to the right child and compare the same character again + - If the child doesn't exist, create a new node holding the character being compared +3. When the last character of the key has been matched, set the value on that node and mark it as end of key + +#### 5.2.2 Lookup (get) + +1. Start at the root node +2.
For each character in the key: + - If the character is less than the current node's character, go to the left child and compare the same character again + - If the character is equal to the current node's character, move on to the next character of the key and go to the middle child (unless this was the last character of the key) + - If the character is greater than the current node's character, go to the right child and compare the same character again + - If the child doesn't exist, the key is not found +3. When the last character of the key has been matched, check if that node is marked as end of key + - If yes, return the value + - If no, the key is not found + +#### 5.2.3 Deletion (delete) + +1. Find the node corresponding to the end of the key +2. If the node has no children, remove it and update its parent +3. If the node has children, mark it as not end of key and clear its value +4. Recursively clean up any nodes that are no longer needed + +#### 5.2.4 Prefix Operations (list, getall) + +1. Find the node corresponding to the end of the prefix +2. Perform a traversal of the subtree rooted at that node +3. Collect all keys (for list) or values (for getall) from nodes marked as end of key + +### 5.3 Serialization and OurDB Integration + +#### 5.3.1 Node Structure for Serialization + +Each TSTNode will be serialized with the following logical structure: + +1. Version marker (for future format evolution) +2. Character data +3. Is-end-of-key flag +4. Value length followed by the value bytes (the length is written as zero when is-end-of-key is false) +5. Child node references (left, middle, right) + +#### 5.3.2 OurDB Integration + +The TST will use OurDB for node storage and retrieval: + +1. **Node Storage**: Each node will be serialized and stored as a record in OurDB. +```rust +fn save_node(&mut self, node_id: Option<u32>, node: &TSTNode) -> Result<u32, Error> { + let data = node.serialize(); + let args = OurDBSetArgs { + id: node_id, + data: &data, + }; + Ok(self.db.set(args)?) +} +``` + +2. **Node Retrieval**: Nodes will be retrieved from OurDB and deserialized. +```rust +fn get_node(&mut self, node_id: u32) -> Result<TSTNode, Error> { + let data = self.db.get(node_id)?; + TSTNode::deserialize(&data) +} +``` + +3.
**Root Node Management**: The TST will maintain a root node ID for traversal. + +#### 5.3.3 Handling Large Datasets + +For large datasets, we'll implement a batching approach similar to the RadixTree's large-scale tests: + +1. **Batch Processing**: Process large datasets in manageable batches to avoid OurDB size limitations. +2. **Database Partitioning**: Create separate database instances for very large datasets. +3. **Memory Management**: Implement efficient memory usage patterns to avoid excessive memory consumption. + +## 6. Project Structure + +``` +tst/ +├── Cargo.toml +├── src/ +│ ├── lib.rs # Public API and re-exports +│ ├── node.rs # TSTNode implementation +│ ├── serialize.rs # Serialization and deserialization +│ ├── error.rs # Error types +│ └── operations.rs # Tree operations implementation +├── tests/ +│ ├── basic_test.rs # Basic operations tests +│ ├── prefix_test.rs # Prefix operations tests +│ └── edge_cases.rs # Edge case tests +└── examples/ + ├── basic_usage.rs # Basic usage example + ├── prefix_ops.rs # Prefix operations example + └── performance.rs # Performance benchmark +``` + +## 7. Performance Considerations + +### 7.1 Advantages of TST over RadixTree + +1. **Balanced Structure**: TST naturally maintains a more balanced structure, which can lead to more consistent performance across operations. +2. **Character-by-Character Comparison**: TST performs character-by-character comparisons, which can be more efficient for certain workloads. +3. **Efficient Prefix Operations**: TST can efficiently handle prefix operations by traversing the middle child path. + +### 7.2 Potential Optimizations + +1. **Node Caching**: Cache frequently accessed nodes to reduce database operations. +2. **Balancing Techniques**: Implement balancing techniques to ensure the tree remains balanced. +3. **Batch Operations**: Support batch operations for improved performance. +4. 
**Memory Management**: Implement efficient memory usage patterns to avoid excessive memory consumption. + +## 8. Testing Strategy + +### 8.1 Unit Tests + +1. Test `TSTNode` serialization/deserialization +2. Test character comparison operations +3. Test error handling + +### 8.2 Integration Tests + +1. Test basic CRUD operations +2. Test prefix operations +3. Test edge cases (empty keys, very long keys, etc.) +4. Test with large datasets + +### 8.3 Performance Tests + +1. Measure throughput for set/get operations +2. Measure latency for different operations +3. Test with different tree sizes and key distributions +4. Compare performance with RadixTree + +#### 8.3.1 Performance Benchmarking + +We'll create comprehensive benchmarks to compare the TST implementation with RadixTree: + +```rust +// Example benchmark structure +fn benchmark_set_operations(tree_type: &str, num_records: usize) -> Duration { + let start_time = Instant::now(); + + // Create tree (TST or RadixTree) + let mut tree = match tree_type { + "tst" => create_tst(), + "radix" => create_radix_tree(), + _ => panic!("Unknown tree type"), + }; + + // Insert records + for i in 0..num_records { + let key = format!("key:{:08}", i); + let value = format!("val{}", i).into_bytes(); + tree.set(&key, value).unwrap(); + } + + start_time.elapsed() +} +``` + +We'll benchmark the following operations: +- Set (insertion) +- Get (lookup) +- Delete +- List (prefix search) +- GetAll (prefix values) + +For each operation, we'll measure: +- Throughput (operations per second) +- Latency (time per operation) +- Memory usage +- Database size + +We'll test with various dataset characteristics: +- Small datasets (100-1,000 keys) +- Medium datasets (10,000-100,000 keys) +- Large datasets (1,000,000+ keys) +- Keys with common prefixes +- Keys with random distribution +- Long keys vs. short keys + +## 9. Timeline and Milestones + +1. **Week 1**: Core data structures and serialization +2. **Week 2**: Basic tree operations +3. 
**Week 3**: Advanced tree operations +4. **Week 4**: Testing and performance evaluation +5. **Week 5**: Optimization and documentation + +## 10. Conclusion + +This implementation plan provides a roadmap for creating a Ternary Search Tree (TST) as an alternative to the RadixTree implementation. The TST will maintain the same core functionality while providing a more balanced tree structure and aiming for balanced performance across all operations. + +The implementation will leverage OurDB for persistent storage, similar to RadixTree, but with a different node structure and traversal algorithm that better suits the TST approach. \ No newline at end of file