diff --git a/src/tantivy_search.rs b/src/tantivy_search.rs index 33b8c84..7615d35 100644 --- a/src/tantivy_search.rs +++ b/src/tantivy_search.rs @@ -1,18 +1,16 @@ +use crate::error::DBError; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::{Arc, RwLock}; use tantivy::{ collector::TopDocs, directory::MmapDirectory, - query::{QueryParser, BooleanQuery, Query, TermQuery, Occur}, - schema::{Schema, Field, TextOptions, TextFieldIndexing, - STORED, STRING, Value}, - Index, IndexWriter, IndexReader, ReloadPolicy, - Term, DateTime, TantivyDocument, - tokenizer::{TokenizerManager}, + query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}, + schema::{Field, Schema, TextFieldIndexing, TextOptions, Value, STORED, STRING}, + tokenizer::TokenizerManager, + DateTime, Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term, }; -use std::path::PathBuf; -use std::sync::{Arc, RwLock}; -use std::collections::HashMap; -use crate::error::DBError; -use serde::{Serialize, Deserialize}; #[derive(Debug, Clone, Serialize, Deserialize)] pub enum FieldDef { @@ -100,35 +98,48 @@ impl TantivySearch { // Always add a document ID field let id_field = schema_builder.add_text_field("_id", STRING | STORED); - fields.insert("_id".to_string(), (id_field, FieldDef::Text { - stored: true, - indexed: true, - tokenized: false, - fast: false, - })); + fields.insert( + "_id".to_string(), + ( + id_field, + FieldDef::Text { + stored: true, + indexed: true, + tokenized: false, + fast: false, + }, + ), + ); // Add user-defined fields for (field_name, field_def) in field_definitions { let field = match &field_def { - FieldDef::Text { stored, indexed, tokenized, fast: _fast } => { + FieldDef::Text { + stored, + indexed, + tokenized, + fast: _fast, + } => { let mut text_options = TextOptions::default(); - + if *stored { text_options = text_options.set_stored(); } - + if *indexed { let indexing_options = if *tokenized { TextFieldIndexing::default() .set_tokenizer("default") - .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions) + .set_index_option( + tantivy::schema::IndexRecordOption::WithFreqsAndPositions, + ) } else { TextFieldIndexing::default() .set_tokenizer("raw") .set_index_option(tantivy::schema::IndexRecordOption::Basic) }; text_options = text_options.set_indexing_options(indexing_options); - + let f = schema_builder.add_text_field(&field_name, text_options); if *tokenized { default_search_fields.push(f); @@ -138,39 +149,70 @@ impl TantivySearch { schema_builder.add_text_field(&field_name, text_options) } } - FieldDef::Numeric { stored, indexed, fast, precision } => { - match precision { - NumericType::I64 => { - let mut opts = tantivy::schema::NumericOptions::default(); - if *stored { opts = opts.set_stored(); } - if *indexed { opts = opts.set_indexed(); } - if *fast { opts = opts.set_fast(); } - schema_builder.add_i64_field(&field_name, opts) + FieldDef::Numeric { + stored, + indexed, + fast, + precision, + } => match precision { + NumericType::I64 => { + let mut opts = tantivy::schema::NumericOptions::default(); + if *stored { + opts = opts.set_stored(); } - NumericType::U64 => { - let mut opts = tantivy::schema::NumericOptions::default(); - if *stored { opts = opts.set_stored(); } - if *indexed { opts = opts.set_indexed(); } - if *fast { opts = opts.set_fast(); } - schema_builder.add_u64_field(&field_name, opts) + if *indexed { + opts = opts.set_indexed(); } - NumericType::F64 => { - let mut opts = tantivy::schema::NumericOptions::default(); - if *stored { opts = opts.set_stored(); } - if *indexed { opts = opts.set_indexed(); } - if *fast { opts = opts.set_fast(); } - schema_builder.add_f64_field(&field_name, opts) - } - NumericType::Date => { - let mut opts = tantivy::schema::DateOptions::default(); - if *stored { opts = opts.set_stored(); } - if *indexed { opts = opts.set_indexed(); } - if *fast { opts = opts.set_fast(); } - schema_builder.add_date_field(&field_name, opts) + if *fast { + opts = opts.set_fast(); } + schema_builder.add_i64_field(&field_name, opts) } - } - FieldDef::Tag { stored, separator: _, case_sensitive: _ } => { + NumericType::U64 => { + let mut opts = tantivy::schema::NumericOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_u64_field(&field_name, opts) + } + NumericType::F64 => { + let mut opts = tantivy::schema::NumericOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_f64_field(&field_name, opts) + } + NumericType::Date => { + let mut opts = tantivy::schema::DateOptions::default(); + if *stored { + opts = opts.set_stored(); + } + if *indexed { + opts = opts.set_indexed(); + } + if *fast { + opts = opts.set_fast(); + } + schema_builder.add_date_field(&field_name, opts) + } + }, + FieldDef::Tag { + stored, + separator: _, + case_sensitive: _, + } => { let mut text_options = TextOptions::default(); if *stored { text_options = text_options.set_stored(); @@ -178,35 +220,51 @@ impl TantivySearch { text_options = text_options.set_indexing_options( TextFieldIndexing::default() .set_tokenizer("raw") - .set_index_option(tantivy::schema::IndexRecordOption::Basic) + .set_index_option(tantivy::schema::IndexRecordOption::Basic), ); schema_builder.add_text_field(&field_name, text_options) } FieldDef::Geo { stored } => { // For now, store as two f64 fields for lat/lon let mut opts = tantivy::schema::NumericOptions::default(); - if *stored { opts = opts.set_stored(); } + if *stored { + opts = opts.set_stored(); + } opts = opts.set_indexed().set_fast(); - - let lat_field = schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone()); - let lon_field = schema_builder.add_f64_field(&format!("{}_lon", field_name), opts); - - fields.insert(format!("{}_lat", field_name), (lat_field, FieldDef::Numeric { - stored: *stored, - indexed: true, - fast: true, - precision: NumericType::F64, - })); - fields.insert(format!("{}_lon", field_name), (lon_field, FieldDef::Numeric { - stored: *stored, - indexed: true, - fast: true, - precision: NumericType::F64, - })); + + let lat_field = + schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone()); + let lon_field = + schema_builder.add_f64_field(&format!("{}_lon", field_name), opts); + + fields.insert( + format!("{}_lat", field_name), + ( + lat_field, + FieldDef::Numeric { + stored: *stored, + indexed: true, + fast: true, + precision: NumericType::F64, + }, + ), + ); + fields.insert( + format!("{}_lon", field_name), + ( + lon_field, + FieldDef::Numeric { + stored: *stored, + indexed: true, + fast: true, + precision: NumericType::F64, + }, + ), + ); continue; // Skip adding the geo field itself } }; - + fields.insert(field_name.clone(), (field, field_def)); } @@ -220,7 +278,7 @@ impl TantivySearch { // Create or open index let dir = MmapDirectory::open(&index_path) .map_err(|e| DBError(format!("Failed to open index directory: {}", e)))?; - + let mut index = Index::open_or_create(dir, schema) .map_err(|e| DBError(format!("Failed to create index: {}", e)))?; @@ -228,7 +286,8 @@ impl TantivySearch { let tokenizer_manager = TokenizerManager::default(); index.set_tokenizers(tokenizer_manager); - let writer = index.writer(1_000_000) + let writer = index + .writer(15_000_000) .map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?; let reader = index @@ -254,7 +313,9 @@ impl TantivySearch { doc_id: &str, fields: HashMap, ) -> Result<(), DBError> { - let mut writer = self.writer.write() + let mut writer = self + .writer + .write() .map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?; // Delete existing document with same ID @@ -264,7 +325,7 @@ impl TantivySearch { // Create new document let mut doc = tantivy::doc!(); - + // Add document ID if let Some((id_field, _)) = self.index_schema.fields.get("_id") { doc.add_text(*id_field, doc_id); @@ -277,37 +338,39 @@ impl TantivySearch { FieldDef::Text { .. } => { doc.add_text(*field, &field_value); } - FieldDef::Numeric { precision, .. } => { - match precision { - NumericType::I64 => { - if let Ok(v) = field_value.parse::() { - doc.add_i64(*field, v); - } - } - NumericType::U64 => { - if let Ok(v) = field_value.parse::() { - doc.add_u64(*field, v); - } - } - NumericType::F64 => { - if let Ok(v) = field_value.parse::() { - doc.add_f64(*field, v); - } - } - NumericType::Date => { - if let Ok(v) = field_value.parse::() { - doc.add_date(*field, DateTime::from_timestamp_millis(v)); - } + FieldDef::Numeric { precision, .. } => match precision { + NumericType::I64 => { + if let Ok(v) = field_value.parse::() { + doc.add_i64(*field, v); } } - } - FieldDef::Tag { separator, case_sensitive, .. } => { + NumericType::U64 => { + if let Ok(v) = field_value.parse::() { + doc.add_u64(*field, v); + } + } + NumericType::F64 => { + if let Ok(v) = field_value.parse::() { + doc.add_f64(*field, v); + } + } + NumericType::Date => { + if let Ok(v) = field_value.parse::() { + doc.add_date(*field, DateTime::from_timestamp_millis(v)); + } + } + }, + FieldDef::Tag { + separator, + case_sensitive, + .. + } => { let tags = if !case_sensitive { field_value.to_lowercase() } else { field_value.clone() }; - + // Store tags as separate terms for efficient filtering for tag in tags.split(separator.as_str()) { doc.add_text(*field, tag.trim()); @@ -317,11 +380,17 @@ impl TantivySearch { // Parse "lat,lon" format let parts: Vec<&str> = field_value.split(',').collect(); if parts.len() == 2 { - if let (Ok(lat), Ok(lon)) = (parts[0].parse::(), parts[1].parse::()) { - if let Some((lat_field, _)) = self.index_schema.fields.get(&format!("{}_lat", field_name)) { + if let (Ok(lat), Ok(lon)) = + (parts[0].parse::(), parts[1].parse::()) + { + if let Some((lat_field, _)) = + self.index_schema.fields.get(&format!("{}_lat", field_name)) + { doc.add_f64(*lat_field, lat); } - if let Some((lon_field, _)) = self.index_schema.fields.get(&format!("{}_lon", field_name)) { + if let Some((lon_field, _)) = + self.index_schema.fields.get(&format!("{}_lon", field_name)) + { doc.add_f64(*lon_field, lon); } } @@ -331,9 +400,12 @@ impl TantivySearch { } } - writer.add_document(doc).map_err(|e| DBError(format!("Failed to add document: {}", e)))?; + writer + .add_document(doc) + .map_err(|e| DBError(format!("Failed to add document: {}", e)))?; - writer.commit() + writer + .commit() .map_err(|e| DBError(format!("Failed to commit: {}", e)))?; Ok(()) @@ -345,18 +417,23 @@ impl TantivySearch { options: SearchOptions, ) -> Result { let searcher = self.reader.searcher(); - + // Parse query based on search fields let query: Box = if self.index_schema.default_search_fields.is_empty() { - return Err(DBError("No searchable fields defined in schema".to_string())); + return Err(DBError( + "No searchable fields defined in schema".to_string(), + )); } else { let query_parser = QueryParser::for_index( &self.index, self.index_schema.default_search_fields.clone(), ); - - Box::new(query_parser.parse_query(query_str) - .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?) + + Box::new( + query_parser + .parse_query(query_str) + .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?, + ) }; // Apply filters if any @@ -392,32 +469,34 @@ impl TantivySearch { } } } - + Box::new(BooleanQuery::new(clauses)) } else { query }; // Execute search - let top_docs = searcher.search( - &*final_query, - &TopDocs::with_limit(options.limit + options.offset) - ).map_err(|e| DBError(format!("Search failed: {}", e)))?; + let top_docs = searcher + .search( + &*final_query, + &TopDocs::with_limit(options.limit + options.offset), + ) + .map_err(|e| DBError(format!("Search failed: {}", e)))?; let total_hits = top_docs.len(); let mut documents = Vec::new(); for (score, doc_address) in top_docs.iter().skip(options.offset).take(options.limit) { - let retrieved_doc: TantivyDocument = searcher.doc(*doc_address) + let retrieved_doc: TantivyDocument = searcher + .doc(*doc_address) .map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?; let mut doc_fields = HashMap::new(); - + // Extract all stored fields for (field_name, (field, field_def)) in &self.index_schema.fields { match field_def { - FieldDef::Text { stored, .. } | - FieldDef::Tag { stored, .. } => { + FieldDef::Text { stored, .. } | FieldDef::Tag { stored, .. } => { if *stored { if let Some(value) = retrieved_doc.get_first(*field) { if let Some(text) = value.as_str() { @@ -426,31 +505,29 @@ impl TantivySearch { } } } - FieldDef::Numeric { stored, precision, .. } => { + FieldDef::Numeric { + stored, precision, .. + } => { if *stored { let value_str = match precision { - NumericType::I64 => { - retrieved_doc.get_first(*field) - .and_then(|v| v.as_i64()) - .map(|v| v.to_string()) - } - NumericType::U64 => { - retrieved_doc.get_first(*field) - .and_then(|v| v.as_u64()) - .map(|v| v.to_string()) - } - NumericType::F64 => { - retrieved_doc.get_first(*field) - .and_then(|v| v.as_f64()) - .map(|v| v.to_string()) - } - NumericType::Date => { - retrieved_doc.get_first(*field) - .and_then(|v| v.as_datetime()) - .map(|v| v.into_timestamp_millis().to_string()) - } + NumericType::I64 => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_i64()) + .map(|v| v.to_string()), + NumericType::U64 => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_u64()) + .map(|v| v.to_string()), + NumericType::F64 => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_f64()) + .map(|v| v.to_string()), + NumericType::Date => retrieved_doc + .get_first(*field) + .and_then(|v| v.as_datetime()) + .map(|v| v.into_timestamp_millis().to_string()), }; - + if let Some(v) = value_str { doc_fields.insert(field_name.clone(), v); } @@ -458,12 +535,22 @@ impl TantivySearch { } FieldDef::Geo { stored } => { if *stored { - let lat_field = self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0; - let lon_field = self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0; - + let lat_field = self + .index_schema + .fields + .get(&format!("{}_lat", field_name)) + .unwrap() + .0; + let lon_field = self + .index_schema + .fields + .get(&format!("{}_lon", field_name)) + .unwrap() + .0; + let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64()); let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64()); - + if let (Some(lat), Some(lon)) = (lat, lon) { doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon)); } @@ -487,13 +574,16 @@ impl TantivySearch { pub fn get_info(&self) -> Result { let searcher = self.reader.searcher(); let num_docs = searcher.num_docs(); - - let fields_info: Vec = self.index_schema.fields.iter().map(|(name, (_, def))| { - FieldInfo { + + let fields_info: Vec = self + .index_schema + .fields + .iter() + .map(|(name, (_, def))| FieldInfo { name: name.clone(), field_type: format!("{:?}", def), - } - }).collect(); + }) + .collect(); Ok(IndexInfo { name: self.name.clone(), @@ -564,4 +654,4 @@ pub struct IndexInfo { pub struct FieldInfo { pub name: String, pub field_type: String, -} \ No newline at end of file +}