tantivy #5

Open
despiegk wants to merge 18 commits from tantivy into main
Showing only changes of commit e9675aafed

@@ -1,18 +1,16 @@
+use crate::error::DBError;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::PathBuf;
+use std::sync::{Arc, RwLock};
 use tantivy::{
     collector::TopDocs,
     directory::MmapDirectory,
-    query::{QueryParser, BooleanQuery, Query, TermQuery, Occur},
-    schema::{Schema, Field, TextOptions, TextFieldIndexing,
-        STORED, STRING, Value},
-    Index, IndexWriter, IndexReader, ReloadPolicy,
-    Term, DateTime, TantivyDocument,
-    tokenizer::{TokenizerManager},
+    query::{BooleanQuery, Occur, Query, QueryParser, TermQuery},
+    schema::{Field, Schema, TextFieldIndexing, TextOptions, Value, STORED, STRING},
+    tokenizer::TokenizerManager,
+    DateTime, Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term,
 };
-use std::path::PathBuf;
-use std::sync::{Arc, RwLock};
-use std::collections::HashMap;
-use crate::error::DBError;
-use serde::{Serialize, Deserialize};

 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub enum FieldDef {
@@ -100,17 +98,28 @@ impl TantivySearch {
         // Always add a document ID field
         let id_field = schema_builder.add_text_field("_id", STRING | STORED);
-        fields.insert("_id".to_string(), (id_field, FieldDef::Text {
-            stored: true,
-            indexed: true,
-            tokenized: false,
-            fast: false,
-        }));
+        fields.insert(
+            "_id".to_string(),
+            (
+                id_field,
+                FieldDef::Text {
+                    stored: true,
+                    indexed: true,
+                    tokenized: false,
+                    fast: false,
+                },
+            ),
+        );

         // Add user-defined fields
         for (field_name, field_def) in field_definitions {
             let field = match &field_def {
-                FieldDef::Text { stored, indexed, tokenized, fast: _fast } => {
+                FieldDef::Text {
+                    stored,
+                    indexed,
+                    tokenized,
+                    fast: _fast,
+                } => {
                     let mut text_options = TextOptions::default();
                     if *stored {
@@ -121,7 +130,9 @@ impl TantivySearch {
                     let indexing_options = if *tokenized {
                         TextFieldIndexing::default()
                             .set_tokenizer("default")
-                            .set_index_option(tantivy::schema::IndexRecordOption::WithFreqsAndPositions)
+                            .set_index_option(
+                                tantivy::schema::IndexRecordOption::WithFreqsAndPositions,
+                            )
                     } else {
                         TextFieldIndexing::default()
                             .set_tokenizer("raw")
@@ -138,39 +149,70 @@ impl TantivySearch {
                         schema_builder.add_text_field(&field_name, text_options)
                     }
                 }
-                FieldDef::Numeric { stored, indexed, fast, precision } => {
-                    match precision {
-                        NumericType::I64 => {
-                            let mut opts = tantivy::schema::NumericOptions::default();
-                            if *stored { opts = opts.set_stored(); }
-                            if *indexed { opts = opts.set_indexed(); }
-                            if *fast { opts = opts.set_fast(); }
-                            schema_builder.add_i64_field(&field_name, opts)
-                        }
-                        NumericType::U64 => {
-                            let mut opts = tantivy::schema::NumericOptions::default();
-                            if *stored { opts = opts.set_stored(); }
-                            if *indexed { opts = opts.set_indexed(); }
-                            if *fast { opts = opts.set_fast(); }
-                            schema_builder.add_u64_field(&field_name, opts)
-                        }
-                        NumericType::F64 => {
-                            let mut opts = tantivy::schema::NumericOptions::default();
-                            if *stored { opts = opts.set_stored(); }
-                            if *indexed { opts = opts.set_indexed(); }
-                            if *fast { opts = opts.set_fast(); }
-                            schema_builder.add_f64_field(&field_name, opts)
-                        }
-                        NumericType::Date => {
-                            let mut opts = tantivy::schema::DateOptions::default();
-                            if *stored { opts = opts.set_stored(); }
-                            if *indexed { opts = opts.set_indexed(); }
-                            if *fast { opts = opts.set_fast(); }
-                            schema_builder.add_date_field(&field_name, opts)
-                        }
-                    }
-                }
-                FieldDef::Tag { stored, separator: _, case_sensitive: _ } => {
+                FieldDef::Numeric {
+                    stored,
+                    indexed,
+                    fast,
+                    precision,
+                } => match precision {
+                    NumericType::I64 => {
+                        let mut opts = tantivy::schema::NumericOptions::default();
+                        if *stored {
+                            opts = opts.set_stored();
+                        }
+                        if *indexed {
+                            opts = opts.set_indexed();
+                        }
+                        if *fast {
+                            opts = opts.set_fast();
+                        }
+                        schema_builder.add_i64_field(&field_name, opts)
+                    }
+                    NumericType::U64 => {
+                        let mut opts = tantivy::schema::NumericOptions::default();
+                        if *stored {
+                            opts = opts.set_stored();
+                        }
+                        if *indexed {
+                            opts = opts.set_indexed();
+                        }
+                        if *fast {
+                            opts = opts.set_fast();
+                        }
+                        schema_builder.add_u64_field(&field_name, opts)
+                    }
+                    NumericType::F64 => {
+                        let mut opts = tantivy::schema::NumericOptions::default();
+                        if *stored {
+                            opts = opts.set_stored();
+                        }
+                        if *indexed {
+                            opts = opts.set_indexed();
+                        }
+                        if *fast {
+                            opts = opts.set_fast();
+                        }
+                        schema_builder.add_f64_field(&field_name, opts)
+                    }
+                    NumericType::Date => {
+                        let mut opts = tantivy::schema::DateOptions::default();
+                        if *stored {
+                            opts = opts.set_stored();
+                        }
+                        if *indexed {
+                            opts = opts.set_indexed();
+                        }
+                        if *fast {
+                            opts = opts.set_fast();
+                        }
+                        schema_builder.add_date_field(&field_name, opts)
+                    }
+                },
+                FieldDef::Tag {
+                    stored,
+                    separator: _,
+                    case_sensitive: _,
+                } => {
                     let mut text_options = TextOptions::default();
                     if *stored {
                         text_options = text_options.set_stored();
@@ -178,31 +220,47 @@ impl TantivySearch {
                     text_options = text_options.set_indexing_options(
                         TextFieldIndexing::default()
                             .set_tokenizer("raw")
-                            .set_index_option(tantivy::schema::IndexRecordOption::Basic)
+                            .set_index_option(tantivy::schema::IndexRecordOption::Basic),
                     );
                     schema_builder.add_text_field(&field_name, text_options)
                 }
                 FieldDef::Geo { stored } => {
                     // For now, store as two f64 fields for lat/lon
                     let mut opts = tantivy::schema::NumericOptions::default();
-                    if *stored { opts = opts.set_stored(); }
+                    if *stored {
+                        opts = opts.set_stored();
+                    }
                     opts = opts.set_indexed().set_fast();
-                    let lat_field = schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone());
-                    let lon_field = schema_builder.add_f64_field(&format!("{}_lon", field_name), opts);
+                    let lat_field =
+                        schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone());
+                    let lon_field =
+                        schema_builder.add_f64_field(&format!("{}_lon", field_name), opts);

-                    fields.insert(format!("{}_lat", field_name), (lat_field, FieldDef::Numeric {
-                        stored: *stored,
-                        indexed: true,
-                        fast: true,
-                        precision: NumericType::F64,
-                    }));
-                    fields.insert(format!("{}_lon", field_name), (lon_field, FieldDef::Numeric {
-                        stored: *stored,
-                        indexed: true,
-                        fast: true,
-                        precision: NumericType::F64,
-                    }));
+                    fields.insert(
+                        format!("{}_lat", field_name),
+                        (
+                            lat_field,
+                            FieldDef::Numeric {
+                                stored: *stored,
+                                indexed: true,
+                                fast: true,
+                                precision: NumericType::F64,
+                            },
+                        ),
+                    );
+                    fields.insert(
+                        format!("{}_lon", field_name),
+                        (
+                            lon_field,
+                            FieldDef::Numeric {
+                                stored: *stored,
+                                indexed: true,
+                                fast: true,
+                                precision: NumericType::F64,
+                            },
+                        ),
+                    );

                     continue; // Skip adding the geo field itself
                 }
             };
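For context, the match reformatted in the hunks above consumes the caller-supplied field definitions. A hypothetical definition set covering the four FieldDef variants is sketched below; the variant and field names come from the diff, while the surrounding setup (a plain HashMap, the crate's FieldDef/NumericType being in scope, and separator being a String) is assumed for illustration.

    use std::collections::HashMap;

    // Hypothetical field definitions; separator type and value are assumptions.
    let mut field_definitions: HashMap<String, FieldDef> = HashMap::new();
    field_definitions.insert(
        "title".to_string(),
        FieldDef::Text { stored: true, indexed: true, tokenized: true, fast: false },
    );
    field_definitions.insert(
        "price".to_string(),
        FieldDef::Numeric { stored: true, indexed: true, fast: true, precision: NumericType::F64 },
    );
    field_definitions.insert(
        "tags".to_string(),
        FieldDef::Tag { stored: true, separator: ",".to_string(), case_sensitive: false },
    );
    // Geo expands into "<name>_lat" / "<name>_lon" f64 fields, as shown above.
    field_definitions.insert("location".to_string(), FieldDef::Geo { stored: true });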
@@ -228,7 +286,8 @@ impl TantivySearch {
         let tokenizer_manager = TokenizerManager::default();
         index.set_tokenizers(tokenizer_manager);

-        let writer = index.writer(1_000_000)
+        let writer = index
+            .writer(15_000_000)
             .map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?;

         let reader = index
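Aside from formatting, this hunk carries the one functional change in the commit: the writer's memory budget rises from 1_000_000 to 15_000_000 bytes. Recent tantivy releases enforce a minimum memory arena per indexing thread (15 MB at the time of writing), so the old 1 MB budget would be rejected when the writer is created. A self-contained sketch of that behaviour, assuming a current tantivy release (around 0.22):

    fn main() -> tantivy::Result<()> {
        let mut builder = tantivy::schema::Schema::builder();
        builder.add_text_field("body", tantivy::schema::TEXT);
        let index = tantivy::Index::create_in_ram(builder.build());

        // A budget below the per-thread minimum is rejected on recent releases
        // (older versions accepted smaller arenas).
        match index.writer::<tantivy::TantivyDocument>(1_000_000) {
            Ok(_) => println!("accepted (older tantivy with a lower minimum)"),
            Err(e) => println!("rejected: {e}"),
        }

        // 15 MB, the value used in the patch, satisfies the minimum.
        let _writer = index.writer::<tantivy::TantivyDocument>(15_000_000)?;
        Ok(())
    }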
@@ -254,7 +313,9 @@ impl TantivySearch {
         doc_id: &str,
         fields: HashMap<String, String>,
     ) -> Result<(), DBError> {
-        let mut writer = self.writer.write()
+        let mut writer = self
+            .writer
+            .write()
             .map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?;

         // Delete existing document with same ID
@@ -277,8 +338,7 @@ impl TantivySearch {
                 FieldDef::Text { .. } => {
                     doc.add_text(*field, &field_value);
                 }
-                FieldDef::Numeric { precision, .. } => {
-                    match precision {
-                        NumericType::I64 => {
-                            if let Ok(v) = field_value.parse::<i64>() {
-                                doc.add_i64(*field, v);
+                FieldDef::Numeric { precision, .. } => match precision {
+                    NumericType::I64 => {
+                        if let Ok(v) = field_value.parse::<i64>() {
+                            doc.add_i64(*field, v);
@@ -299,9 +359,12 @@ impl TantivySearch {
-                                doc.add_date(*field, DateTime::from_timestamp_millis(v));
-                            }
-                        }
-                    }
-                }
-                FieldDef::Tag { separator, case_sensitive, .. } => {
+                            doc.add_date(*field, DateTime::from_timestamp_millis(v));
+                        }
+                    }
+                },
+                FieldDef::Tag {
+                    separator,
+                    case_sensitive,
+                    ..
+                } => {
                     let tags = if !case_sensitive {
                         field_value.to_lowercase()
                     } else {
@@ -317,11 +380,17 @@ impl TantivySearch {
                     // Parse "lat,lon" format
                     let parts: Vec<&str> = field_value.split(',').collect();
                     if parts.len() == 2 {
-                        if let (Ok(lat), Ok(lon)) = (parts[0].parse::<f64>(), parts[1].parse::<f64>()) {
-                            if let Some((lat_field, _)) = self.index_schema.fields.get(&format!("{}_lat", field_name)) {
+                        if let (Ok(lat), Ok(lon)) =
+                            (parts[0].parse::<f64>(), parts[1].parse::<f64>())
+                        {
+                            if let Some((lat_field, _)) =
+                                self.index_schema.fields.get(&format!("{}_lat", field_name))
+                            {
                                 doc.add_f64(*lat_field, lat);
                             }
-                            if let Some((lon_field, _)) = self.index_schema.fields.get(&format!("{}_lon", field_name)) {
+                            if let Some((lon_field, _)) =
+                                self.index_schema.fields.get(&format!("{}_lon", field_name))
+                            {
                                 doc.add_f64(*lon_field, lon);
                             }
                         }
@@ -331,9 +400,12 @@ impl TantivySearch {
             }
         }

-        writer.add_document(doc).map_err(|e| DBError(format!("Failed to add document: {}", e)))?;
+        writer
+            .add_document(doc)
+            .map_err(|e| DBError(format!("Failed to add document: {}", e)))?;

-        writer.commit()
+        writer
+            .commit()
             .map_err(|e| DBError(format!("Failed to commit: {}", e)))?;

         Ok(())
@@ -348,15 +420,20 @@ impl TantivySearch {
         // Parse query based on search fields
         let query: Box<dyn Query> = if self.index_schema.default_search_fields.is_empty() {
-            return Err(DBError("No searchable fields defined in schema".to_string()));
+            return Err(DBError(
+                "No searchable fields defined in schema".to_string(),
+            ));
         } else {
             let query_parser = QueryParser::for_index(
                 &self.index,
                 self.index_schema.default_search_fields.clone(),
             );
-            Box::new(query_parser.parse_query(query_str)
-                .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?)
+            Box::new(
+                query_parser
+                    .parse_query(query_str)
+                    .map_err(|e| DBError(format!("Failed to parse query: {}", e)))?,
+            )
         };

         // Apply filters if any
@@ -399,16 +476,19 @@ impl TantivySearch {
         };

         // Execute search
-        let top_docs = searcher.search(
-            &*final_query,
-            &TopDocs::with_limit(options.limit + options.offset)
-        ).map_err(|e| DBError(format!("Search failed: {}", e)))?;
+        let top_docs = searcher
+            .search(
+                &*final_query,
+                &TopDocs::with_limit(options.limit + options.offset),
+            )
+            .map_err(|e| DBError(format!("Search failed: {}", e)))?;

         let total_hits = top_docs.len();
         let mut documents = Vec::new();

         for (score, doc_address) in top_docs.iter().skip(options.offset).take(options.limit) {
-            let retrieved_doc: TantivyDocument = searcher.doc(*doc_address)
+            let retrieved_doc: TantivyDocument = searcher
+                .doc(*doc_address)
                 .map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?;

             let mut doc_fields = HashMap::new();
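The pagination scheme above is unchanged by the reformat: the collector is asked for options.limit + options.offset hits, and the loop then skips the first options.offset of them. A standalone illustration of that arithmetic (the function and data are made up; only the limit/offset names mirror the diff):

    fn page<T: Clone>(ranked: &[T], limit: usize, offset: usize) -> Vec<T> {
        // Mirrors TopDocs::with_limit(limit + offset) followed by .skip(offset).take(limit).
        ranked
            .iter()
            .take(limit + offset) // what the collector would hand back
            .skip(offset)
            .take(limit)
            .cloned()
            .collect()
    }

    fn main() {
        let ranked: Vec<u32> = (1..=100).collect(); // stand-in for ranked doc addresses
        assert_eq!(page(&ranked, 10, 20), (21..=30).collect::<Vec<u32>>());
    }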
@@ -416,8 +496,7 @@ impl TantivySearch {
             // Extract all stored fields
             for (field_name, (field, field_def)) in &self.index_schema.fields {
                 match field_def {
-                    FieldDef::Text { stored, .. } |
-                    FieldDef::Tag { stored, .. } => {
+                    FieldDef::Text { stored, .. } | FieldDef::Tag { stored, .. } => {
                         if *stored {
                             if let Some(value) = retrieved_doc.get_first(*field) {
                                 if let Some(text) = value.as_str() {
@@ -426,29 +505,27 @@ impl TantivySearch {
                             }
                         }
                     }
-                    FieldDef::Numeric { stored, precision, .. } => {
+                    FieldDef::Numeric {
+                        stored, precision, ..
+                    } => {
                         if *stored {
                             let value_str = match precision {
-                                NumericType::I64 => {
-                                    retrieved_doc.get_first(*field)
-                                        .and_then(|v| v.as_i64())
-                                        .map(|v| v.to_string())
-                                }
-                                NumericType::U64 => {
-                                    retrieved_doc.get_first(*field)
-                                        .and_then(|v| v.as_u64())
-                                        .map(|v| v.to_string())
-                                }
-                                NumericType::F64 => {
-                                    retrieved_doc.get_first(*field)
-                                        .and_then(|v| v.as_f64())
-                                        .map(|v| v.to_string())
-                                }
-                                NumericType::Date => {
-                                    retrieved_doc.get_first(*field)
-                                        .and_then(|v| v.as_datetime())
-                                        .map(|v| v.into_timestamp_millis().to_string())
-                                }
+                                NumericType::I64 => retrieved_doc
+                                    .get_first(*field)
+                                    .and_then(|v| v.as_i64())
+                                    .map(|v| v.to_string()),
+                                NumericType::U64 => retrieved_doc
+                                    .get_first(*field)
+                                    .and_then(|v| v.as_u64())
+                                    .map(|v| v.to_string()),
+                                NumericType::F64 => retrieved_doc
+                                    .get_first(*field)
+                                    .and_then(|v| v.as_f64())
+                                    .map(|v| v.to_string()),
+                                NumericType::Date => retrieved_doc
+                                    .get_first(*field)
+                                    .and_then(|v| v.as_datetime())
+                                    .map(|v| v.into_timestamp_millis().to_string()),
                             };

                             if let Some(v) = value_str {
@@ -458,8 +535,18 @@ impl TantivySearch {
                     }
                     FieldDef::Geo { stored } => {
                         if *stored {
-                            let lat_field = self.index_schema.fields.get(&format!("{}_lat", field_name)).unwrap().0;
-                            let lon_field = self.index_schema.fields.get(&format!("{}_lon", field_name)).unwrap().0;
+                            let lat_field = self
+                                .index_schema
+                                .fields
+                                .get(&format!("{}_lat", field_name))
+                                .unwrap()
+                                .0;
+                            let lon_field = self
+                                .index_schema
+                                .fields
+                                .get(&format!("{}_lon", field_name))
+                                .unwrap()
+                                .0;

                             let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64());
                             let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64());
@@ -488,12 +575,15 @@ impl TantivySearch {
         let searcher = self.reader.searcher();
         let num_docs = searcher.num_docs();

-        let fields_info: Vec<FieldInfo> = self.index_schema.fields.iter().map(|(name, (_, def))| {
-            FieldInfo {
+        let fields_info: Vec<FieldInfo> = self
+            .index_schema
+            .fields
+            .iter()
+            .map(|(name, (_, def))| FieldInfo {
                 name: name.clone(),
                 field_type: format!("{:?}", def),
-            }
-        }).collect();
+            })
+            .collect();

         Ok(IndexInfo {
             name: self.name.clone(),
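Taken together, the touched methods keep their public shape: documents go in as a flat HashMap<String, String> keyed by field name (geo values as "lat,lon" strings, tags as a separator-joined string), and search parses the query against the schema's default search fields. A hypothetical caller is sketched below; the instance, method, and options names are assumptions rather than something these hunks show:

    use std::collections::HashMap;

    // Values are plain strings and are converted according to each FieldDef, as in the diff.
    let mut fields = HashMap::new();
    fields.insert("title".to_string(), "Coffee roastery".to_string());
    fields.insert("price".to_string(), "12.5".to_string()); // Numeric, F64
    fields.insert("tags".to_string(), "coffee,retail".to_string()); // Tag, split on the separator
    fields.insert("location".to_string(), "52.37,4.89".to_string()); // Geo, "lat,lon"

    search.add_document("shop:1", fields)?; // replaces any existing doc with this _id, then commits
    let results = search.search("coffee", options)?; // method and options names assumed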