lancedb_impl #15

Open
maximevanhees wants to merge 7 commits from lancedb_impl into main
3 changed files with 98 additions and 1 deletions
Showing only changes of commit 7d07b57d32 - Show all commits

View File

@@ -1325,6 +1325,7 @@ impl Cmd {
let p_lc = provider.to_lowercase();
let prov = match p_lc.as_str() {
"test-hash" | "testhash" => EmbeddingProvider::TestHash,
"testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash,
"fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed,
"openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI,
other => EmbeddingProvider::LanceOther(other.to_string()),
@@ -1346,6 +1347,7 @@ impl Cmd {
arr.push(Protocol::BulkString("provider".to_string()));
arr.push(Protocol::BulkString(match cfg.provider {
EmbeddingProvider::TestHash => "test-hash".to_string(),
EmbeddingProvider::ImageTestHash => "testimagehash".to_string(),
EmbeddingProvider::LanceFastEmbed => "lancefastembed".to_string(),
EmbeddingProvider::LanceOpenAI => "lanceopenai".to_string(),
EmbeddingProvider::LanceOther(ref s) => s.clone(),

View File

@@ -30,8 +30,10 @@ use serde_json::json;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EmbeddingProvider {
// Deterministic, local-only embedder for CI and offline development.
// Deterministic, local-only embedder for CI and offline development (text).
TestHash,
// Deterministic, local-only embedder for CI and offline development (image).
ImageTestHash,
// Placeholders for LanceDB-supported providers; implementers can add concrete backends later.
LanceFastEmbed,
LanceOpenAI,
@@ -71,6 +73,8 @@ pub trait Embedder: Send + Sync {
}
}
//// ----------------------------- TEXT: deterministic test embedder -----------------------------
/// Deterministic, no-deps, no-network embedder for CI and offline dev.
/// Algorithm:
/// - Fold bytes of UTF-8 into 'dim' buckets with a simple rolling hash
@@ -127,6 +131,77 @@ impl Embedder for TestHashEmbedder {
}
}
//// ----------------------------- IMAGE: trait + deterministic test embedder -----------------------------
/// Interface for turning raw image bytes into embedding vectors.
///
/// Kept separate from the text embedder trait so that each modality has its own
/// input type.
pub trait ImageEmbedder: Send + Sync {
    /// Returns a human-readable provider/model name.
    fn name(&self) -> String;

    /// Returns the embedding dimension.
    fn dim(&self) -> usize;

    /// Embeds a single image supplied as raw bytes.
    fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError>;

    /// Embeds a batch of images.
    ///
    /// The default implementation calls [`embed_image`](Self::embed_image) on each
    /// input in order and stops at the first error, which is returned to the caller.
    fn embed_many_images(&self, images: &[Vec<u8>]) -> Result<Vec<Vec<f32>>, DBError> {
        let mut embeddings = Vec::with_capacity(images.len());
        for image in images {
            embeddings.push(self.embed_image(image)?);
        }
        Ok(embeddings)
    }
}
/// Deterministic image embedder intended for CI and offline development.
///
/// It folds the input bytes into buckets, applies a tanh nonlinearity and
/// L2-normalizes the result.
/// NOTE: the output is NOT semantic; it is a stable, hash-like representation.
pub struct TestImageHashEmbedder {
    dim: usize,
    model_name: String,
}

impl TestImageHashEmbedder {
    /// Builds an embedder that produces vectors of length `dim` and reports
    /// `model_name` as part of its name.
    pub fn new(dim: usize, model_name: impl Into<String>) -> Self {
        let model_name = model_name.into();
        Self { dim, model_name }
    }

    /// Scales `v` to unit Euclidean length.
    ///
    /// The vector is returned unchanged when its norm is not strictly positive
    /// (for example an all-zero or empty vector), which avoids dividing by zero.
    fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
        let sum_of_squares: f32 = v.iter().map(|c| c * c).sum();
        let norm = sum_of_squares.sqrt();
        if norm > 0.0 {
            v.iter_mut().for_each(|c| *c /= norm);
        }
        v
    }
}
impl ImageEmbedder for TestImageHashEmbedder {
fn name(&self) -> String {
format!("test-image-hash:{}", self.model_name)
}
fn dim(&self) -> usize {
self.dim
}
fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError> {
// Deterministic fold across bytes with two rolling accumulators.
let mut acc = vec![0f32; self.dim];
let mut h1: u32 = 0x811C9DC5; // FNV-like
let mut h2: u32 = 0x9E3779B9; // golden ratio
for (i, b) in bytes.iter().enumerate() {
h1 ^= *b as u32;
h1 = h1.wrapping_mul(16777619u32);
// combine with position and h2
h2 = h2.wrapping_add(((i as u32).rotate_left((i % 13) as u32)) ^ h1.rotate_left((i % 7) as u32));
let idx = (h1 ^ h2) as usize % self.dim;
// Map to [-1,1] and decay with position
let val = ((*b as f32) / 127.5 - 1.0) * (1.0 / (1.0 + (i as f32 / 128.0)));
acc[idx] += val;
}
for x in &mut acc {
*x = x.tanh();
}
Ok(Self::l2_normalize(acc))
}
}
//// OpenAI embedder (supports OpenAI and Azure OpenAI via REST)
struct OpenAIEmbedder {
model: String,
@@ -320,7 +395,25 @@ pub fn create_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn Embedder>, DB
let inner = OpenAIEmbedder::new_from_config(config)?;
Ok(Arc::new(inner))
}
EmbeddingProvider::ImageTestHash => {
Err(DBError("Use create_image_embedder() for image providers".into()))
}
EmbeddingProvider::LanceFastEmbed => Err(DBError("LanceFastEmbed provider not yet implemented in Rust embedding layer; configure 'test-hash' or use 'openai'".into())),
EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Lance provider '{}' not implemented; configure 'openai' or 'test-hash'", p))),
}
}
/// Builds the image embedder selected by `config.provider`.
///
/// Only `ImageTestHash` is backed by an implementation: it yields a
/// `TestImageHashEmbedder` whose dimension is taken from the `dim` parameter
/// (512 when that lookup returns nothing) and whose model name is `config.model`.
///
/// # Errors
///
/// Returns a `DBError` for every other provider: text providers are rejected
/// as the wrong modality, and the remaining Lance providers are reported as
/// not implemented.
pub fn create_image_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn ImageEmbedder>, DBError> {
    // Each unsupported provider resolves to its error message; the single
    // supported provider returns early with the constructed embedder.
    let message: String = match &config.provider {
        EmbeddingProvider::ImageTestHash => {
            let dim = config.get_param_usize("dim").unwrap_or(512);
            let embedder = TestImageHashEmbedder::new(dim, config.model.clone());
            return Ok(Arc::new(embedder));
        }
        EmbeddingProvider::TestHash | EmbeddingProvider::LanceOpenAI => String::from(
            "Configured text provider; dataset expects image provider (e.g., 'testimagehash')",
        ),
        EmbeddingProvider::LanceFastEmbed => {
            String::from("Image provider 'lancefastembed' not yet implemented")
        }
        EmbeddingProvider::LanceOther(p) => format!(
            "Image provider '{}' not implemented; use 'testimagehash' for now",
            p
        ),
    };
    Err(DBError(message))
}

View File

@@ -996,6 +996,7 @@ impl RpcServer for RpcServerImpl {
}
let prov = match provider.to_lowercase().as_str() {
"test-hash" | "testhash" => EmbeddingProvider::TestHash,
"testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash,
"fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed,
"openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI,
other => EmbeddingProvider::LanceOther(other.to_string()),
@@ -1030,6 +1031,7 @@ impl RpcServer for RpcServerImpl {
Ok(serde_json::json!({
"provider": match cfg.provider {
EmbeddingProvider::TestHash => "test-hash",
EmbeddingProvider::ImageTestHash => "testimagehash",
EmbeddingProvider::LanceFastEmbed => "lancefastembed",
EmbeddingProvider::LanceOpenAI => "lanceopenai",
EmbeddingProvider::LanceOther(ref s) => s,