diff --git a/src/cmd.rs b/src/cmd.rs index f9fdd25..66c63a6 100644 --- a/src/cmd.rs +++ b/src/cmd.rs @@ -1325,6 +1325,7 @@ impl Cmd { let p_lc = provider.to_lowercase(); let prov = match p_lc.as_str() { "test-hash" | "testhash" => EmbeddingProvider::TestHash, + "testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash, "fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed, "openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI, other => EmbeddingProvider::LanceOther(other.to_string()), @@ -1346,6 +1347,7 @@ impl Cmd { arr.push(Protocol::BulkString("provider".to_string())); arr.push(Protocol::BulkString(match cfg.provider { EmbeddingProvider::TestHash => "test-hash".to_string(), + EmbeddingProvider::ImageTestHash => "testimagehash".to_string(), EmbeddingProvider::LanceFastEmbed => "lancefastembed".to_string(), EmbeddingProvider::LanceOpenAI => "lanceopenai".to_string(), EmbeddingProvider::LanceOther(ref s) => s.clone(), diff --git a/src/embedding.rs b/src/embedding.rs index c9b8b1b..36e2e99 100644 --- a/src/embedding.rs +++ b/src/embedding.rs @@ -30,8 +30,10 @@ use serde_json::json; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum EmbeddingProvider { - // Deterministic, local-only embedder for CI and offline development. + // Deterministic, local-only embedder for CI and offline development (text). TestHash, + // Deterministic, local-only embedder for CI and offline development (image). + ImageTestHash, // Placeholders for LanceDB-supported providers; implementers can add concrete backends later. LanceFastEmbed, LanceOpenAI, @@ -71,6 +73,8 @@ pub trait Embedder: Send + Sync { } } +//// ----------------------------- TEXT: deterministic test embedder ----------------------------- + /// Deterministic, no-deps, no-network embedder for CI and offline dev. 
/// Algorithm: /// - Fold bytes of UTF-8 into 'dim' buckets with a simple rolling hash @@ -127,6 +131,77 @@ impl Embedder for TestHashEmbedder { } } +//// ----------------------------- IMAGE: trait + deterministic test embedder ----------------------------- + +/// Image embedding interface (separate from text to keep modality-specific inputs). +pub trait ImageEmbedder: Send + Sync { + /// Human-readable provider/model name + fn name(&self) -> String; + /// Embedding dimension + fn dim(&self) -> usize; + /// Embed a single image (raw bytes) + fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError>; + /// Embed many images; default maps embed_image() over inputs + fn embed_many_images(&self, images: &[Vec<u8>]) -> Result<Vec<Vec<f32>>, DBError> { + images.iter().map(|b| self.embed_image(b)).collect() + } +} + +/// Deterministic image embedder that folds bytes into buckets, applies tanh-like nonlinearity, +/// and L2-normalizes. Suitable for CI and offline development. +/// NOTE: This is NOT semantic; it is a stable hash-like representation. +pub struct TestImageHashEmbedder { + dim: usize, + model_name: String, +} + +impl TestImageHashEmbedder { + pub fn new(dim: usize, model_name: impl Into<String>) -> Self { + Self { dim, model_name: model_name.into() } + } + + fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> { + let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt(); + if norm > 0.0 { + for x in &mut v { + *x /= norm; + } + } + v + } +} + +impl ImageEmbedder for TestImageHashEmbedder { + fn name(&self) -> String { + format!("test-image-hash:{}", self.model_name) + } + + fn dim(&self) -> usize { + self.dim + } + + fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError> { + // Deterministic fold across bytes with two rolling accumulators. 
+ let mut acc = vec![0f32; self.dim]; + let mut h1: u32 = 0x811C9DC5; // FNV-like + let mut h2: u32 = 0x9E3779B9; // golden ratio + for (i, b) in bytes.iter().enumerate() { + h1 ^= *b as u32; + h1 = h1.wrapping_mul(16777619u32); + // combine with position and h2 + h2 = h2.wrapping_add(((i as u32).rotate_left((i % 13) as u32)) ^ h1.rotate_left((i % 7) as u32)); + let idx = (h1 ^ h2) as usize % self.dim; + // Map to [-1,1] and decay with position + let val = ((*b as f32) / 127.5 - 1.0) * (1.0 / (1.0 + (i as f32 / 128.0))); + acc[idx] += val; + } + for x in &mut acc { + *x = x.tanh(); + } + Ok(Self::l2_normalize(acc)) + } +} + //// OpenAI embedder (supports OpenAI and Azure OpenAI via REST) struct OpenAIEmbedder { model: String, @@ -320,7 +395,25 @@ pub fn create_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn Embedder>, DB let inner = OpenAIEmbedder::new_from_config(config)?; Ok(Arc::new(inner)) } + EmbeddingProvider::ImageTestHash => { + Err(DBError("Use create_image_embedder() for image providers".into())) + } EmbeddingProvider::LanceFastEmbed => Err(DBError("LanceFastEmbed provider not yet implemented in Rust embedding layer; configure 'test-hash' or use 'openai'".into())), EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Lance provider '{}' not implemented; configure 'openai' or 'test-hash'", p))), } +} + +/// Create an image embedder instance from a config. 
+pub fn create_image_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn ImageEmbedder>, DBError> { + match &config.provider { + EmbeddingProvider::ImageTestHash => { + let dim = config.get_param_usize("dim").unwrap_or(512); + Ok(Arc::new(TestImageHashEmbedder::new(dim, config.model.clone()))) + } + EmbeddingProvider::TestHash | EmbeddingProvider::LanceOpenAI => { + Err(DBError("Configured text provider; dataset expects image provider (e.g., 'testimagehash')".into())) + } + EmbeddingProvider::LanceFastEmbed => Err(DBError("Image provider 'lancefastembed' not yet implemented".into())), + EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Image provider '{}' not implemented; use 'testimagehash' for now", p))), + } } \ No newline at end of file diff --git a/src/rpc.rs b/src/rpc.rs index 50dcaee..b589efd 100644 --- a/src/rpc.rs +++ b/src/rpc.rs @@ -996,6 +996,7 @@ impl RpcServer for RpcServerImpl { } let prov = match provider.to_lowercase().as_str() { "test-hash" | "testhash" => EmbeddingProvider::TestHash, + "testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash, "fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed, "openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI, other => EmbeddingProvider::LanceOther(other.to_string()), @@ -1030,6 +1031,7 @@ impl RpcServer for RpcServerImpl { Ok(serde_json::json!({ "provider": match cfg.provider { EmbeddingProvider::TestHash => "test-hash", + EmbeddingProvider::ImageTestHash => "testimagehash", EmbeddingProvider::LanceFastEmbed => "lancefastembed", EmbeddingProvider::LanceOpenAI => "lanceopenai", EmbeddingProvider::LanceOther(ref s) => s,