|
|
|
@@ -30,8 +30,10 @@ use serde_json::json;
|
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
|
|
|
#[serde(rename_all = "snake_case")]
|
|
|
|
|
pub enum EmbeddingProvider {
|
|
|
|
|
// Deterministic, local-only embedder for CI and offline development.
|
|
|
|
|
// Deterministic, local-only embedder for CI and offline development (text).
|
|
|
|
|
TestHash,
|
|
|
|
|
// Deterministic, local-only embedder for CI and offline development (image).
|
|
|
|
|
ImageTestHash,
|
|
|
|
|
// Placeholders for LanceDB-supported providers; implementers can add concrete backends later.
|
|
|
|
|
LanceFastEmbed,
|
|
|
|
|
LanceOpenAI,
|
|
|
|
@@ -71,6 +73,8 @@ pub trait Embedder: Send + Sync {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//// ----------------------------- TEXT: deterministic test embedder -----------------------------
|
|
|
|
|
|
|
|
|
|
/// Deterministic, no-deps, no-network embedder for CI and offline dev.
|
|
|
|
|
/// Algorithm:
|
|
|
|
|
/// - Fold bytes of UTF-8 into 'dim' buckets with a simple rolling hash
|
|
|
|
@@ -127,6 +131,77 @@ impl Embedder for TestHashEmbedder {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//// ----------------------------- IMAGE: trait + deterministic test embedder -----------------------------
|
|
|
|
|
|
|
|
|
|
/// Image embedding interface (separate from text to keep modality-specific inputs).
|
|
|
|
|
pub trait ImageEmbedder: Send + Sync {
|
|
|
|
|
/// Human-readable provider/model name
|
|
|
|
|
fn name(&self) -> String;
|
|
|
|
|
/// Embedding dimension
|
|
|
|
|
fn dim(&self) -> usize;
|
|
|
|
|
/// Embed a single image (raw bytes)
|
|
|
|
|
fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError>;
|
|
|
|
|
/// Embed many images; default maps embed_image() over inputs
|
|
|
|
|
fn embed_many_images(&self, images: &[Vec<u8>]) -> Result<Vec<Vec<f32>>, DBError> {
|
|
|
|
|
images.iter().map(|b| self.embed_image(b)).collect()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Stable, hash-style image embedder intended for CI and offline development.
///
/// Input bytes are folded into buckets, squashed with `tanh`, and the result
/// is L2-normalized.
/// NOTE: the output is NOT semantic; it is only a reproducible fingerprint of
/// the input bytes.
pub struct TestImageHashEmbedder {
    /// Number of components in every produced vector.
    dim: usize,
    /// Label reported as part of the embedder name.
    model_name: String,
}

impl TestImageHashEmbedder {
    /// Builds an embedder that produces `dim`-component vectors and is
    /// labelled with `model_name`.
    pub fn new(dim: usize, model_name: impl Into<String>) -> Self {
        let model_name = model_name.into();
        Self { dim, model_name }
    }

    /// Scales `v` to unit Euclidean length.
    ///
    /// A vector whose norm is not strictly positive (for example, all zeros)
    /// is returned unchanged.
    fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
        let squared_sum: f32 = v.iter().map(|x| x * x).sum();
        let norm = squared_sum.sqrt();
        if norm > 0.0 {
            v.iter_mut().for_each(|component| *component /= norm);
        }
        v
    }
}
|
|
|
|
|
|
|
|
|
|
impl ImageEmbedder for TestImageHashEmbedder {
|
|
|
|
|
fn name(&self) -> String {
|
|
|
|
|
format!("test-image-hash:{}", self.model_name)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn dim(&self) -> usize {
|
|
|
|
|
self.dim
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError> {
|
|
|
|
|
// Deterministic fold across bytes with two rolling accumulators.
|
|
|
|
|
let mut acc = vec![0f32; self.dim];
|
|
|
|
|
let mut h1: u32 = 0x811C9DC5; // FNV-like
|
|
|
|
|
let mut h2: u32 = 0x9E3779B9; // golden ratio
|
|
|
|
|
for (i, b) in bytes.iter().enumerate() {
|
|
|
|
|
h1 ^= *b as u32;
|
|
|
|
|
h1 = h1.wrapping_mul(16777619u32);
|
|
|
|
|
// combine with position and h2
|
|
|
|
|
h2 = h2.wrapping_add(((i as u32).rotate_left((i % 13) as u32)) ^ h1.rotate_left((i % 7) as u32));
|
|
|
|
|
let idx = (h1 ^ h2) as usize % self.dim;
|
|
|
|
|
// Map to [-1,1] and decay with position
|
|
|
|
|
let val = ((*b as f32) / 127.5 - 1.0) * (1.0 / (1.0 + (i as f32 / 128.0)));
|
|
|
|
|
acc[idx] += val;
|
|
|
|
|
}
|
|
|
|
|
for x in &mut acc {
|
|
|
|
|
*x = x.tanh();
|
|
|
|
|
}
|
|
|
|
|
Ok(Self::l2_normalize(acc))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//// OpenAI embedder (supports OpenAI and Azure OpenAI via REST)
|
|
|
|
|
struct OpenAIEmbedder {
|
|
|
|
|
model: String,
|
|
|
|
@@ -320,7 +395,25 @@ pub fn create_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn Embedder>, DB
|
|
|
|
|
let inner = OpenAIEmbedder::new_from_config(config)?;
|
|
|
|
|
Ok(Arc::new(inner))
|
|
|
|
|
}
|
|
|
|
|
EmbeddingProvider::ImageTestHash => {
|
|
|
|
|
Err(DBError("Use create_image_embedder() for image providers".into()))
|
|
|
|
|
}
|
|
|
|
|
EmbeddingProvider::LanceFastEmbed => Err(DBError("LanceFastEmbed provider not yet implemented in Rust embedding layer; configure 'test-hash' or use 'openai'".into())),
|
|
|
|
|
EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Lance provider '{}' not implemented; configure 'openai' or 'test-hash'", p))),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Create an image embedder instance from a config.
|
|
|
|
|
pub fn create_image_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn ImageEmbedder>, DBError> {
|
|
|
|
|
match &config.provider {
|
|
|
|
|
EmbeddingProvider::ImageTestHash => {
|
|
|
|
|
let dim = config.get_param_usize("dim").unwrap_or(512);
|
|
|
|
|
Ok(Arc::new(TestImageHashEmbedder::new(dim, config.model.clone())))
|
|
|
|
|
}
|
|
|
|
|
EmbeddingProvider::TestHash | EmbeddingProvider::LanceOpenAI => {
|
|
|
|
|
Err(DBError("Configured text provider; dataset expects image provider (e.g., 'testimagehash')".into()))
|
|
|
|
|
}
|
|
|
|
|
EmbeddingProvider::LanceFastEmbed => Err(DBError("Image provider 'lancefastembed' not yet implemented".into())),
|
|
|
|
|
EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Image provider '{}' not implemented; use 'testimagehash' for now", p))),
|
|
|
|
|
}
|
|
|
|
|
}
|