2025-04-23 04:18:28 +02:00
parent 10a7d9bb6b
commit a16ac8f627
276 changed files with 85166 additions and 1 deletions

View File

@@ -0,0 +1,65 @@
# Dedupestor
Dedupestor is a Go package that provides a key-value store with deduplication based on content hashing. Duplicate content is stored only once, and every reference to it is tracked, so repeated writes of the same bytes cost no additional space while the data remains safely shared.
## Features
- Content-based deduplication using SHA-256 hashing
- Reference tracking to maintain data integrity
- Automatic cleanup when all references to data are removed
- Size limit of 1MB per value to prevent excessive memory usage
- Persistent storage using the ourdb and radixtree packages
## Usage
```go
import (
	"github.com/freeflowuniverse/heroagent/pkg/dedupestor"
)

// Create a new dedupe store
ds, err := dedupestor.New(dedupestor.NewArgs{
	Path:  "/path/to/store",
	Reset: false, // Set to true to reset existing data
})
if err != nil {
	// Handle error
}
defer ds.Close()

// Store data with a reference
data := []byte("example data")
ref := dedupestor.Reference{Owner: 1, ID: 1}
id, err := ds.Store(data, ref)
if err != nil {
	// Handle error
}

// Retrieve data by ID
retrievedData, err := ds.Get(id)
if err != nil {
	// Handle error
}

// Check if data exists
exists := ds.IDExists(id)

// Delete a reference to the data
err = ds.Delete(id, ref)
if err != nil {
	// Handle error
}
```
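Values can also be looked up by content hash. The hash key is the hex-encoded SHA-256 of the value, so it can be computed with the standard library; a minimal sketch:

```go
import (
	"crypto/sha256"
	"encoding/hex"
)

sum := sha256.Sum256(data)
hash := hex.EncodeToString(sum[:])

if ds.HashExists(hash) {
	sameData, err := ds.GetFromHash(hash)
	if err != nil {
		// Handle error
	}
	_ = sameData
}
```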
## How It Works
1. When data is stored, a SHA-256 hash of the content is calculated
2. If the hash already exists in the store, a new reference is added to the existing data and the existing ID is returned
3. If the hash doesn't exist, the data is stored and a new reference is created
4. When a reference is deleted, it is removed from the hash's metadata
5. When the last reference to the data is deleted, the data itself is removed from storage (see the sketch below)
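
A minimal sketch of this lifecycle, assuming `ds` was created as in the usage example above:

```go
ref1 := dedupestor.Reference{Owner: 1, ID: 1}
ref2 := dedupestor.Reference{Owner: 2, ID: 1}

// Storing the same bytes twice returns the same ID; the data is written once
idA, _ := ds.Store([]byte("hello"), ref1)
idB, _ := ds.Store([]byte("hello"), ref2) // idA == idB

// The data survives until its last reference is removed
_ = ds.Delete(idA, ref1) // still retrievable: ref2 remains
_ = ds.Delete(idA, ref2) // last reference gone, data deleted from storage
_ = idB
```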
## Dependencies
- [ourdb](../ourdb): For persistent storage of the actual data
- [radixtree](../radixtree): For efficient storage and retrieval of hash-to-ID mappings

View File

@@ -0,0 +1,196 @@
// Package dedupestor provides a key-value store with deduplication based on content hashing
package dedupestor
import (
"crypto/sha256"
"encoding/hex"
"errors"
"path/filepath"
"github.com/freeflowuniverse/heroagent/pkg/data/ourdb"
"github.com/freeflowuniverse/heroagent/pkg/data/radixtree"
)
// MaxValueSize is the maximum allowed size for values (1MB)
const MaxValueSize = 1024 * 1024
// DedupeStore provides a key-value store with deduplication based on content hashing
type DedupeStore struct {
Radix *radixtree.RadixTree // For storing hash -> id mappings
Data *ourdb.OurDB // For storing the actual data
}
// NewArgs contains arguments for creating a new DedupeStore
type NewArgs struct {
Path string // Base path for the store
Reset bool // Whether to reset existing data
}
// New creates a new deduplication store
func New(args NewArgs) (*DedupeStore, error) {
// Create the radixtree for hash -> id mapping
rt, err := radixtree.New(radixtree.NewArgs{
Path: filepath.Join(args.Path, "radixtree"),
Reset: args.Reset,
})
if err != nil {
return nil, err
}
// Create the ourdb for actual data storage
config := ourdb.DefaultConfig()
config.Path = filepath.Join(args.Path, "data")
config.RecordSizeMax = MaxValueSize
config.IncrementalMode = true // We want auto-incrementing IDs
config.Reset = args.Reset
db, err := ourdb.New(config)
if err != nil {
return nil, err
}
return &DedupeStore{
Radix: rt,
Data: db,
}, nil
}
// Store stores data with its reference and returns the data's id.
// If the data already exists (same hash), the existing id is returned without storing the data again;
// the reference is appended to the hash's radixtree entry so that every owner of the data is tracked.
func (ds *DedupeStore) Store(data []byte, ref Reference) (uint32, error) {
// Check size limit
if len(data) > MaxValueSize {
return 0, errors.New("value size exceeds maximum allowed size of 1MB")
}
// Calculate SHA-256 hash of the value (using SHA-256 instead of blake2b for Go compatibility)
hash := sha256Sum(data)
// Check if this hash already exists (any lookup error is treated as "hash not present")
metadataBytes, err := ds.Radix.Get(hash)
if err == nil {
// Value already exists, add new ref & return the id
metadata := BytesToMetadata(metadataBytes)
metadata, err = metadata.AddReference(ref)
if err != nil {
return 0, err
}
err = ds.Radix.Update(hash, metadata.ToBytes())
if err != nil {
return 0, err
}
return metadata.ID, nil
}
// Store the actual data in ourdb
id, err := ds.Data.Set(ourdb.OurDBSetArgs{
Data: data,
})
if err != nil {
return 0, err
}
metadata := Metadata{
ID: id,
References: []Reference{ref},
}
// Store the mapping of hash -> id in radixtree
err = ds.Radix.Set(hash, metadata.ToBytes())
if err != nil {
return 0, err
}
return id, nil
}
// Get retrieves a value by its ID
func (ds *DedupeStore) Get(id uint32) ([]byte, error) {
return ds.Data.Get(id)
}
// GetFromHash retrieves a value by its hash
func (ds *DedupeStore) GetFromHash(hash string) ([]byte, error) {
// Get the ID from radixtree
metadataBytes, err := ds.Radix.Get(hash)
if err != nil {
return nil, err
}
// Convert bytes back to metadata
metadata := BytesToMetadata(metadataBytes)
// Get the actual data from ourdb
return ds.Data.Get(metadata.ID)
}
// IDExists checks if a value with the given ID exists
func (ds *DedupeStore) IDExists(id uint32) bool {
_, err := ds.Data.Get(id)
return err == nil
}
// HashExists checks if a value with the given hash exists
func (ds *DedupeStore) HashExists(hash string) bool {
_, err := ds.Radix.Get(hash)
return err == nil
}
// Delete removes a reference from the hash entry
// If it's the last reference, removes the hash entry and its data
func (ds *DedupeStore) Delete(id uint32, ref Reference) error {
// Get the data to calculate its hash
data, err := ds.Data.Get(id)
if err != nil {
return err
}
// Calculate hash of the value
hash := sha256Sum(data)
// Get the current entry from radixtree
metadataBytes, err := ds.Radix.Get(hash)
if err != nil {
return err
}
metadata := BytesToMetadata(metadataBytes)
metadata, err = metadata.RemoveReference(ref)
if err != nil {
return err
}
if len(metadata.References) == 0 {
// Delete from radixtree
err = ds.Radix.Delete(hash)
if err != nil {
return err
}
// Delete from data db
return ds.Data.Delete(id)
}
// Update hash metadata
return ds.Radix.Update(hash, metadata.ToBytes())
}
// Close closes both underlying stores, returning the first error encountered
func (ds *DedupeStore) Close() error {
err1 := ds.Radix.Close()
err2 := ds.Data.Close()
if err1 != nil {
return err1
}
return err2
}
// sha256Sum calculates the SHA-256 hash of data and returns it as a hex string
func sha256Sum(data []byte) string {
hash := sha256.Sum256(data)
return hex.EncodeToString(hash[:])
}

View File

@@ -0,0 +1,532 @@
package dedupestor
import (
"bytes"
"os"
"path/filepath"
"testing"
)
func setupTest(t *testing.T) {
// Ensure test directories exist and are clean
testDirs := []string{
"/tmp/dedupestor_test",
"/tmp/dedupestor_test_size",
"/tmp/dedupestor_test_exists",
"/tmp/dedupestor_test_multiple",
"/tmp/dedupestor_test_refs",
}
for _, dir := range testDirs {
if _, err := os.Stat(dir); err == nil {
err := os.RemoveAll(dir)
if err != nil {
t.Fatalf("Failed to remove test directory %s: %v", dir, err)
}
}
err := os.MkdirAll(dir, 0755)
if err != nil {
t.Fatalf("Failed to create test directory %s: %v", dir, err)
}
}
}
func TestBasicOperations(t *testing.T) {
setupTest(t)
ds, err := New(NewArgs{
Path: "/tmp/dedupestor_test",
Reset: true,
})
if err != nil {
t.Fatalf("Failed to create dedupe store: %v", err)
}
defer ds.Close()
// Test storing and retrieving data
value1 := []byte("test data 1")
ref1 := Reference{Owner: 1, ID: 1}
id1, err := ds.Store(value1, ref1)
if err != nil {
t.Fatalf("Failed to store data: %v", err)
}
retrieved1, err := ds.Get(id1)
if err != nil {
t.Fatalf("Failed to retrieve data: %v", err)
}
if !bytes.Equal(retrieved1, value1) {
t.Fatalf("Retrieved data doesn't match stored data")
}
// Test deduplication with different reference
ref2 := Reference{Owner: 1, ID: 2}
id2, err := ds.Store(value1, ref2)
if err != nil {
t.Fatalf("Failed to store data with second reference: %v", err)
}
if id1 != id2 {
t.Fatalf("Expected same ID for duplicate data, got %d and %d", id1, id2)
}
// Test different data gets different ID
value2 := []byte("test data 2")
ref3 := Reference{Owner: 1, ID: 3}
id3, err := ds.Store(value2, ref3)
if err != nil {
t.Fatalf("Failed to store different data: %v", err)
}
if id1 == id3 {
t.Fatalf("Expected different IDs for different data, got %d for both", id1)
}
retrieved2, err := ds.Get(id3)
if err != nil {
t.Fatalf("Failed to retrieve second data: %v", err)
}
if !bytes.Equal(retrieved2, value2) {
t.Fatalf("Retrieved data doesn't match second stored data")
}
}
func TestSizeLimit(t *testing.T) {
setupTest(t)
ds, err := New(NewArgs{
Path: "/tmp/dedupestor_test_size",
Reset: true,
})
if err != nil {
t.Fatalf("Failed to create dedupe store: %v", err)
}
defer ds.Close()
// Test data under size limit (1KB)
smallData := make([]byte, 1024)
for i := range smallData {
smallData[i] = byte(i % 256)
}
ref := Reference{Owner: 1, ID: 1}
smallID, err := ds.Store(smallData, ref)
if err != nil {
t.Fatalf("Failed to store small data: %v", err)
}
retrieved, err := ds.Get(smallID)
if err != nil {
t.Fatalf("Failed to retrieve small data: %v", err)
}
if !bytes.Equal(retrieved, smallData) {
t.Fatalf("Retrieved data doesn't match stored small data")
}
// Test data over size limit (2MB)
largeData := make([]byte, 2*1024*1024)
for i := range largeData {
largeData[i] = byte(i % 256)
}
_, err = ds.Store(largeData, ref)
if err == nil {
t.Fatalf("Expected error for data exceeding size limit")
}
}
func TestExists(t *testing.T) {
setupTest(t)
ds, err := New(NewArgs{
Path: "/tmp/dedupestor_test_exists",
Reset: true,
})
if err != nil {
t.Fatalf("Failed to create dedupe store: %v", err)
}
defer ds.Close()
value := []byte("test data")
ref := Reference{Owner: 1, ID: 1}
id, err := ds.Store(value, ref)
if err != nil {
t.Fatalf("Failed to store data: %v", err)
}
if !ds.IDExists(id) {
t.Fatalf("IDExists returned false for existing ID")
}
if ds.IDExists(99) {
t.Fatalf("IDExists returned true for non-existent ID")
}
// Calculate hash to test HashExists
data, err := ds.Get(id)
if err != nil {
t.Fatalf("Failed to get data: %v", err)
}
hash := sha256Sum(data)
if !ds.HashExists(hash) {
t.Fatalf("HashExists returned false for existing hash")
}
if ds.HashExists("nonexistenthash") {
t.Fatalf("HashExists returned true for non-existent hash")
}
}
func TestMultipleOperations(t *testing.T) {
setupTest(t)
ds, err := New(NewArgs{
Path: "/tmp/dedupestor_test_multiple",
Reset: true,
})
if err != nil {
t.Fatalf("Failed to create dedupe store: %v", err)
}
defer ds.Close()
// Store multiple values
values := [][]byte{}
ids := []uint32{}
for i := 0; i < 5; i++ {
value := []byte("test data " + string(rune('0'+i)))
values = append(values, value)
ref := Reference{Owner: 1, ID: uint32(i)}
id, err := ds.Store(value, ref)
if err != nil {
t.Fatalf("Failed to store data %d: %v", i, err)
}
ids = append(ids, id)
}
// Verify all values can be retrieved
for i, id := range ids {
retrieved, err := ds.Get(id)
if err != nil {
t.Fatalf("Failed to retrieve data %d: %v", i, err)
}
if !bytes.Equal(retrieved, values[i]) {
t.Fatalf("Retrieved data %d doesn't match stored data", i)
}
}
// Test deduplication by storing same values again
for i, value := range values {
ref := Reference{Owner: 2, ID: uint32(i)}
id, err := ds.Store(value, ref)
if err != nil {
t.Fatalf("Failed to store duplicate data %d: %v", i, err)
}
if id != ids[i] {
t.Fatalf("Expected same ID for duplicate data %d, got %d and %d", i, ids[i], id)
}
}
}
func TestReferences(t *testing.T) {
setupTest(t)
ds, err := New(NewArgs{
Path: "/tmp/dedupestor_test_refs",
Reset: true,
})
if err != nil {
t.Fatalf("Failed to create dedupe store: %v", err)
}
defer ds.Close()
// Store same data with different references
value := []byte("test data")
ref1 := Reference{Owner: 1, ID: 1}
ref2 := Reference{Owner: 1, ID: 2}
ref3 := Reference{Owner: 2, ID: 1}
// Store with first reference
id, err := ds.Store(value, ref1)
if err != nil {
t.Fatalf("Failed to store data with first reference: %v", err)
}
// Store same data with second reference
id2, err := ds.Store(value, ref2)
if err != nil {
t.Fatalf("Failed to store data with second reference: %v", err)
}
if id != id2 {
t.Fatalf("Expected same ID for same data, got %d and %d", id, id2)
}
// Store same data with third reference
id3, err := ds.Store(value, ref3)
if err != nil {
t.Fatalf("Failed to store data with third reference: %v", err)
}
if id != id3 {
t.Fatalf("Expected same ID for same data, got %d and %d", id, id3)
}
// Delete first reference - data should still exist
err = ds.Delete(id, ref1)
if err != nil {
t.Fatalf("Failed to delete first reference: %v", err)
}
if !ds.IDExists(id) {
t.Fatalf("Data should still exist after deleting first reference")
}
// Delete second reference - data should still exist
err = ds.Delete(id, ref2)
if err != nil {
t.Fatalf("Failed to delete second reference: %v", err)
}
if !ds.IDExists(id) {
t.Fatalf("Data should still exist after deleting second reference")
}
// Delete last reference - data should be gone
err = ds.Delete(id, ref3)
if err != nil {
t.Fatalf("Failed to delete third reference: %v", err)
}
if ds.IDExists(id) {
t.Fatalf("Data should be deleted after removing all references")
}
// Verify data is actually deleted by trying to get it
_, err = ds.Get(id)
if err == nil {
t.Fatalf("Expected error getting deleted data")
}
}
func TestMetadataConversion(t *testing.T) {
// Test Reference conversion
ref := Reference{
Owner: 12345,
ID: 67890,
}
encoded := ref.ToBytes()
recovered := BytesToReference(encoded)
if ref.Owner != recovered.Owner || ref.ID != recovered.ID {
t.Fatalf("Reference conversion failed: original %+v, recovered %+v", ref, recovered)
}
// Test Metadata conversion
metadata := Metadata{
ID: 42,
References: []Reference{},
}
ref1 := Reference{Owner: 1, ID: 100}
ref2 := Reference{Owner: 2, ID: 200}
metadata, err := metadata.AddReference(ref1)
if err != nil {
t.Fatalf("Failed to add reference: %v", err)
}
metadata, err = metadata.AddReference(ref2)
if err != nil {
t.Fatalf("Failed to add reference: %v", err)
}
encoded = metadata.ToBytes()
recovered2 := BytesToMetadata(encoded)
if metadata.ID != recovered2.ID || len(metadata.References) != len(recovered2.References) {
t.Fatalf("Metadata conversion failed: original %+v, recovered %+v", metadata, recovered2)
}
for i, ref := range metadata.References {
if ref.Owner != recovered2.References[i].Owner || ref.ID != recovered2.References[i].ID {
t.Fatalf("Reference in metadata conversion failed at index %d", i)
}
}
}
func TestAddRemoveReference(t *testing.T) {
metadata := Metadata{
ID: 1,
References: []Reference{},
}
ref1 := Reference{Owner: 1, ID: 100}
ref2 := Reference{Owner: 2, ID: 200}
// Add first reference
metadata, err := metadata.AddReference(ref1)
if err != nil {
t.Fatalf("Failed to add first reference: %v", err)
}
if len(metadata.References) != 1 {
t.Fatalf("Expected 1 reference after adding first, got %d", len(metadata.References))
}
if metadata.References[0].Owner != ref1.Owner || metadata.References[0].ID != ref1.ID {
t.Fatalf("First reference not added correctly")
}
// Add second reference
metadata, err = metadata.AddReference(ref2)
if err != nil {
t.Fatalf("Failed to add second reference: %v", err)
}
if len(metadata.References) != 2 {
t.Fatalf("Expected 2 references after adding second, got %d", len(metadata.References))
}
// Try adding duplicate reference
metadata, err = metadata.AddReference(ref1)
if err != nil {
t.Fatalf("Failed to add duplicate reference: %v", err)
}
if len(metadata.References) != 2 {
t.Fatalf("Expected 2 references after adding duplicate, got %d", len(metadata.References))
}
// Remove first reference
metadata, err = metadata.RemoveReference(ref1)
if err != nil {
t.Fatalf("Failed to remove first reference: %v", err)
}
if len(metadata.References) != 1 {
t.Fatalf("Expected 1 reference after removing first, got %d", len(metadata.References))
}
if metadata.References[0].Owner != ref2.Owner || metadata.References[0].ID != ref2.ID {
t.Fatalf("Wrong reference removed")
}
// Remove non-existent reference
metadata, err = metadata.RemoveReference(Reference{Owner: 999, ID: 999})
if err != nil {
t.Fatalf("Failed to remove non-existent reference: %v", err)
}
if len(metadata.References) != 1 {
t.Fatalf("Expected 1 reference after removing non-existent, got %d", len(metadata.References))
}
// Remove last reference
metadata, err = metadata.RemoveReference(ref2)
if err != nil {
t.Fatalf("Failed to remove last reference: %v", err)
}
if len(metadata.References) != 0 {
t.Fatalf("Expected 0 references after removing last, got %d", len(metadata.References))
}
}
func TestEmptyMetadataBytes(t *testing.T) {
empty := BytesToMetadata([]byte{})
if empty.ID != 0 || len(empty.References) != 0 {
t.Fatalf("Expected empty metadata, got %+v", empty)
}
}
func TestDeduplicationSize(t *testing.T) {
testDir := "/tmp/dedupestor_test_dedup_size"
// Clean up the test directory
if _, err := os.Stat(testDir); err == nil {
	if err := os.RemoveAll(testDir); err != nil {
		t.Fatalf("Failed to remove test directory %s: %v", testDir, err)
	}
}
if err := os.MkdirAll(testDir, 0755); err != nil {
	t.Fatalf("Failed to create test directory %s: %v", testDir, err)
}
// Create a new dedupe store
ds, err := New(NewArgs{
Path: testDir,
Reset: true,
})
if err != nil {
t.Fatalf("Failed to create dedupe store: %v", err)
}
defer ds.Close()
// Store a large piece of data (100KB)
largeData := make([]byte, 100*1024)
for i := range largeData {
largeData[i] = byte(i % 256)
}
// Store the data with first reference
ref1 := Reference{Owner: 1, ID: 1}
id1, err := ds.Store(largeData, ref1)
if err != nil {
t.Fatalf("Failed to store data with first reference: %v", err)
}
// Get the size of the data directory after first store
dataDir := filepath.Join(testDir, "data")
sizeAfterFirst, err := getDirSize(dataDir)
if err != nil {
t.Fatalf("Failed to get directory size: %v", err)
}
t.Logf("Size after first store: %d bytes", sizeAfterFirst)
// Store the same data with different references multiple times
for i := 2; i <= 10; i++ {
ref := Reference{Owner: uint16(i), ID: uint32(i)}
id, err := ds.Store(largeData, ref)
if err != nil {
t.Fatalf("Failed to store data with reference %d: %v", i, err)
}
// Verify we get the same ID (deduplication is working)
if id != id1 {
t.Fatalf("Expected same ID for duplicate data, got %d and %d", id1, id)
}
}
// Get the size after storing the same data multiple times
sizeAfterMultiple, err := getDirSize(dataDir)
if err != nil {
t.Fatalf("Failed to get directory size: %v", err)
}
t.Logf("Size after storing same data 10 times: %d bytes", sizeAfterMultiple)
// The size should be approximately the same (allowing for metadata overhead)
// We'll check that it hasn't grown significantly (less than 10% increase)
if sizeAfterMultiple > sizeAfterFirst*110/100 {
t.Fatalf("Directory size grew significantly after storing duplicate data: %d -> %d bytes",
sizeAfterFirst, sizeAfterMultiple)
}
// Now store different data
differentData := make([]byte, 100*1024)
for i := range differentData {
differentData[i] = byte((i + 128) % 256) // Different pattern
}
ref11 := Reference{Owner: 11, ID: 11}
_, err = ds.Store(differentData, ref11)
if err != nil {
t.Fatalf("Failed to store different data: %v", err)
}
// Get the size after storing different data
sizeAfterDifferent, err := getDirSize(dataDir)
if err != nil {
t.Fatalf("Failed to get directory size: %v", err)
}
t.Logf("Size after storing different data: %d bytes", sizeAfterDifferent)
// The size should have increased significantly
if sizeAfterDifferent <= sizeAfterMultiple*110/100 {
t.Fatalf("Directory size didn't grow as expected after storing different data: %d -> %d bytes",
sizeAfterMultiple, sizeAfterDifferent)
}
}
// getDirSize returns the total size of all files in a directory in bytes
func getDirSize(path string) (int64, error) {
var size int64
err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
if err != nil {
return err
}
if !info.IsDir() {
size += info.Size()
}
return nil
})
return size, err
}

View File

@@ -0,0 +1,123 @@
// Package dedupestor provides a key-value store with deduplication based on content hashing
package dedupestor
import (
"encoding/binary"
)
// Metadata represents a stored value with its ID and references
type Metadata struct {
ID uint32 // ID of the stored data in the database
References []Reference // List of references to this data
}
// Reference represents a reference to stored data
type Reference struct {
Owner uint16 // Owner identifier
ID uint32 // Reference identifier
}
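// Serialization layout (little-endian), as implemented by ToBytes/BytesToMetadata below:
//
//	bytes 0..3  Metadata.ID (uint32)
//	bytes 4..   one 6-byte record per Reference:
//	            bytes 0..1 Owner (uint16), bytes 2..5 ID (uint32)
//
// For example, Metadata{ID: 1, References: []Reference{{Owner: 2, ID: 3}}}
// encodes to the 10 bytes 01 00 00 00 02 00 03 00 00 00.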
// ToBytes converts Metadata to bytes for storage
func (m Metadata) ToBytes() []byte {
// Calculate size: 4 bytes for ID + 6 bytes per reference
size := 4 + (len(m.References) * 6)
result := make([]byte, size)
// Write ID (4 bytes)
binary.LittleEndian.PutUint32(result[0:4], m.ID)
// Write references (6 bytes each)
offset := 4
for _, ref := range m.References {
refBytes := ref.ToBytes()
copy(result[offset:offset+6], refBytes)
offset += 6
}
return result
}
// BytesToMetadata converts bytes back to Metadata
func BytesToMetadata(b []byte) Metadata {
if len(b) < 4 {
return Metadata{
ID: 0,
References: []Reference{},
}
}
id := binary.LittleEndian.Uint32(b[0:4])
refs := []Reference{}
// Parse references (each reference is 6 bytes)
for i := 4; i < len(b); i += 6 {
if i+6 <= len(b) {
refs = append(refs, BytesToReference(b[i:i+6]))
}
}
return Metadata{
ID: id,
References: refs,
}
}
// AddReference adds a new reference if it doesn't already exist
func (m Metadata) AddReference(ref Reference) (Metadata, error) {
// Check if reference already exists
for _, existing := range m.References {
if existing.Owner == ref.Owner && existing.ID == ref.ID {
return m, nil
}
}
// Add the new reference
newRefs := append(m.References, ref)
return Metadata{
ID: m.ID,
References: newRefs,
}, nil
}
// RemoveReference removes a reference if it exists
func (m Metadata) RemoveReference(ref Reference) (Metadata, error) {
newRefs := []Reference{}
for _, existing := range m.References {
if existing.Owner != ref.Owner || existing.ID != ref.ID {
newRefs = append(newRefs, existing)
}
}
return Metadata{
ID: m.ID,
References: newRefs,
}, nil
}
// ToBytes converts Reference to bytes
func (r Reference) ToBytes() []byte {
result := make([]byte, 6)
// Write owner (2 bytes)
binary.LittleEndian.PutUint16(result[0:2], r.Owner)
// Write ID (4 bytes)
binary.LittleEndian.PutUint32(result[2:6], r.ID)
return result
}
// BytesToReference converts bytes to Reference
func BytesToReference(b []byte) Reference {
if len(b) < 6 {
return Reference{}
}
owner := binary.LittleEndian.Uint16(b[0:2])
id := binary.LittleEndian.Uint32(b[2:6])
return Reference{
Owner: owner,
ID: id,
}
}