...
65  pkg/data/dedupestor/README.md  Normal file
@@ -0,0 +1,65 @@
# Dedupestor

Dedupestor is a Go package that provides a key-value store with deduplication based on content hashing. Duplicate content is stored only once, and references to it are tracked, so the data remains available until its last reference is removed.

## Features

- Content-based deduplication using SHA-256 hashing
- Reference tracking to maintain data integrity
- Automatic cleanup when all references to data are removed
- A 1MB per-value size limit to prevent excessive memory usage
- Persistent storage using the ourdb and radixtree packages

## Usage

```go
import (
	"github.com/freeflowuniverse/heroagent/pkg/data/dedupestor"
)

// Create a new dedupe store
ds, err := dedupestor.New(dedupestor.NewArgs{
	Path:  "/path/to/store",
	Reset: false, // Set to true to reset existing data
})
if err != nil {
	// Handle error
}
defer ds.Close()

// Store data with a reference
data := []byte("example data")
ref := dedupestor.Reference{Owner: 1, ID: 1}
id, err := ds.Store(data, ref)
if err != nil {
	// Handle error
}

// Retrieve data by ID
retrievedData, err := ds.Get(id)
if err != nil {
	// Handle error
}

// Check if data exists
exists := ds.IDExists(id)

// Delete a reference to the data; the data itself is removed
// once its last reference is deleted
err = ds.Delete(id, ref)
if err != nil {
	// Handle error
}
```

## How It Works

1. When data is stored, a SHA-256 hash is calculated for the content
2. If the hash already exists in the store, a new reference is added to the existing data (see the lookup sketch below)
3. If the hash doesn't exist, the data is stored and a new reference is created
4. When a reference is deleted, it's removed from the metadata
5. When the last reference to data is deleted, the data itself is removed from storage

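The store keys content by its hex-encoded SHA-256 hash, so existing data can also be queried by hash rather than by id. A minimal sketch, reusing `ds` and `data` from the usage example above:

```go
// Compute the hex-encoded SHA-256 hash the store uses as its key
// (requires "crypto/sha256" and "encoding/hex")
sum := sha256.Sum256(data)
hash := hex.EncodeToString(sum[:])

// Check for the content without storing it
if ds.HashExists(hash) {
	// Fetch the stored bytes directly by hash
	stored, err := ds.GetFromHash(hash)
	if err != nil {
		// Handle error
	}
	_ = stored // use the retrieved bytes
}
```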

## Dependencies

- [ourdb](../ourdb): For persistent storage of the actual data
- [radixtree](../radixtree): For efficient storage and retrieval of hash-to-ID mappings
196  pkg/data/dedupestor/dedupestor.go  Normal file
@@ -0,0 +1,196 @@
// Package dedupestor provides a key-value store with deduplication based on content hashing
package dedupestor

import (
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"path/filepath"

	"github.com/freeflowuniverse/heroagent/pkg/data/ourdb"
	"github.com/freeflowuniverse/heroagent/pkg/data/radixtree"
)

// MaxValueSize is the maximum allowed size for values (1MB)
const MaxValueSize = 1024 * 1024

// DedupeStore provides a key-value store with deduplication based on content hashing
type DedupeStore struct {
	Radix *radixtree.RadixTree // For storing hash -> id mappings
	Data  *ourdb.OurDB         // For storing the actual data
}

// NewArgs contains arguments for creating a new DedupeStore
type NewArgs struct {
	Path  string // Base path for the store
	Reset bool   // Whether to reset existing data
}

// New creates a new deduplication store
func New(args NewArgs) (*DedupeStore, error) {
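	// The store keeps two subdirectories under args.Path:
	// radixtree/ holds the hash -> metadata index, data/ holds the raw records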
	// Create the radixtree for hash -> id mapping
	rt, err := radixtree.New(radixtree.NewArgs{
		Path:  filepath.Join(args.Path, "radixtree"),
		Reset: args.Reset,
	})
	if err != nil {
		return nil, err
	}

	// Create the ourdb for actual data storage
	config := ourdb.DefaultConfig()
	config.Path = filepath.Join(args.Path, "data")
	config.RecordSizeMax = MaxValueSize
	config.IncrementalMode = true // We want auto-incrementing IDs
	config.Reset = args.Reset

	db, err := ourdb.New(config)
	if err != nil {
		return nil, err
	}

	return &DedupeStore{
		Radix: rt,
		Data:  db,
	}, nil
}

// Store stores data with its reference and returns its id.
// If the data already exists (same hash), the existing id is returned without
// storing the data again; the reference is appended to the hash's radix tree
// entry so that all users of the data are tracked.
func (ds *DedupeStore) Store(data []byte, ref Reference) (uint32, error) {
	// Check size limit
	if len(data) > MaxValueSize {
		return 0, errors.New("value size exceeds maximum allowed size of 1MB")
	}

	// Calculate the SHA-256 hash of the value (SHA-256 rather than blake2b,
	// for compatibility with the Go standard library)
	hash := sha256Sum(data)

	// Check if this hash already exists
	metadataBytes, err := ds.Radix.Get(hash)
	if err == nil {
		// Value already exists: add the new reference and return the id
		metadata := BytesToMetadata(metadataBytes)
		metadata, err = metadata.AddReference(ref)
		if err != nil {
			return 0, err
		}

		err = ds.Radix.Update(hash, metadata.ToBytes())
		if err != nil {
			return 0, err
		}

		return metadata.ID, nil
	}

	// Store the actual data in ourdb
	id, err := ds.Data.Set(ourdb.OurDBSetArgs{
		Data: data,
	})
	if err != nil {
		return 0, err
	}

	metadata := Metadata{
		ID:         id,
		References: []Reference{ref},
	}

	// Store the mapping of hash -> metadata in the radixtree
	err = ds.Radix.Set(hash, metadata.ToBytes())
	if err != nil {
		return 0, err
	}

	return id, nil
}

// Get retrieves a value by its ID
func (ds *DedupeStore) Get(id uint32) ([]byte, error) {
	return ds.Data.Get(id)
}

// GetFromHash retrieves a value by its hex-encoded SHA-256 hash
func (ds *DedupeStore) GetFromHash(hash string) ([]byte, error) {
	// Look up the metadata in the radixtree
	metadataBytes, err := ds.Radix.Get(hash)
	if err != nil {
		return nil, err
	}

	// Convert bytes back to metadata
	metadata := BytesToMetadata(metadataBytes)

	// Get the actual data from ourdb
	return ds.Data.Get(metadata.ID)
}

// IDExists checks if a value with the given ID exists
func (ds *DedupeStore) IDExists(id uint32) bool {
	_, err := ds.Data.Get(id)
	return err == nil
}

// HashExists checks if a value with the given hex-encoded SHA-256 hash exists
func (ds *DedupeStore) HashExists(hash string) bool {
	_, err := ds.Radix.Get(hash)
	return err == nil
}

// Delete removes a reference from the hash entry
// If it's the last reference, removes the hash entry and its data
func (ds *DedupeStore) Delete(id uint32, ref Reference) error {
	// Get the data to calculate its hash
	data, err := ds.Data.Get(id)
	if err != nil {
		return err
	}

	// Calculate hash of the value
	hash := sha256Sum(data)

	// Get the current entry from the radixtree
	metadataBytes, err := ds.Radix.Get(hash)
	if err != nil {
		return err
	}

	metadata := BytesToMetadata(metadataBytes)
	metadata, err = metadata.RemoveReference(ref)
	if err != nil {
		return err
	}

	if len(metadata.References) == 0 {
		// Delete from the radixtree
		err = ds.Radix.Delete(hash)
		if err != nil {
			return err
		}

		// Delete from the data db
		return ds.Data.Delete(id)
	}

	// Update the hash metadata
	return ds.Radix.Update(hash, metadata.ToBytes())
}

// Close closes the dedupe store
func (ds *DedupeStore) Close() error {
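	// Close both underlying stores even if the first Close fails,
	// then report the first error encountered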
	err1 := ds.Radix.Close()
	err2 := ds.Data.Close()

	if err1 != nil {
		return err1
	}
	return err2
}

// sha256Sum calculates the SHA-256 hash of data and returns it as a hex string
func sha256Sum(data []byte) string {
	hash := sha256.Sum256(data)
	return hex.EncodeToString(hash[:])
}
532  pkg/data/dedupestor/dedupestor_test.go  Normal file
@@ -0,0 +1,532 @@
package dedupestor

import (
	"bytes"
	"os"
	"path/filepath"
	"testing"
)

func setupTest(t *testing.T) {
	// Ensure test directories exist and are clean
	testDirs := []string{
		"/tmp/dedupestor_test",
		"/tmp/dedupestor_test_size",
		"/tmp/dedupestor_test_exists",
		"/tmp/dedupestor_test_multiple",
		"/tmp/dedupestor_test_refs",
	}

	for _, dir := range testDirs {
		if _, err := os.Stat(dir); err == nil {
			err := os.RemoveAll(dir)
			if err != nil {
				t.Fatalf("Failed to remove test directory %s: %v", dir, err)
			}
		}
		err := os.MkdirAll(dir, 0755)
		if err != nil {
			t.Fatalf("Failed to create test directory %s: %v", dir, err)
		}
	}
}

func TestBasicOperations(t *testing.T) {
	setupTest(t)

	ds, err := New(NewArgs{
		Path:  "/tmp/dedupestor_test",
		Reset: true,
	})
	if err != nil {
		t.Fatalf("Failed to create dedupe store: %v", err)
	}
	defer ds.Close()

	// Test storing and retrieving data
	value1 := []byte("test data 1")
	ref1 := Reference{Owner: 1, ID: 1}
	id1, err := ds.Store(value1, ref1)
	if err != nil {
		t.Fatalf("Failed to store data: %v", err)
	}

	retrieved1, err := ds.Get(id1)
	if err != nil {
		t.Fatalf("Failed to retrieve data: %v", err)
	}
	if !bytes.Equal(retrieved1, value1) {
		t.Fatalf("Retrieved data doesn't match stored data")
	}

	// Test deduplication with different reference
	ref2 := Reference{Owner: 1, ID: 2}
	id2, err := ds.Store(value1, ref2)
	if err != nil {
		t.Fatalf("Failed to store data with second reference: %v", err)
	}
	if id1 != id2 {
		t.Fatalf("Expected same ID for duplicate data, got %d and %d", id1, id2)
	}

	// Test different data gets different ID
	value2 := []byte("test data 2")
	ref3 := Reference{Owner: 1, ID: 3}
	id3, err := ds.Store(value2, ref3)
	if err != nil {
		t.Fatalf("Failed to store different data: %v", err)
	}
	if id1 == id3 {
		t.Fatalf("Expected different IDs for different data, got %d for both", id1)
	}

	retrieved2, err := ds.Get(id3)
	if err != nil {
		t.Fatalf("Failed to retrieve second data: %v", err)
	}
	if !bytes.Equal(retrieved2, value2) {
		t.Fatalf("Retrieved data doesn't match second stored data")
	}
}

func TestSizeLimit(t *testing.T) {
	setupTest(t)

	ds, err := New(NewArgs{
		Path:  "/tmp/dedupestor_test_size",
		Reset: true,
	})
	if err != nil {
		t.Fatalf("Failed to create dedupe store: %v", err)
	}
	defer ds.Close()

	// Test data under size limit (1KB)
	smallData := make([]byte, 1024)
	for i := range smallData {
		smallData[i] = byte(i % 256)
	}
	ref := Reference{Owner: 1, ID: 1}
	smallID, err := ds.Store(smallData, ref)
	if err != nil {
		t.Fatalf("Failed to store small data: %v", err)
	}

	retrieved, err := ds.Get(smallID)
	if err != nil {
		t.Fatalf("Failed to retrieve small data: %v", err)
	}
	if !bytes.Equal(retrieved, smallData) {
		t.Fatalf("Retrieved data doesn't match stored small data")
	}

	// Test data over size limit (2MB)
	largeData := make([]byte, 2*1024*1024)
	for i := range largeData {
		largeData[i] = byte(i % 256)
	}
	_, err = ds.Store(largeData, ref)
	if err == nil {
		t.Fatalf("Expected error for data exceeding size limit")
	}
}

func TestExists(t *testing.T) {
	setupTest(t)

	ds, err := New(NewArgs{
		Path:  "/tmp/dedupestor_test_exists",
		Reset: true,
	})
	if err != nil {
		t.Fatalf("Failed to create dedupe store: %v", err)
	}
	defer ds.Close()

	value := []byte("test data")
	ref := Reference{Owner: 1, ID: 1}
	id, err := ds.Store(value, ref)
	if err != nil {
		t.Fatalf("Failed to store data: %v", err)
	}

	if !ds.IDExists(id) {
		t.Fatalf("IDExists returned false for existing ID")
	}
	if ds.IDExists(99) {
		t.Fatalf("IDExists returned true for non-existent ID")
	}

	// Calculate hash to test HashExists
	data, err := ds.Get(id)
	if err != nil {
		t.Fatalf("Failed to get data: %v", err)
	}
	hash := sha256Sum(data)

	if !ds.HashExists(hash) {
		t.Fatalf("HashExists returned false for existing hash")
	}
	if ds.HashExists("nonexistenthash") {
		t.Fatalf("HashExists returned true for non-existent hash")
	}
}

func TestMultipleOperations(t *testing.T) {
	setupTest(t)

	ds, err := New(NewArgs{
		Path:  "/tmp/dedupestor_test_multiple",
		Reset: true,
	})
	if err != nil {
		t.Fatalf("Failed to create dedupe store: %v", err)
	}
	defer ds.Close()

	// Store multiple values
	values := [][]byte{}
	ids := []uint32{}

	for i := 0; i < 5; i++ {
		value := []byte("test data " + string(rune('0'+i)))
		values = append(values, value)
		ref := Reference{Owner: 1, ID: uint32(i)}
		id, err := ds.Store(value, ref)
		if err != nil {
			t.Fatalf("Failed to store data %d: %v", i, err)
		}
		ids = append(ids, id)
	}

	// Verify all values can be retrieved
	for i, id := range ids {
		retrieved, err := ds.Get(id)
		if err != nil {
			t.Fatalf("Failed to retrieve data %d: %v", i, err)
		}
		if !bytes.Equal(retrieved, values[i]) {
			t.Fatalf("Retrieved data %d doesn't match stored data", i)
		}
	}

	// Test deduplication by storing same values again
	for i, value := range values {
		ref := Reference{Owner: 2, ID: uint32(i)}
		id, err := ds.Store(value, ref)
		if err != nil {
			t.Fatalf("Failed to store duplicate data %d: %v", i, err)
		}
		if id != ids[i] {
			t.Fatalf("Expected same ID for duplicate data %d, got %d and %d", i, ids[i], id)
		}
	}
}

func TestReferences(t *testing.T) {
	setupTest(t)

	ds, err := New(NewArgs{
		Path:  "/tmp/dedupestor_test_refs",
		Reset: true,
	})
	if err != nil {
		t.Fatalf("Failed to create dedupe store: %v", err)
	}
	defer ds.Close()

	// Store same data with different references
	value := []byte("test data")
	ref1 := Reference{Owner: 1, ID: 1}
	ref2 := Reference{Owner: 1, ID: 2}
	ref3 := Reference{Owner: 2, ID: 1}

	// Store with first reference
	id, err := ds.Store(value, ref1)
	if err != nil {
		t.Fatalf("Failed to store data with first reference: %v", err)
	}

	// Store same data with second reference
	id2, err := ds.Store(value, ref2)
	if err != nil {
		t.Fatalf("Failed to store data with second reference: %v", err)
	}
	if id != id2 {
		t.Fatalf("Expected same ID for same data, got %d and %d", id, id2)
	}

	// Store same data with third reference
	id3, err := ds.Store(value, ref3)
	if err != nil {
		t.Fatalf("Failed to store data with third reference: %v", err)
	}
	if id != id3 {
		t.Fatalf("Expected same ID for same data, got %d and %d", id, id3)
	}

	// Delete first reference - data should still exist
	err = ds.Delete(id, ref1)
	if err != nil {
		t.Fatalf("Failed to delete first reference: %v", err)
	}
	if !ds.IDExists(id) {
		t.Fatalf("Data should still exist after deleting first reference")
	}

	// Delete second reference - data should still exist
	err = ds.Delete(id, ref2)
	if err != nil {
		t.Fatalf("Failed to delete second reference: %v", err)
	}
	if !ds.IDExists(id) {
		t.Fatalf("Data should still exist after deleting second reference")
	}

	// Delete last reference - data should be gone
	err = ds.Delete(id, ref3)
	if err != nil {
		t.Fatalf("Failed to delete third reference: %v", err)
	}
	if ds.IDExists(id) {
		t.Fatalf("Data should be deleted after removing all references")
	}

	// Verify data is actually deleted by trying to get it
	_, err = ds.Get(id)
	if err == nil {
		t.Fatalf("Expected error getting deleted data")
	}
}

func TestMetadataConversion(t *testing.T) {
	// Test Reference conversion
	ref := Reference{
		Owner: 12345,
		ID:    67890,
	}

	bytes := ref.ToBytes()
	recovered := BytesToReference(bytes)

	if ref.Owner != recovered.Owner || ref.ID != recovered.ID {
		t.Fatalf("Reference conversion failed: original %+v, recovered %+v", ref, recovered)
	}

	// Test Metadata conversion
	metadata := Metadata{
		ID:         42,
		References: []Reference{},
	}

	ref1 := Reference{Owner: 1, ID: 100}
	ref2 := Reference{Owner: 2, ID: 200}

	metadata, err := metadata.AddReference(ref1)
	if err != nil {
		t.Fatalf("Failed to add reference: %v", err)
	}
	metadata, err = metadata.AddReference(ref2)
	if err != nil {
		t.Fatalf("Failed to add reference: %v", err)
	}

	bytes = metadata.ToBytes()
	recovered2 := BytesToMetadata(bytes)

	if metadata.ID != recovered2.ID || len(metadata.References) != len(recovered2.References) {
		t.Fatalf("Metadata conversion failed: original %+v, recovered %+v", metadata, recovered2)
	}

	for i, ref := range metadata.References {
		if ref.Owner != recovered2.References[i].Owner || ref.ID != recovered2.References[i].ID {
			t.Fatalf("Reference in metadata conversion failed at index %d", i)
		}
	}
}

func TestAddRemoveReference(t *testing.T) {
	metadata := Metadata{
		ID:         1,
		References: []Reference{},
	}

	ref1 := Reference{Owner: 1, ID: 100}
	ref2 := Reference{Owner: 2, ID: 200}

	// Add first reference
	metadata, err := metadata.AddReference(ref1)
	if err != nil {
		t.Fatalf("Failed to add first reference: %v", err)
	}
	if len(metadata.References) != 1 {
		t.Fatalf("Expected 1 reference after adding first, got %d", len(metadata.References))
	}
	if metadata.References[0].Owner != ref1.Owner || metadata.References[0].ID != ref1.ID {
		t.Fatalf("First reference not added correctly")
	}

	// Add second reference
	metadata, err = metadata.AddReference(ref2)
	if err != nil {
		t.Fatalf("Failed to add second reference: %v", err)
	}
	if len(metadata.References) != 2 {
		t.Fatalf("Expected 2 references after adding second, got %d", len(metadata.References))
	}

	// Try adding duplicate reference
	metadata, err = metadata.AddReference(ref1)
	if err != nil {
		t.Fatalf("Failed to add duplicate reference: %v", err)
	}
	if len(metadata.References) != 2 {
		t.Fatalf("Expected 2 references after adding duplicate, got %d", len(metadata.References))
	}

	// Remove first reference
	metadata, err = metadata.RemoveReference(ref1)
	if err != nil {
		t.Fatalf("Failed to remove first reference: %v", err)
	}
	if len(metadata.References) != 1 {
		t.Fatalf("Expected 1 reference after removing first, got %d", len(metadata.References))
	}
	if metadata.References[0].Owner != ref2.Owner || metadata.References[0].ID != ref2.ID {
		t.Fatalf("Wrong reference removed")
	}

	// Remove non-existent reference
	metadata, err = metadata.RemoveReference(Reference{Owner: 999, ID: 999})
	if err != nil {
		t.Fatalf("Failed to remove non-existent reference: %v", err)
	}
	if len(metadata.References) != 1 {
		t.Fatalf("Expected 1 reference after removing non-existent, got %d", len(metadata.References))
	}

	// Remove last reference
	metadata, err = metadata.RemoveReference(ref2)
	if err != nil {
		t.Fatalf("Failed to remove last reference: %v", err)
	}
	if len(metadata.References) != 0 {
		t.Fatalf("Expected 0 references after removing last, got %d", len(metadata.References))
	}
}

func TestEmptyMetadataBytes(t *testing.T) {
	empty := BytesToMetadata([]byte{})
	if empty.ID != 0 || len(empty.References) != 0 {
		t.Fatalf("Expected empty metadata, got %+v", empty)
	}
}

func TestDeduplicationSize(t *testing.T) {
	testDir := "/tmp/dedupestor_test_dedup_size"

	// Clean up test directory
	if _, err := os.Stat(testDir); err == nil {
		os.RemoveAll(testDir)
	}
	os.MkdirAll(testDir, 0755)

	// Create a new dedupe store
	ds, err := New(NewArgs{
		Path:  testDir,
		Reset: true,
	})
	if err != nil {
		t.Fatalf("Failed to create dedupe store: %v", err)
	}
	defer ds.Close()

	// Store a large piece of data (100KB)
	largeData := make([]byte, 100*1024)
	for i := range largeData {
		largeData[i] = byte(i % 256)
	}

	// Store the data with first reference
	ref1 := Reference{Owner: 1, ID: 1}
	id1, err := ds.Store(largeData, ref1)
	if err != nil {
		t.Fatalf("Failed to store data with first reference: %v", err)
	}

	// Get the size of the data directory after first store
	dataDir := testDir + "/data"
	sizeAfterFirst, err := getDirSize(dataDir)
	if err != nil {
		t.Fatalf("Failed to get directory size: %v", err)
	}
	t.Logf("Size after first store: %d bytes", sizeAfterFirst)

	// Store the same data with different references multiple times
	for i := 2; i <= 10; i++ {
		ref := Reference{Owner: uint16(i), ID: uint32(i)}
		id, err := ds.Store(largeData, ref)
		if err != nil {
			t.Fatalf("Failed to store data with reference %d: %v", i, err)
		}

		// Verify we get the same ID (deduplication is working)
		if id != id1 {
			t.Fatalf("Expected same ID for duplicate data, got %d and %d", id1, id)
		}
	}

	// Get the size after storing the same data multiple times
	sizeAfterMultiple, err := getDirSize(dataDir)
	if err != nil {
		t.Fatalf("Failed to get directory size: %v", err)
	}
	t.Logf("Size after storing same data 10 times: %d bytes", sizeAfterMultiple)

	// The size should be approximately the same (allowing for metadata overhead).
	// We'll check that it hasn't grown significantly (less than 10% increase).
	if sizeAfterMultiple > sizeAfterFirst*110/100 {
		t.Fatalf("Directory size grew significantly after storing duplicate data: %d -> %d bytes",
			sizeAfterFirst, sizeAfterMultiple)
	}

	// Now store different data
	differentData := make([]byte, 100*1024)
	for i := range differentData {
		differentData[i] = byte((i + 128) % 256) // Different pattern
	}

	ref11 := Reference{Owner: 11, ID: 11}
	_, err = ds.Store(differentData, ref11)
	if err != nil {
		t.Fatalf("Failed to store different data: %v", err)
	}

	// Get the size after storing different data
	sizeAfterDifferent, err := getDirSize(dataDir)
	if err != nil {
		t.Fatalf("Failed to get directory size: %v", err)
	}
	t.Logf("Size after storing different data: %d bytes", sizeAfterDifferent)

	// The size should have increased significantly
	if sizeAfterDifferent <= sizeAfterMultiple*110/100 {
		t.Fatalf("Directory size didn't grow as expected after storing different data: %d -> %d bytes",
			sizeAfterMultiple, sizeAfterDifferent)
	}
}

// getDirSize returns the total size of all files in a directory in bytes
func getDirSize(path string) (int64, error) {
	var size int64
	err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if !info.IsDir() {
			size += info.Size()
		}
		return nil
	})
	return size, err
}
123  pkg/data/dedupestor/metadata.go  Normal file
@@ -0,0 +1,123 @@
// Package dedupestor provides a key-value store with deduplication based on content hashing
package dedupestor

import (
	"encoding/binary"
)

// Metadata represents a stored value with its ID and references
type Metadata struct {
	ID         uint32      // ID of the stored data in the database
	References []Reference // List of references to this data
}

// Reference represents a reference to stored data
type Reference struct {
	Owner uint16 // Owner identifier
	ID    uint32 // Reference identifier
}

// ToBytes converts Metadata to bytes for storage
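// Layout is little-endian: 4 bytes of ID followed by 6 bytes per reference.
// A worked example: Metadata{ID: 42, References: []Reference{{Owner: 1, ID: 100}}}
// serializes to the 10 bytes
//   2A 00 00 00 | 01 00 | 64 00 00 00
//   (ID = 42)    (Owner)  (Reference ID = 100)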
func (m Metadata) ToBytes() []byte {
	// Calculate size: 4 bytes for ID + 6 bytes per reference
	size := 4 + (len(m.References) * 6)
	result := make([]byte, size)

	// Write ID (4 bytes)
	binary.LittleEndian.PutUint32(result[0:4], m.ID)

	// Write references (6 bytes each)
	offset := 4
	for _, ref := range m.References {
		refBytes := ref.ToBytes()
		copy(result[offset:offset+6], refBytes)
		offset += 6
	}

	return result
}

// BytesToMetadata converts bytes back to Metadata
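// Input shorter than 4 bytes yields an empty Metadata; trailing bytes that
// do not form a complete 6-byte reference are ignored.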
func BytesToMetadata(b []byte) Metadata {
	if len(b) < 4 {
		return Metadata{
			ID:         0,
			References: []Reference{},
		}
	}

	id := binary.LittleEndian.Uint32(b[0:4])
	refs := []Reference{}

	// Parse references (each reference is 6 bytes)
	for i := 4; i < len(b); i += 6 {
		if i+6 <= len(b) {
			refs = append(refs, BytesToReference(b[i:i+6]))
		}
	}

	return Metadata{
		ID:         id,
		References: refs,
	}
}

// AddReference adds a new reference if it doesn't already exist
func (m Metadata) AddReference(ref Reference) (Metadata, error) {
	// Check if the reference already exists
	for _, existing := range m.References {
		if existing.Owner == ref.Owner && existing.ID == ref.ID {
			return m, nil
		}
	}

	// Add the new reference
	newRefs := append(m.References, ref)
	return Metadata{
		ID:         m.ID,
		References: newRefs,
	}, nil
}

// RemoveReference removes a reference if it exists
func (m Metadata) RemoveReference(ref Reference) (Metadata, error) {
	newRefs := []Reference{}
	for _, existing := range m.References {
		if existing.Owner != ref.Owner || existing.ID != ref.ID {
			newRefs = append(newRefs, existing)
		}
	}

	return Metadata{
		ID:         m.ID,
		References: newRefs,
	}, nil
}

// ToBytes converts Reference to bytes
func (r Reference) ToBytes() []byte {
	result := make([]byte, 6)

	// Write owner (2 bytes)
	binary.LittleEndian.PutUint16(result[0:2], r.Owner)

	// Write ID (4 bytes)
	binary.LittleEndian.PutUint32(result[2:6], r.ID)

	return result
}

// BytesToReference converts bytes to Reference
func BytesToReference(b []byte) Reference {
	if len(b) < 6 {
		return Reference{}
	}

	owner := binary.LittleEndian.Uint16(b[0:2])
	id := binary.LittleEndian.Uint32(b[2:6])

	return Reference{
		Owner: owner,
		ID:    id,
	}
}