From a67db029b44e08e3ce978b4e57ae7b86f369f1a1 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Mon, 30 Dec 2024 10:29:09 +0100 Subject: [PATCH 01/32] WIP --- buffer_test.go | 18 ++-- cache.go | 23 +++--- distribution_test.go | 3 +- examples/batch/main.go | 4 +- examples/buffering/main.go | 4 +- .../distributed-early-refreshes/client.go | 9 +- examples/distribution/client.go | 9 +- examples/generics/main.go | 4 +- examples/missing/main.go | 4 +- examples/permutations/main.go | 4 +- examples/refreshes/main.go | 4 +- fetch.go | 82 +++++++++++++++---- fetch_test.go | 34 +++++--- metrics.go | 16 ++-- options.go | 11 ++- options_test.go | 22 ++++- passthrough_test.go | 2 +- refresh.go | 2 +- shard.go | 43 ++++++---- sturdyc_test.go | 29 ++++--- 20 files changed, 224 insertions(+), 103 deletions(-) diff --git a/buffer_test.go b/buffer_test.go index 3aa6c4e..071ca7b 100644 --- a/buffer_test.go +++ b/buffer_test.go @@ -20,6 +20,7 @@ func TestBatchIsRefreshedWhenTheTimeoutExpires(t *testing.T) { evictionPercentage := 10 minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 30 refreshRetryInterval := time.Millisecond * 10 batchSize := 10 batchBufferTimeout := time.Minute @@ -34,7 +35,7 @@ func TestBatchIsRefreshedWhenTheTimeoutExpires(t *testing.T) { // 2. The 'batchBufferTimeout' threshold is exceeded. 
client := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithClock(clock), @@ -86,6 +87,7 @@ func TestBatchIsRefreshedWhenTheBufferSizeIsReached(t *testing.T) { ttl := time.Hour minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 30 refreshRetryInterval := time.Millisecond * 10 batchSize := 10 batchBufferTimeout := time.Minute @@ -100,7 +102,7 @@ func TestBatchIsRefreshedWhenTheBufferSizeIsReached(t *testing.T) { // 2. The 'batchBufferTimeout' threshold is exceeded. client := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithClock(clock), @@ -180,6 +182,7 @@ func TestBatchIsNotRefreshedByDuplicates(t *testing.T) { evictionPercentage := 10 minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 30 refreshRetryInterval := time.Millisecond * 10 batchSize := 10 batchBufferTimeout := time.Minute @@ -194,7 +197,7 @@ func TestBatchIsNotRefreshedByDuplicates(t *testing.T) { // 2. The 'batchBufferTimeout' threshold is exceeded. 
client := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithClock(clock), @@ -250,6 +253,7 @@ func TestBatchesAreGroupedByPermutations(t *testing.T) { evictionPercentage := 15 minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 30 refreshRetryInterval := time.Millisecond * 10 batchSize := 5 batchBufferTimeout := time.Minute @@ -264,7 +268,7 @@ func TestBatchesAreGroupedByPermutations(t *testing.T) { // 2. The 'batchBufferTimeout' threshold is exceeded. c := sturdyc.New[any](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithClock(clock), @@ -339,6 +343,7 @@ func TestLargeBatchesAreChunkedCorrectly(t *testing.T) { evictionPercentage := 23 minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 30 refreshRetryInterval := time.Millisecond * 10 batchSize := 5 batchBufferTimeout := time.Minute @@ -353,7 +358,7 @@ func TestLargeBatchesAreChunkedCorrectly(t *testing.T) { // 2. The 'batchBufferTimeout' threshold is exceeded. 
client := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithClock(clock), @@ -401,6 +406,7 @@ func TestValuesAreUpdatedCorrectly(t *testing.T) { evictionPercentage := 10 minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 50 refreshRetryInterval := time.Millisecond * 10 batchSize := 10 batchBufferTimeout := time.Minute @@ -415,7 +421,7 @@ func TestValuesAreUpdatedCorrectly(t *testing.T) { // 2. The 'batchBufferTimeout' threshold is exceeded. client := sturdyc.New[any](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithClock(clock), diff --git a/cache.go b/cache.go index 239b71c..e63a5e5 100644 --- a/cache.go +++ b/cache.go @@ -29,11 +29,12 @@ type Config struct { metricsRecorder DistributedMetricsRecorder log Logger - refreshInBackground bool - minRefreshTime time.Duration - maxRefreshTime time.Duration - retryBaseDelay time.Duration - storeMissingRecords bool + earlyRefreshes bool + minRefreshTime time.Duration + maxRefreshTime time.Duration + synchronousRefreshTime time.Duration + retryBaseDelay time.Duration + storeMissingRecords bool bufferRefreshes bool batchMutex sync.Mutex @@ -127,11 +128,11 @@ func (c *Client[T]) getShard(key string) *shard[T] { // getWithState retrieves a single value from the cache and 
returns additional // information about the state of the record. The state includes whether the record // exists, if it has been marked as missing, and if it is due for a refresh. -func (c *Client[T]) getWithState(key string) (value T, exists, markedAsMissing, refresh bool) { +func (c *Client[T]) getWithState(key string) (value T, exists, markedAsMissing, backgroundRefresh, synchronousRefresh bool) { shard := c.getShard(key) - val, exists, markedAsMissing, refresh := shard.get(key) - c.reportCacheHits(exists, markedAsMissing, refresh) - return val, exists, markedAsMissing, refresh + val, exists, markedAsMissing, backgroundRefresh, synchronousRefresh := shard.get(key) + c.reportCacheHits(exists, markedAsMissing, backgroundRefresh, synchronousRefresh) + return val, exists, markedAsMissing, backgroundRefresh, synchronousRefresh } // Get retrieves a single value from the cache. @@ -145,8 +146,8 @@ func (c *Client[T]) getWithState(key string) (value T, exists, markedAsMissing, // The value corresponding to the key and a boolean indicating if the value was found. 
func (c *Client[T]) Get(key string) (T, bool) { shard := c.getShard(key) - val, ok, markedAsMissing, refresh := shard.get(key) - c.reportCacheHits(ok, markedAsMissing, refresh) + val, ok, markedAsMissing, backgroundRefresh, synchronousRefresh := shard.get(key) + c.reportCacheHits(ok, markedAsMissing, backgroundRefresh, synchronousRefresh) return val, ok && !markedAsMissing } diff --git a/distribution_test.go b/distribution_test.go index a9fd468..49fe989 100644 --- a/distribution_test.go +++ b/distribution_test.go @@ -779,6 +779,7 @@ func TestPartialResponseForRefreshesDoesNotResultInMissingRecords(t *testing.T) ttl := time.Hour minRefreshDelay := time.Minute * 5 maxRefreshDelay := time.Minute * 10 + synchronousRefreshDelay := time.Minute * 30 refreshRetryInterval := time.Millisecond * 10 batchSize := 10 batchBufferTimeout := time.Minute @@ -788,7 +789,7 @@ func TestPartialResponseForRefreshesDoesNotResultInMissingRecords(t *testing.T) c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), sturdyc.WithDistributedStorageEarlyRefreshes(distributedStorage, refreshAfter), diff --git a/examples/batch/main.go b/examples/batch/main.go index 7f68cfe..79ca1c6 100644 --- a/examples/batch/main.go +++ b/examples/batch/main.go @@ -54,12 +54,14 @@ func main() { // used to spread out the refreshes for entries evenly over time. minRefreshDelay := time.Second maxRefreshDelay := time.Second * 2 + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. + synchronousRefreshDelay := time.Second * 30 // The base for exponential backoff when retrying a refresh. 
retryBaseDelay := time.Millisecond * 10 // Create a cache client with the specified configuration. cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), ) // Create a new API instance with the cache client. diff --git a/examples/buffering/main.go b/examples/buffering/main.go index 98ad1ee..25c2d67 100644 --- a/examples/buffering/main.go +++ b/examples/buffering/main.go @@ -56,6 +56,8 @@ func main() { // used to spread out the refreshes for entries evenly over time. minRefreshDelay := time.Second maxRefreshDelay := time.Second * 2 + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. + synchronousRefreshDelay := time.Second * 30 // The base for exponential backoff when retrying a refresh. retryBaseDelay := time.Millisecond * 10 // Whether to store misses in the sturdyc. This can be useful to @@ -68,7 +70,7 @@ func main() { // Create a new cache client with the specified configuration. cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), ) diff --git a/examples/distributed-early-refreshes/client.go b/examples/distributed-early-refreshes/client.go index dbda4c3..dadbef8 100644 --- a/examples/distributed-early-refreshes/client.go +++ b/examples/distributed-early-refreshes/client.go @@ -19,9 +19,10 @@ const ( // Configuration for the early in-memory refreshes. 
const ( - minRefreshTime = 2 * time.Second - maxRefreshTime = 4 * time.Second - retryBaseDelay = 5 * time.Second + minRefreshTime = 2 * time.Second + maxRefreshTime = 4 * time.Second + synchronousRefreshTime = 30 * time.Second + retryBaseDelay = 5 * time.Second ) // Configuration for the refresh coalescing. @@ -36,7 +37,7 @@ const refreshAfter = time.Second func newAPIClient(distributedStorage sturdyc.DistributedStorageWithDeletions) *apiClient { return &apiClient{ cache: sturdyc.New[any](capacity, numberOfShards, ttl, percentageOfRecordsToEvictWhenFull, - sturdyc.WithEarlyRefreshes(minRefreshTime, maxRefreshTime, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshTime, maxRefreshTime, synchronousRefreshTime, retryBaseDelay), sturdyc.WithRefreshCoalescing(idealBufferSize, bufferTimeout), sturdyc.WithDistributedStorageEarlyRefreshes(distributedStorage, refreshAfter), // NOTE: Uncommenting this line will make the cache mark the records as diff --git a/examples/distribution/client.go b/examples/distribution/client.go index 0753941..0c348eb 100644 --- a/examples/distribution/client.go +++ b/examples/distribution/client.go @@ -22,9 +22,10 @@ const ( // Configuration for the early in-memory refreshes. const ( - minRefreshTime = 100 * time.Millisecond - maxRefreshTime = 500 * time.Millisecond - retryBaseDelay = time.Second + minRefreshTime = 100 * time.Millisecond + maxRefreshTime = 500 * time.Millisecond + synchronousRefreshTime = 30 * time.Second + retryBaseDelay = time.Second ) // Configuration for the refresh coalescing. 
@@ -37,7 +38,7 @@ func newAPIClient(distributedStorage sturdyc.DistributedStorage) *apiClient { return &apiClient{ cache: sturdyc.New[any](capacity, numberOfShards, ttl, percentageOfRecordsToEvictWhenFull, sturdyc.WithMissingRecordStorage(), - sturdyc.WithEarlyRefreshes(minRefreshTime, maxRefreshTime, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshTime, maxRefreshTime, synchronousRefreshTime, retryBaseDelay), sturdyc.WithRefreshCoalescing(idealBufferSize, bufferTimeout), sturdyc.WithDistributedStorage(distributedStorage), ), diff --git a/examples/generics/main.go b/examples/generics/main.go index 6d98565..a683d77 100644 --- a/examples/generics/main.go +++ b/examples/generics/main.go @@ -54,12 +54,14 @@ func main() { // used to spread out the refreshes for entries evenly over time. minRefreshDelay := time.Second maxRefreshDelay := time.Second * 2 + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. + synchronousRefreshDelay := time.Second * 30 // The base for exponential backoff when retrying a refresh. retryBaseDelay := time.Millisecond * 10 // Create a new cache client with the specified configuration. cacheClient := sturdyc.New[any](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), sturdyc.WithRefreshCoalescing(10, time.Second*15), ) diff --git a/examples/missing/main.go b/examples/missing/main.go index 6347ed8..dcdb042 100644 --- a/examples/missing/main.go +++ b/examples/missing/main.go @@ -51,12 +51,14 @@ func main() { // used to spread out the refreshes for entries evenly over time. minRefreshDelay := time.Millisecond * 10 maxRefreshDelay := time.Millisecond * 30 + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. 
+ synchronousRefreshDelay := time.Second * 30 // The base for exponential backoff when retrying a refresh. retryBaseDelay := time.Millisecond * 10 // Create a cache client with the specified configuration. cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), sturdyc.WithMissingRecordStorage(), ) diff --git a/examples/permutations/main.go b/examples/permutations/main.go index 42c0153..dae93ea 100644 --- a/examples/permutations/main.go +++ b/examples/permutations/main.go @@ -56,12 +56,14 @@ func main() { // used to spread out the refreshes for entries evenly over time. minRefreshDelay := time.Second maxRefreshDelay := time.Second * 2 + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. + synchronousRefreshDelay := time.Second * 30 // The base for exponential backoff when retrying a refresh. retryBaseDelay := time.Millisecond * 10 // Create a new cache client with the specified configuration. cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), ) // We will fetch these IDs using various option sets, meaning that the ID alone diff --git a/examples/refreshes/main.go b/examples/refreshes/main.go index ed47b8d..19f7722 100644 --- a/examples/refreshes/main.go +++ b/examples/refreshes/main.go @@ -49,6 +49,8 @@ func main() { // We don't want our outgoing requests graph to look like a comb. minRefreshDelay := time.Millisecond * 10 maxRefreshDelay := time.Millisecond * 30 + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. 
+ synchronousRefreshDelay := time.Second * 30 // The base used for exponential backoff when retrying a refresh. Most of the // time, we perform refreshes well in advance of the records expiry time. // Hence, we can use this to make it easier for a system that is having @@ -60,7 +62,7 @@ func main() { // Create a cache client with the specified configuration. cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryBaseDelay), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), ) // Create a new API instance with the cache client. diff --git a/fetch.go b/fetch.go index 2ce31ca..f9b9764 100644 --- a/fetch.go +++ b/fetch.go @@ -6,24 +6,30 @@ import ( "maps" ) -func (c *Client[T]) groupIDs(ids []string, keyFn KeyFn) (hits map[string]T, misses, refreshes []string) { +func (c *Client[T]) groupIDs(ids []string, keyFn KeyFn) (hits map[string]T, misses, backgroundRefreshes, synchronousRefreshes []string) { hits = make(map[string]T) misses = make([]string, 0) - refreshes = make([]string, 0) + backgroundRefreshes = make([]string, 0) + synchronousRefreshes = make([]string, 0) for _, id := range ids { key := keyFn(id) - value, exists, markedAsMissing, shouldRefresh := c.getWithState(key) + value, exists, markedAsMissing, backgroundRefresh, synchronousRefresh := c.getWithState(key) + + if synchronousRefresh { + synchronousRefreshes = append(synchronousRefreshes, id) + } // Check if the record should be refreshed in the background. - if shouldRefresh { - refreshes = append(refreshes, id) + if backgroundRefresh && !synchronousRefresh { + backgroundRefreshes = append(backgroundRefreshes, id) } if markedAsMissing { continue } + // If the record should be synchronously refreshed, it's going to be added to both the hits and misses maps. 
if !exists { misses = append(misses, id) continue @@ -31,16 +37,36 @@ func (c *Client[T]) groupIDs(ids []string, keyFn KeyFn) (hits map[string]T, miss hits[id] = value } - return hits, misses, refreshes + return hits, misses, backgroundRefreshes, synchronousRefreshes } func getFetch[V, T any](ctx context.Context, c *Client[T], key string, fetchFn FetchFn[V]) (T, error) { + value, ok, markedAsMissing, backgroundRefresh, synchronousRefresh := c.getWithState(key) wrappedFetch := wrap[T](distributedFetch(c, key, fetchFn)) - // Begin by checking if we have the item in our cache. - value, ok, markedAsMissing, shouldRefresh := c.getWithState(key) + if synchronousRefresh { + res, err := callAndCache(ctx, c, key, wrappedFetch) + // Check if the record has been deleted at the source. If it has, we'll + // delete it from the cache too. NOTE: The callAndCache function converts + // ErrNotFound to ErrMissingRecord. + if ok && !markedAsMissing && errors.Is(err, ErrMissingRecord) { + c.Delete(key) + } + + if errors.Is(err, ErrMissingRecord) { + return res, err + } + + // If the call to synchrounously refresh the record failed, + // we'll return the latest value if we have it in the cache. + if err != nil && ok { + return value, nil + } - if shouldRefresh { + return res, err + } + + if backgroundRefresh { c.safeGo(func() { c.refresh(key, wrappedFetch) }) @@ -99,26 +125,50 @@ func GetOrFetch[V, T any](ctx context.Context, c *Client[T], key string, fetchFn func getFetchBatch[V, T any](ctx context.Context, c *Client[T], ids []string, keyFn KeyFn, fetchFn BatchFetchFn[V]) (map[string]T, error) { wrappedFetch := wrapBatch[T](distributedBatchFetch[V, T](c, keyFn, fetchFn)) - cachedRecords, cacheMisses, idsToRefresh := c.groupIDs(ids, keyFn) + cachedRecords, cacheMisses, idsToBackgroundRefresh, idsToSynchronouslyRefresh := c.groupIDs(ids, keyFn) - // If any records need to be refreshed, we'll do so in the background. - if len(idsToRefresh) > 0 { + // Schedule background refreshes. 
+ if len(idsToBackgroundRefresh) > 0 { c.safeGo(func() { if c.bufferRefreshes { - bufferBatchRefresh(c, idsToRefresh, keyFn, wrappedFetch) + bufferBatchRefresh(c, idsToBackgroundRefresh, keyFn, wrappedFetch) return } - c.refreshBatch(idsToRefresh, keyFn, wrappedFetch) + c.refreshBatch(idsToBackgroundRefresh, keyFn, wrappedFetch) }) } // If we were able to retrieve all records from the cache, we can return them straight away. - if len(cacheMisses) == 0 { + if len(cacheMisses) == 0 && len(idsToSynchronouslyRefresh) == 0 { return cachedRecords, nil } - callBatchOpts := callBatchOpts[T, T]{ids: cacheMisses, keyFn: keyFn, fn: wrappedFetch} + // Create a list of the IDs that we're going to fetch from the underlying data source or distributed storage. + cacheMissesAndSyncRefreshes := make([]string, 0, len(cacheMisses)+len(idsToSynchronouslyRefresh)) + cacheMissesAndSyncRefreshes = append(cacheMissesAndSyncRefreshes, cacheMisses...) + cacheMissesAndSyncRefreshes = append(cacheMissesAndSyncRefreshes, idsToSynchronouslyRefresh...) + + callBatchOpts := callBatchOpts[T, T]{ids: cacheMissesAndSyncRefreshes, keyFn: keyFn, fn: wrappedFetch} response, err := callAndCacheBatch(ctx, c, callBatchOpts) + + // If we did a call to synchronously refresh some of the records, and it + // didn't fail, we'll have to check if any of the IDs have been deleted at + // the underlying data source. If they have, we'll have to delete them from + // the cache and remove them from the cachedRecords map so that we don't + // return them. + if err == nil && len(idsToSynchronouslyRefresh) > 0 { + for _, id := range idsToSynchronouslyRefresh { + // If we have it in the cache, but not in the response, it means + // that the ID no longer exists at the underlying data source. 
+ _, okResponse := response[id] + _, okCache := cachedRecords[id] + if okCache && !okResponse { + delete(cachedRecords, id) + c.Delete(keyFn(id)) + } + } + } + if err != nil && !errors.Is(err, ErrOnlyCachedRecords) { if len(cachedRecords) > 0 { return cachedRecords, ErrOnlyCachedRecords diff --git a/fetch_test.go b/fetch_test.go index 32f4a32..9c60126 100644 --- a/fetch_test.go +++ b/fetch_test.go @@ -61,12 +61,13 @@ func TestGetOrFetchStampedeProtection(t *testing.T) { clock := sturdyc.NewTestClock(time.Now()) minRefreshDelay := time.Millisecond * 500 maxRefreshDelay := time.Millisecond * 500 + synchronousRefreshDelay := time.Second refreshRetryInterval := time.Millisecond * 10 // The cache is going to have a 2 second TTL, and the first refresh should happen within a second. c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), ) @@ -112,12 +113,13 @@ func TestGetOrFetchRefreshRetries(t *testing.T) { evictionPercentage := 10 minRefreshDelay := time.Second maxRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 retryInterval := time.Millisecond * 10 clock := sturdyc.NewTestClock(time.Now()) c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), ) @@ -167,12 +169,13 @@ func TestGetOrFetchMissingRecord(t *testing.T) { evictionPercentage := 20 minRefreshDelay := time.Second maxRefreshDelay := time.Second * 2 + synchronousRefreshDelay := 
time.Second * 10 retryInterval := time.Millisecond * 10 clock := sturdyc.NewTestClock(time.Now()) c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), sturdyc.WithClock(clock), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryInterval), sturdyc.WithMissingRecordStorage(), ) @@ -266,13 +269,14 @@ func TestBatchGetOrFetchNilMapMissingRecords(t *testing.T) { numShards := 1 ttl := time.Minute evictionPercentage := 50 - minRefreshDelay := time.Minute - maxRefreshDelay := time.Minute * 2 + minRefreshDelay := time.Second + maxRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 retryInterval := time.Second clock := sturdyc.NewTestClock(time.Now()) c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), ) @@ -316,11 +320,12 @@ func TestGetOrFetchBatchRetries(t *testing.T) { evictionPercentage := 10 minRefreshDelay := time.Hour maxRefreshDelay := time.Hour * 2 + synchronousRefreshDelay := time.Hour * 4 retryInterval := time.Second clock := sturdyc.NewTestClock(time.Now()) c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, retryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), ) @@ -428,10 +433,11 @@ func TestGetOrFetchBatchStampedeProtection(t *testing.T) { clock := sturdyc.NewTestClock(time.Now()) minRefreshDelay := time.Millisecond * 500 
maxRefreshDelay := time.Millisecond * 1000 + synchronousRefreshDelay := time.Millisecond * 1500 refreshRetryInterval := time.Millisecond * 10 c := sturdyc.New[string](capacity, shards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), sturdyc.WithMetrics(newTestMetricsRecorder(shards)), @@ -502,11 +508,12 @@ func TestGetOrFetchDeletesRecordsThatHaveBeenRemovedAtTheSource(t *testing.T) { clock := sturdyc.NewTestClock(time.Now()) minRefreshDelay := time.Millisecond * 500 maxRefreshDelay := time.Second + synchronousRefreshDelay := time.Minute refreshRetryInterval := time.Millisecond * 10 c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithClock(clock), ) @@ -548,11 +555,12 @@ func TestGetOrFetchConvertsDeletedRecordsToMissingRecords(t *testing.T) { clock := sturdyc.NewTestClock(time.Now()) minRefreshDelay := time.Millisecond * 500 maxRefreshDelay := time.Second + synchronousRefreshDelay := time.Minute refreshRetryInterval := time.Millisecond * 10 c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), ) @@ -601,11 +609,12 @@ func TestGetOrFetchBatchDeletesRecordsThatHaveBeenRemovedAtTheSource(t *testing. 
clock := sturdyc.NewTestClock(time.Now()) minRefreshDelay := time.Millisecond * 500 maxRefreshDelay := time.Second + synchronousRefreshDelay := time.Minute refreshRetryInterval := time.Millisecond * 10 c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithClock(clock), ) @@ -648,11 +657,12 @@ func TestGetFetchBatchConvertsDeletedRecordsToMissingRecords(t *testing.T) { clock := sturdyc.NewTestClock(time.Now()) minRefreshDelay := time.Millisecond * 500 maxRefreshDelay := time.Second + synchronousRefreshDelay := time.Minute refreshRetryInterval := time.Millisecond * 10 c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, sturdyc.WithNoContinuousEvictions(), - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, refreshRetryInterval), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), sturdyc.WithMissingRecordStorage(), sturdyc.WithClock(clock), ) diff --git a/metrics.go b/metrics.go index 55ea6c5..10dbb14 100644 --- a/metrics.go +++ b/metrics.go @@ -6,7 +6,9 @@ type MetricsRecorder interface { // CacheMiss is called for every key that results in a cache miss. CacheMiss() // Refresh is called when a get operation results in a refresh. - Refresh() + BackgroundRefresh() + // SynchronousRefresh is called when a get operation results in a synchronous refresh. + SynchronousRefresh() // MissingRecord is called every time the cache is asked to // look up a key which has been marked as missing. MissingRecord() @@ -31,7 +33,7 @@ type DistributedMetricsRecorder interface { DistributedCacheMiss() // DistributedRefresh is called when we retrieve a record from // the distributed storage that should be refreshed. 
- DistributedRefresh() + DistributedRefresh() // TODO: Should this be renamed to DistributedBackgroundRefresh? And should we add DistributedSynchronousRefresh? // DistributedMissingRecord is called when we retrieve a record from the // distributed storage that has been marked as a missing record. DistributedMissingRecord() @@ -71,7 +73,7 @@ func (s *shard[T]) reportEntriesEvicted(n int) { } // reportCacheHits is used to report cache hits and misses to the metrics recorder. -func (c *Client[T]) reportCacheHits(cacheHit, missingRecord, refresh bool) { +func (c *Client[T]) reportCacheHits(cacheHit, missingRecord, backgroundRefresh, synchronousRefresh bool) { if c.metricsRecorder == nil { return } @@ -80,8 +82,12 @@ func (c *Client[T]) reportCacheHits(cacheHit, missingRecord, refresh bool) { c.metricsRecorder.MissingRecord() } - if refresh { - c.metricsRecorder.Refresh() + if backgroundRefresh { + c.metricsRecorder.BackgroundRefresh() + } + + if synchronousRefresh { + c.metricsRecorder.SynchronousRefresh() } if !cacheHit { diff --git a/options.go b/options.go index 7322f02..9ddbb48 100644 --- a/options.go +++ b/options.go @@ -56,11 +56,12 @@ func WithMissingRecordStorage() Option { // gets scheduled when the key is requested again after a random time between // minRefreshTime and maxRefreshTime. This is an important distinction because // it means that the cache won't just naively refresh every key it's ever seen. 
-func WithEarlyRefreshes(minRefreshTime, maxRefreshTime, retryBaseDelay time.Duration) Option { +func WithEarlyRefreshes(minRefreshTime, maxRefreshTime, synchronousRefreshTime, retryBaseDelay time.Duration) Option { return func(c *Config) { - c.refreshInBackground = true + c.earlyRefreshes = true c.minRefreshTime = minRefreshTime c.maxRefreshTime = maxRefreshTime + c.synchronousRefreshTime = synchronousRefreshTime c.retryBaseDelay = retryBaseDelay } } @@ -161,7 +162,7 @@ func validateConfig(capacity, numShards int, ttl time.Duration, evictionPercenta panic("evictionPercentage must be between 0 and 100") } - if !cfg.refreshInBackground && cfg.bufferRefreshes { + if !cfg.earlyRefreshes && cfg.bufferRefreshes { panic("refresh buffering requires background refreshes to be enabled") } @@ -181,6 +182,10 @@ func validateConfig(capacity, numShards int, ttl time.Duration, evictionPercenta panic("minRefreshTime must be less than or equal to maxRefreshTime") } + if cfg.maxRefreshTime > cfg.synchronousRefreshTime { + panic("maxRefreshTime must be less than or equal to synchronousRefreshTime") + } + if cfg.retryBaseDelay < 0 { panic("retryBaseDelay must be greater than or equal to 0") } diff --git a/options_test.go b/options_test.go index 9a83aed..dd99bc8 100644 --- a/options_test.go +++ b/options_test.go @@ -91,7 +91,7 @@ func TestPanicsIfTheRefreshBufferSizeIsLessThanOne(t *testing.T) { } }() sturdyc.New[string](100, 10, time.Minute, 5, - sturdyc.WithEarlyRefreshes(time.Minute, time.Hour, time.Second), + sturdyc.WithEarlyRefreshes(time.Minute, time.Hour, time.Hour*2, time.Second), sturdyc.WithRefreshCoalescing(0, time.Minute), ) } @@ -106,7 +106,7 @@ func TestPanicsIfTheRefreshBufferTimeoutIsLessThanOne(t *testing.T) { } }() sturdyc.New[string](100, 10, time.Minute, 5, - sturdyc.WithEarlyRefreshes(time.Minute, time.Hour, time.Second), + sturdyc.WithEarlyRefreshes(time.Minute, time.Hour, time.Hour*2, time.Second), sturdyc.WithRefreshCoalescing(10, 0), ) } @@ -135,7 +135,21
@@ func TestPanicsIfTheMinRefreshTimeIsGreaterThanTheMaxRefreshTime(t *testing.T) { } }() sturdyc.New[string](100, 10, time.Minute, 5, - sturdyc.WithEarlyRefreshes(time.Hour, time.Minute, time.Second), + sturdyc.WithEarlyRefreshes(time.Hour, time.Minute, time.Hour*2, time.Second), + ) +} + +func TestPanicsIfTheBackgroundRefreshTimeIsGreaterThanTheSynchronousRefreshTime(t *testing.T) { + t.Parallel() + + defer func() { + err := recover() + if err == nil { + t.Error("expected a panic when trying to use a greater background refresh time than the synchronous refresh time") + } + }() + sturdyc.New[string](100, 10, time.Minute, 5, + sturdyc.WithEarlyRefreshes(time.Minute, time.Hour*2, time.Hour*1, time.Second), ) } @@ -149,6 +163,6 @@ func TestPanicsIfTheRetryBaseDelayIsLessThanZero(t *testing.T) { } }() sturdyc.New[string](100, 10, time.Minute, 5, - sturdyc.WithEarlyRefreshes(time.Minute, time.Hour, -1), + sturdyc.WithEarlyRefreshes(time.Minute, time.Hour, time.Hour*2, -1), ) } diff --git a/passthrough_test.go b/passthrough_test.go index 20269d7..ea397a1 100644 --- a/passthrough_test.go +++ b/passthrough_test.go @@ -6,8 +6,8 @@ import ( "testing" "time" - "github.com/viccon/sturdyc" "github.com/google/go-cmp/cmp" + "github.com/viccon/sturdyc" ) func TestPassthrough(t *testing.T) { diff --git a/refresh.go b/refresh.go index a4e3eba..7162de9 100644 --- a/refresh.go +++ b/refresh.go @@ -28,7 +28,7 @@ func (c *Client[T]) refreshBatch(ids []string, keyFn KeyFn, fetchFn BatchFetchFn // Check if any of the records have been deleted at the data source. for _, id := range ids { - _, okCache, _, _ := c.getWithState(keyFn(id)) + _, okCache, _, _, _ := c.getWithState(keyFn(id)) _, okResponse := response[id] if okResponse { diff --git a/shard.go b/shard.go index c39c6a5..e9f50c7 100644 --- a/shard.go +++ b/shard.go @@ -8,12 +8,13 @@ import ( // entry represents a single cache entry. 
type entry[T any] struct { - key string - value T - expiresAt time.Time - refreshAt time.Time - numOfRefreshRetries int - isMissingRecord bool + key string + value T + expiresAt time.Time + backgroundRefreshAt time.Time + synchronousRefreshAt time.Time + numOfRefreshRetries int + isMissingRecord bool } // shard is a thread-safe data structure that holds a subset of the cache entries. @@ -79,7 +80,7 @@ func (s *shard[T]) forceEvict() { s.reportEntriesEvicted(entriesEvicted) } -// get retrieves attempts to retrieve a value from the shard. +// get attempts to retrieve a value from the shard. // // Parameters: // @@ -91,20 +92,25 @@ func (s *shard[T]) forceEvict() { // exists: A boolean indicating if the value exists in the shard. // markedAsMissing: A boolean indicating if the key has been marked as a missing record. // refresh: A boolean indicating if the value should be refreshed in the background. -func (s *shard[T]) get(key string) (val T, exists, markedAsMissing, refresh bool) { +func (s *shard[T]) get(key string) (val T, exists, markedAsMissing, backgroundRefresh, synchronousRefresh bool) { s.RLock() item, ok := s.entries[key] if !ok { s.RUnlock() - return val, false, false, false + return val, false, false, false, false } if s.clock.Now().After(item.expiresAt) { s.RUnlock() - return val, false, false, false + return val, false, false, false, false } - shouldRefresh := s.refreshInBackground && s.clock.Now().After(item.refreshAt) + // Check if the record should be synchronously refreshed. + if s.earlyRefreshes && s.clock.Now().After(item.synchronousRefreshAt) { + return item.value, true, item.isMissingRecord, false, true + } + + shouldRefresh := s.earlyRefreshes && s.clock.Now().After(item.backgroundRefreshAt) if shouldRefresh { // Release the read lock, and switch to a write lock. 
s.RUnlock() @@ -113,22 +119,22 @@ func (s *shard[T]) get(key string) (val T, exists, markedAsMissing, refresh bool // However, during the time it takes to switch locks, another goroutine // might have acquired it and moved the refreshAt. Therefore, we'll have to // check if this operation should still be performed. - if !s.clock.Now().After(item.refreshAt) { + if !s.clock.Now().After(item.backgroundRefreshAt) { s.Unlock() - return item.value, true, item.isMissingRecord, false + return item.value, true, item.isMissingRecord, false, false } // Update the "refreshAt" so no other goroutines attempts to refresh the same entry. nextRefresh := s.retryBaseDelay * (1 << item.numOfRefreshRetries) - item.refreshAt = s.clock.Now().Add(nextRefresh) + item.backgroundRefreshAt = s.clock.Now().Add(nextRefresh) item.numOfRefreshRetries++ s.Unlock() - return item.value, true, item.isMissingRecord, shouldRefresh + return item.value, true, item.isMissingRecord, shouldRefresh, false } s.RUnlock() - return item.value, true, item.isMissingRecord, false + return item.value, true, item.isMissingRecord, false, false } // set writes a key-value pair to the shard and returns a @@ -158,14 +164,15 @@ func (s *shard[T]) set(key string, value T, isMissingRecord bool) bool { isMissingRecord: isMissingRecord, } - if s.refreshInBackground { + if s.earlyRefreshes { // If there is a difference between the min- and maxRefreshTime we'll use that to // set a random padding so that the refreshes get spread out evenly over time. 
var padding time.Duration if s.minRefreshTime != s.maxRefreshTime { padding = time.Duration(rand.Int64N(int64(s.maxRefreshTime - s.minRefreshTime))) } - newEntry.refreshAt = now.Add(s.minRefreshTime + padding) + newEntry.backgroundRefreshAt = now.Add(s.minRefreshTime + padding) + newEntry.synchronousRefreshAt = now.Add(s.synchronousRefreshTime) newEntry.numOfRefreshRetries = 0 } diff --git a/sturdyc_test.go b/sturdyc_test.go index 781f323..f5b4103 100644 --- a/sturdyc_test.go +++ b/sturdyc_test.go @@ -24,15 +24,16 @@ func randKey(n int) string { type TestMetricsRecorder struct { sync.Mutex - cacheHits int - cacheMisses int - refreshes int - missingRecords int - evictions int - forcedEvictions int - evictedEntries int - shards map[int]int - batchSizes []int + cacheHits int + cacheMisses int + backgroundRefreshes int + synchronousRefreshes int + missingRecords int + evictions int + forcedEvictions int + evictedEntries int + shards map[int]int + batchSizes []int } func newTestMetricsRecorder(numShards int) *TestMetricsRecorder { @@ -54,10 +55,16 @@ func (r *TestMetricsRecorder) CacheMiss() { r.cacheMisses++ } -func (r *TestMetricsRecorder) Refresh() { +func (r *TestMetricsRecorder) BackgroundRefresh() { r.Lock() defer r.Unlock() - r.refreshes++ + r.backgroundRefreshes++ +} + +func (r *TestMetricsRecorder) SynchronousRefresh() { + r.Lock() + defer r.Unlock() + r.synchronousRefreshes++ } func (r *TestMetricsRecorder) MissingRecord() { From 4fd8749ca238648cb854915ca688744e4118b822 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 1 Jan 2025 14:09:43 +0100 Subject: [PATCH 02/32] WIP --- metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics.go b/metrics.go index 10dbb14..b9d9449 100644 --- a/metrics.go +++ b/metrics.go @@ -33,7 +33,7 @@ type DistributedMetricsRecorder interface { DistributedCacheMiss() // DistributedRefresh is called when we retrieve a record from // the distributed storage that should be refreshed. 
- DistributedRefresh() // TODO: Should this be renamed to DistributedBackgroundRefresh? And should we add DistributedSynchronousRefresh? + DistributedRefresh() // DistributedMissingRecord is called when we retrieve a record from the // distributed storage that has been marked as a missing record. DistributedMissingRecord() From 83cba1f74bbbe25f052227261304b7380475849d Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 11:02:33 +0100 Subject: [PATCH 03/32] WIP --- errors.go | 12 +- fetch.go | 17 +- fetch_test.go | 656 ++++++++++++++++++++++++++++++++++++++++++++++++++ inflight.go | 2 +- safe.go | 5 - shard.go | 1 + 6 files changed, 677 insertions(+), 16 deletions(-) diff --git a/errors.go b/errors.go index 160c610..e0eebf9 100644 --- a/errors.go +++ b/errors.go @@ -20,10 +20,14 @@ var ( // ErrMissingRecord is returned by client.GetOrFetch and client.Passthrough when a record has been marked // as missing. The cache will still try to refresh the record in the background if it's being requested. ErrMissingRecord = errors.New("sturdyc: the record has been marked as missing in the cache") - // ErrOnlyCachedRecords is returned by client.GetOrFetchBatch and client.PassthroughBatch - // when some of the requested records are available in the cache, but the attempt to - // fetch the remaining records failed. As the consumer, you can then decide whether to - // proceed with the cached records or if the entire batch is necessary. + // ErrOnlyCachedRecords is returned by client.GetOrFetchBatch and + // client.PassthroughBatch when some of the requested records are available + // in the cache, but the attempt to fetch the remaining records failed. It + // may also be returned when you're using the WithEarlyRefreshes + // functionality, and the call to synchronously refresh a record failed. 
The + // cache will then give you the latest data it has cached, and you as the + // consumer can then decide whether to proceed with the cached records or if + // the newest data is necessary. ErrOnlyCachedRecords = errors.New("sturdyc: failed to fetch the records that were not in the cache") // ErrInvalidType is returned when you try to use one of the generic // package level functions but the type assertion fails. diff --git a/fetch.go b/fetch.go index f9b9764..b70412d 100644 --- a/fetch.go +++ b/fetch.go @@ -48,19 +48,22 @@ func getFetch[V, T any](ctx context.Context, c *Client[T], key string, fetchFn F res, err := callAndCache(ctx, c, key, wrappedFetch) // Check if the record has been deleted at the source. If it has, we'll // delete it from the cache too. NOTE: The callAndCache function converts - // ErrNotFound to ErrMissingRecord. - if ok && !markedAsMissing && errors.Is(err, ErrMissingRecord) { + // ErrNotFound to ErrMissingRecord if missing record storage is enabled. + if ok && errors.Is(err, ErrNotFound) { c.Delete(key) } - if errors.Is(err, ErrMissingRecord) { + if errors.Is(err, ErrMissingRecord) || errors.Is(err, ErrNotFound) { return res, err } // If the call to synchrounously refresh the record failed, - // we'll return the latest value if we have it in the cache. + // we'll return the latest value if we have it in the cache + // along with a ErrOnlyCachedRecords error. The consumer can + // then decide whether to proceed with the cached data or to + // propagate the error. 
if err != nil && ok { - return value, nil + return value, ErrOnlyCachedRecords } return res, err @@ -163,8 +166,10 @@ func getFetchBatch[V, T any](ctx context.Context, c *Client[T], ids []string, ke _, okResponse := response[id] _, okCache := cachedRecords[id] if okCache && !okResponse { + if !c.storeMissingRecords { + c.Delete(keyFn(id)) + } delete(cachedRecords, id) - c.Delete(keyFn(id)) } } } diff --git a/fetch_test.go b/fetch_test.go index 9c60126..f1db441 100644 --- a/fetch_test.go +++ b/fetch_test.go @@ -702,3 +702,659 @@ func TestGetFetchBatchConvertsDeletedRecordsToMissingRecords(t *testing.T) { t.Errorf("expected key3 to not be returned by Get") } } + +func TestGetFetchSynchronousRefreshes(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithClock(clock), + ) + + id := "1" + fetchObserver := NewFetchObserver(1) + fetchObserver.Response(id) + + res, err := sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + + // Now, let's make the fetchObserver return a new value, and only move the + // clock enough to warrant a background refresh. The value we get should + // still be the same as the previous one because the refresh happens in the + // background. 
+ fetchObserver.Clear() + fetchObserver.Response("2") + clock.Add(maxBackgroundRefreshDelay + 1) + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + + // Now we can wait for the background refresh to complete, and then assert + // that the next time we ask for this ID we'll get the new value. + time.Sleep(time.Millisecond * 100) + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value2" { + t.Errorf("expected value2, got %v", res) + } + + // Let's do this again, but this time we'll move the clock passed the synchronous refresh delay. + // This should result in a synchronous refresh and we should get the new value right away. + fetchObserver.Clear() + fetchObserver.Response("3") + clock.Add(synchronousRefreshDelay + 1) + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value3" { + t.Errorf("expected value3, got %v", res) + } + fetchObserver.AssertFetchCount(t, 3) +} + +func TestGetFetchBatchSynchronousRefreshes(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithMissingRecordStorage(), + 
sturdyc.WithClock(clock), + ) + + firstBatchOfIDs := []string{"1", "2", "3"} + fetchObserver := NewFetchObserver(2) + fetchObserver.BatchResponse(firstBatchOfIDs) + + _, err := sturdyc.GetOrFetchBatch(ctx, c, firstBatchOfIDs, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + fetchObserver.AssertRequestedRecords(t, firstBatchOfIDs) + fetchObserver.AssertFetchCount(t, 1) + + // Now, let's move the clock 5 seconds and then request another batch of IDs. + clock.Add(time.Second * 5) + secondBatchOfIDs := []string{"4", "5", "6"} + fetchObserver.BatchResponse(secondBatchOfIDs) + _, err = sturdyc.GetOrFetchBatch(ctx, c, secondBatchOfIDs, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + + fetchObserver.AssertRequestedRecords(t, secondBatchOfIDs) + fetchObserver.AssertFetchCount(t, 2) + + // At this point, we should have IDs 1-3 in the cache that are 5 seconds + // old, and IDs 4-6 that are completely new. If we now move the clock another + // 5 seconds, we should reach the point where IDs 1-3 are due for a + // synchronous refresh, and IDs 4-6 are due for a background refresh. + clock.Add((time.Second * 5) + 1) + + fullBatchOfIDs := []string{"1", "2", "3", "4", "5", "6"} + fetchObserver.BatchResponse(firstBatchOfIDs) + _, err = sturdyc.GetOrFetchBatch(ctx, c, fullBatchOfIDs, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + // We'll assert that two refreshes happened. One synchronous refresh and one background refresh. 
+ <-fetchObserver.FetchCompleted + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + fetchObserver.AssertFetchCount(t, 4) +} + +func TestGetFetchSynchronousRefreshConvertsToMissingRecord(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithClock(clock), + ) + + id := "1" + fetchObserver := NewFetchObserver(1) + fetchObserver.Response(id) + + res, err := sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + + // Here, we'll set up the next request to return a not found error. Given + // that we have missing record storage enabled, we'll expect that the + // synchronous refresh returns a sturdyc.MissingRecord error. + fetchObserver.Clear() + fetchObserver.Err(sturdyc.ErrNotFound) + clock.Add(synchronousRefreshDelay + 1) + + _, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + if !errors.Is(err, sturdyc.ErrMissingRecord) { + t.Fatalf("expected ErrMissingRecord, got %v", err) + } + <-fetchObserver.FetchCompleted + fetchObserver.AssertFetchCount(t, 2) + if c.Size() != 1 { + t.Errorf("expected cache size to be 1, got %d", c.Size()) + } + + // Let's also make sure that the record can reappear again. 
+ fetchObserver.Clear() + fetchObserver.Response("2") + clock.Add(synchronousRefreshDelay + 1) + + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value2" { + t.Errorf("expected value2, got %v", res) + } + fetchObserver.AssertFetchCount(t, 3) +} + +func TestGetFetchBatchSynchronousRefreshConvertsToMissingRecord(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithClock(clock), + ) + + ids := []string{"1", "2", "3", "4", "5", "6"} + fetchObserver := NewFetchObserver(1) + fetchObserver.BatchResponse(ids) + + res, err := sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 6 { + t.Fatalf("expected 6 records, got %d", len(res)) + } + if c.Size() != 6 { + t.Errorf("expected cache size to be 6, got %d", c.Size()) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 1) + + // Now, let's move the clock passed the synchronous refresh delay, + // and make the refresh only return values for IDs 1-3. 
+ clock.Add(synchronousRefreshDelay + 1) + fetchObserver.Clear() + fetchObserver.BatchResponse([]string{"1", "2", "3"}) + + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 3 { + t.Fatalf("expected 3 records, got %d", len(res)) + } + if c.Size() != 6 { + t.Errorf("expected cache size to be 6, got %d", c.Size()) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 2) + + // Next, let's assert that the records were successfully stored as missing. + clock.Add(minBackgroundRefreshDelay - 1) + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 3 { + t.Fatalf("expected 3 records, got %d", len(res)) + } + if c.Size() != 6 { + t.Errorf("expected cache size to be 6, got %d", c.Size()) + } + + // And finally, let's make sure that the records can reappear again. 
+ clock.Add(synchronousRefreshDelay) + fetchObserver.Clear() + fetchObserver.BatchResponse(ids) + + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 6 { + t.Fatalf("expected 6 records, got %d", len(res)) + } + if c.Size() != 6 { + t.Errorf("expected cache size to be 6, got %d", c.Size()) + } + fetchObserver.AssertRequestedRecords(t, ids) +} + +func TestGetFetchSynchronousRefreshDeletion(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithClock(clock), + ) + + id := "1" + fetchObserver := NewFetchObserver(1) + fetchObserver.Response(id) + + res, err := sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + + // Here, we'll set up the next request to return a not found error. Given + // that we have missing record storage enabled, we'll expect that the + // synchronous refresh returns a sturdyc.MissingRecord error. 
+ fetchObserver.Clear() + fetchObserver.Err(sturdyc.ErrNotFound) + clock.Add(synchronousRefreshDelay + 1) + + _, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + if !errors.Is(err, sturdyc.ErrNotFound) { + t.Fatalf("expected ErrNotFound, got %v", err) + } + <-fetchObserver.FetchCompleted + fetchObserver.AssertFetchCount(t, 2) + if c.Size() != 0 { + t.Errorf("expected cache size to be 0, got %d", c.Size()) + } + + // Let's also make sure that the record can reappear again. + clock.Add(synchronousRefreshDelay) + fetchObserver.Clear() + fetchObserver.Response(id) + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + if c.Size() != 1 { + t.Errorf("expected cache size to be 1, got %d", c.Size()) + } + fetchObserver.AssertFetchCount(t, 3) +} + +func TestGetFetchBatchSynchronousRefreshDeletion(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithClock(clock), + ) + + ids := []string{"1", "2", "3", "4", "5", "6"} + fetchObserver := NewFetchObserver(1) + fetchObserver.BatchResponse(ids) + + res, err := sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 6 { + t.Fatalf("expected 6 
records, got %d", len(res)) + } + if c.Size() != 6 { + t.Errorf("expected cache size to be 6, got %d", c.Size()) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 1) + + // Now, let's move the clock passed the synchronous refresh delay, + // and make the refresh only return values for IDs 1-3. + clock.Add(synchronousRefreshDelay + 1) + fetchObserver.Clear() + fetchObserver.BatchResponse([]string{"1", "2", "3"}) + + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 3 { + t.Fatalf("expected 3 records, got %d", len(res)) + } + // IDs 4-6 should not have been deleted. + if c.Size() != 3 { + t.Errorf("expected cache size to be 3, got %d", c.Size()) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 2) + + // Next, let's assert that the records doesn't reappear the next time we ask for the same IDs. + fetchObserver.Clear() + fetchObserver.BatchResponse([]string{}) + clock.Add(minBackgroundRefreshDelay - 1) + + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 3 { + t.Fatalf("expected 3 records, got %d", len(res)) + } + if c.Size() != 3 { + t.Errorf("expected cache size to be 3, got %d", c.Size()) + } + // IDs 4-6 should have been deleted, hence we should get another outgoing request. + fetchObserver.AssertFetchCount(t, 3) + fetchObserver.AssertRequestedRecords(t, []string{"4", "5", "6"}) + + // Finally, let's make sure that the records can reappear again. 
+ clock.Add(synchronousRefreshDelay) + fetchObserver.Clear() + fetchObserver.BatchResponse(ids) + + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 6 { + t.Fatalf("expected 6 records, got %d", len(res)) + } + if c.Size() != 6 { + t.Errorf("expected cache size to be 6, got %d", c.Size()) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 4) +} + +func TestGetFetchSynchronousRefreshFailureGivesLatestValue(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithClock(clock), + ) + + id := "1" + fetchObserver := NewFetchObserver(1) + fetchObserver.Response(id) + + res, err := sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + + // Here, we'll set up the next request to return an error. Given that + // we still have this key cached, we expect the cache to give us an + // ErrOnlyCachedRecords error along with the cached value. 
+ fetchObserver.Clear() + fetchObserver.Err(errors.New("error")) + clock.Add(synchronousRefreshDelay + 1) + + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if !errors.Is(err, sturdyc.ErrOnlyCachedRecords) { + t.Fatalf("expected ErrOnlyCachedRecords, got %v", err) + } + if res != "value1" { + t.Errorf("expected value1, got %v", res) + } + fetchObserver.AssertFetchCount(t, 2) + + // Now, requesting the same ID again should result in another synchronous + // refresh without us having to move the clock. Let's set this one up to return + // an actual value. + fetchObserver.Clear() + fetchObserver.Response("2") + + res, err = sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if res != "value2" { + t.Errorf("expected value2, got %v", res) + } + fetchObserver.AssertFetchCount(t, 3) +} + +func TestGetFetchBatchSynchronousRefreshFailureGivesLatestValue(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 1000 + numShards := 10 + ttl := time.Hour + evictionPercentage := 10 + minBackgroundRefreshDelay := time.Second + maxBackgroundRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 10 + retryInterval := time.Millisecond * 10 + clock := sturdyc.NewTestClock(time.Now()) + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minBackgroundRefreshDelay, maxBackgroundRefreshDelay, synchronousRefreshDelay, retryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithClock(clock), + ) + + ids := []string{"1", "2", "3", "4", "5"} + fetchObserver := NewFetchObserver(1) + fetchObserver.BatchResponse(ids) + + res, err := sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", 
err) + } + if len(res) != 5 { + t.Fatalf("expected 5 records, got %d", len(res)) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 1) + + // Now, let's move the clock passed the synchronous refresh + // delay, and make the next call return an error. + clock.Add(synchronousRefreshDelay + 1) + fetchObserver.Clear() + fetchObserver.Err(errors.New("error")) + + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if !errors.Is(err, sturdyc.ErrOnlyCachedRecords) { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 5 { + t.Fatalf("expected 5 records, got %d", len(res)) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 2) + + // If a synchronous refresh fails, we won't do any exponential backoff. + clock.Add(time.Millisecond) + res, err = sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if !errors.Is(err, sturdyc.ErrOnlyCachedRecords) { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 5 { + t.Fatalf("expected 5 records, got %d", len(res)) + } + fetchObserver.AssertRequestedRecords(t, ids) + fetchObserver.AssertFetchCount(t, 3) +} + +func TestGetFetchSynchronousRefreshStampedeProtection(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 10 + numShards := 2 + ttl := time.Second * 2 + evictionPercentage := 10 + clock := sturdyc.NewTestClock(time.Now()) + minRefreshDelay := time.Millisecond * 500 + maxRefreshDelay := time.Millisecond * 500 + synchronousRefreshDelay := time.Second + refreshRetryInterval := time.Millisecond * 10 + + // The cache is going to have a 2 second TTL, and the first refresh should happen within a second. 
+ c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithClock(clock), + ) + + id := "1" + fetchObserver := NewFetchObserver(1000) + fetchObserver.Response(id) + + // We will start the test by trying to get key1, which wont exist in the sturdyc. Hence, + // the fetch function is going to get called and we'll set the initial value to val1. + sturdyc.GetOrFetch[string](ctx, c, id, fetchObserver.Fetch) + + <-fetchObserver.FetchCompleted + fetchObserver.AssertFetchCount(t, 1) + + // Now, we're going to go past the synchronous refresh delay and try to retrieve the key from 1000 goroutines at once. + numGoroutines := 1000 + clock.Add(synchronousRefreshDelay + 1) + var wg sync.WaitGroup + wg.Add(numGoroutines) + for i := 0; i < numGoroutines; i++ { + go func() { + defer wg.Done() + _, err := sturdyc.GetOrFetch(ctx, c, id, fetchObserver.Fetch) + if err != nil { + panic(err) + } + }() + } + wg.Wait() + + <-fetchObserver.FetchCompleted + fetchObserver.AssertFetchCount(t, 2) +} + +func TestGetFetchBatchMixOfSynchronousAndAsynchronousRefreshes(t *testing.T) {} diff --git a/inflight.go b/inflight.go index 5368aa4..f4f2464 100644 --- a/inflight.go +++ b/inflight.go @@ -33,7 +33,7 @@ func makeCall[T, V any](ctx context.Context, c *Client[T], key string, fn FetchF }() response, err := fn(ctx) - if err != nil && c.storeMissingRecords && errors.Is(err, ErrNotFound) { + if c.storeMissingRecords && errors.Is(err, ErrNotFound) { c.StoreMissingRecord(key) call.err = ErrMissingRecord return diff --git a/safe.go b/safe.go index f5d86a4..ed33c30 100644 --- a/safe.go +++ b/safe.go @@ -36,11 +36,6 @@ func wrap[T, V any](fetchFn FetchFn[V]) FetchFn[T] { } func unwrap[V, T any](val T, err error) (V, error) { - if err != nil { - var zero V - return zero, err - } - v, ok := 
any(val).(V) if !ok { return v, ErrInvalidType diff --git a/shard.go b/shard.go index e9f50c7..bb512b7 100644 --- a/shard.go +++ b/shard.go @@ -107,6 +107,7 @@ func (s *shard[T]) get(key string) (val T, exists, markedAsMissing, backgroundRe // Check if the record should be synchronously refreshed. if s.earlyRefreshes && s.clock.Now().After(item.synchronousRefreshAt) { + s.RUnlock() return item.value, true, item.isMissingRecord, false, true } From 53743532e1fedfd9de23690eb21634f35def95aa Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 12:50:27 +0100 Subject: [PATCH 04/32] WIP --- fetch_test.go | 212 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 211 insertions(+), 1 deletion(-) diff --git a/fetch_test.go b/fetch_test.go index f1db441..18ca076 100644 --- a/fetch_test.go +++ b/fetch_test.go @@ -3,7 +3,10 @@ package sturdyc_test import ( "context" "errors" + "math/rand/v2" + "strconv" "sync" + "sync/atomic" "testing" "time" @@ -1357,4 +1360,211 @@ func TestGetFetchSynchronousRefreshStampedeProtection(t *testing.T) { fetchObserver.AssertFetchCount(t, 2) } -func TestGetFetchBatchMixOfSynchronousAndAsynchronousRefreshes(t *testing.T) {} +func TestGetFetchBatchSynchronousRefreshStampedeProtection(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 10 + numShards := 2 + ttl := time.Second * 2 + evictionPercentage := 10 + clock := sturdyc.NewTestClock(time.Now()) + minRefreshDelay := time.Millisecond * 500 + maxRefreshDelay := time.Millisecond * 500 + synchronousRefreshDelay := time.Second + refreshRetryInterval := time.Millisecond * 10 + + // The cache is going to have a 2 second TTL, and the first refresh should happen within a second. 
+ c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage,
+ sturdyc.WithNoContinuousEvictions(),
+ sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval),
+ sturdyc.WithMissingRecordStorage(),
+ sturdyc.WithClock(clock),
+ )
+
+ ids := []string{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10"}
+ fetchObserver := NewFetchObserver(1)
+ fetchObserver.BatchResponse(ids)
+
+ res, err := sturdyc.GetOrFetchBatch(ctx, c, ids, c.BatchKeyFn("item"), fetchObserver.FetchBatch)
+ <-fetchObserver.FetchCompleted
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ if len(res) != 10 {
+ t.Fatalf("expected 10 records, got %d", len(res))
+ }
+ fetchObserver.AssertRequestedRecords(t, ids)
+ fetchObserver.AssertFetchCount(t, 1)
+ fetchObserver.Clear()
+
+ // Now, we're going to go past the synchronous refresh delay and try to
+ // retrieve 3 random keys (without duplicates) from 1000 goroutines at once.
+ // In an ideal world, this should lead to 4 outgoing requests, e.g:
+ // 1, 2, 3
+ // 4, 5, 6
+ // 7, 8, 9
+ // However, we're not delaying these requests, hence we could get a
+ // maximum of 10 outgoing requests if the batches were to get spread out
+ // something like this:
+ // 1, 2, 3
+ // 1, 2, 4
+ // 1, 2, 5
+ // 1, 2, 6
+ // 1, 2, 7
+ // 1, 2, ...
+ clock.Add(synchronousRefreshDelay + 1)
+ numGoroutines := 1000
+ var wg sync.WaitGroup
+ wg.Add(numGoroutines)
+
+ // We need to create another fetch mock because the fetchObserver resolves
+ // the response immediately. However, we want to delay the response in order to
+ // check that the deduplication works as expected. If we don't delay the
+ // function responding, we'll have other goroutines with synchronous refresh
+ // ids that are going to send off another request right after.
+ signal := make(chan struct{}) + var callCount atomic.Int32 + fetchMock := func(_ context.Context, ids []string) (map[string]string, error) { + <-signal + callCount.Add(1) + responseMap := make(map[string]string, len(ids)) + for _, id := range ids { + responseMap[id] = "value" + id + } + return responseMap, nil + } + + for i := 0; i < numGoroutines; i++ { + go func() { + defer wg.Done() + uniqueIDs := make(map[string]struct{}) + for len(uniqueIDs) < 3 { + id := ids[rand.IntN(len(ids))] + if _, ok := uniqueIDs[id]; ok { + continue + } + uniqueIDs[id] = struct{}{} + } + idsToFetch := make([]string, 0, 3) + for id := range uniqueIDs { + idsToFetch = append(idsToFetch, id) + } + res, err := sturdyc.GetOrFetchBatch(ctx, c, idsToFetch, c.BatchKeyFn("item"), fetchMock) + if err != nil { + panic(err) + } + if len(res) != 3 { + panic("expected 3 records, got " + strconv.Itoa(len(res))) + } + for _, id := range idsToFetch { + if _, ok := res[id]; !ok { + panic("expected id " + id + " to be in the response") + } + } + }() + } + // Allow all of the goroutines to start and get some CPU time. + time.Sleep(time.Millisecond * 500) + // Now, we'll close the channel which should give all of the goroutines their response. + close(signal) + // Wait for the wait group so that we can run the assertions within the goroutines. 
+ wg.Wait() + if callCount.Load() > 10 { + t.Errorf("expected no more than 10 calls, got %d", callCount.Load()) + } +} + +func TestGetFetchBatchMixOfSynchronousAndAsynchronousRefreshes(t *testing.T) { + t.Parallel() + + ctx := context.Background() + capacity := 10 + numShards := 2 + ttl := time.Second * 2 + evictionPercentage := 10 + clock := sturdyc.NewTestClock(time.Now()) + minRefreshDelay := time.Millisecond * 500 + maxRefreshDelay := time.Millisecond * 500 + synchronousRefreshDelay := time.Second + refreshRetryInterval := time.Millisecond * 10 + batchSize := 20 + batchBufferTimeout := time.Millisecond * 50 + + c := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions(), + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, refreshRetryInterval), + sturdyc.WithMissingRecordStorage(), + sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), + sturdyc.WithClock(clock), + ) + + // We'll start by fetching one batch of IDs, and make some assertions. + firstBatchOfIDs := []string{"1", "2", "3"} + fetchObserver := NewFetchObserver(2) + fetchObserver.BatchResponse(firstBatchOfIDs) + res, err := sturdyc.GetOrFetchBatch(ctx, c, firstBatchOfIDs, c.BatchKeyFn("item"), fetchObserver.FetchBatch) + <-fetchObserver.FetchCompleted + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + if len(res) != 3 { + t.Fatalf("expected 3 records, got %d", len(res)) + } + fetchObserver.AssertRequestedRecords(t, firstBatchOfIDs) + fetchObserver.Clear() + + // Next, we'll move the clock past the synchronous refresh delay, and + // make a call for a second batch of IDs. The first batch of IDs should + // now be a second old, and due for a synchronous refresh the next time + // any of the IDs are requested. 
+ clock.Add(synchronousRefreshDelay + 1)
+ secondBatchOfIDs := []string{"4", "5", "6"}
+ fetchObserver.BatchResponse(secondBatchOfIDs)
+ res, err = sturdyc.GetOrFetchBatch(ctx, c, secondBatchOfIDs, c.BatchKeyFn("item"), fetchObserver.FetchBatch)
+ <-fetchObserver.FetchCompleted
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ if len(res) != 3 {
+ t.Fatalf("expected 3 records, got %d", len(res))
+ }
+ fetchObserver.AssertRequestedRecords(t, secondBatchOfIDs)
+ fetchObserver.Clear()
+
+ // Now we'll move the clock past the maxRefreshDelay, which should make
+ // the second batch of IDs due for a refresh, but not a synchronous one.
+ clock.Add(maxRefreshDelay + 1)
+
+ // Here we create a third batch of IDs which contains one of the IDs from the
+ // first batch, and another ID from the second batch, and an additional ID
+ // that we haven't seen before.
+ thirdBatchOfIDs := []string{"1", "4", "23"}
+ fetchObserver.BatchResponse([]string{"1", "23"})
+ res, err = sturdyc.GetOrFetchBatch(ctx, c, thirdBatchOfIDs, c.BatchKeyFn("item"), fetchObserver.FetchBatch)
+ <-fetchObserver.FetchCompleted
+ if err != nil {
+ t.Fatalf("expected no error, got %v", err)
+ }
+ if len(res) != 3 {
+ t.Fatalf("expected 3 records, got %d", len(res))
+ }
+ // We only expect to have called the underlying data source with the
+ // ID from the first batch, and the ID we haven't seen before.
+ fetchObserver.AssertRequestedRecords(t, []string{"1", "23"})
+ fetchObserver.AssertFetchCount(t, 3)
+ fetchObserver.Clear()
+
+ // Since we're using the WithRefreshCoalescing option, the cache will have created
+ // an internal buffer where it's trying to gather 20 IDs before sending them off
+ // to the underlying data source. However, we're only asking for one ID from the
+ // second batch. Therefore, we'll have to move the clock in order to make the cache
+ // exceed the buffering timeout.
+ fetchObserver.BatchResponse([]string{"4"}) + // Give the buffering goroutine a chance to run before we move the clock. + time.Sleep(time.Millisecond * 200) + clock.Add(batchBufferTimeout + 1) + <-fetchObserver.FetchCompleted + fetchObserver.AssertRequestedRecords(t, []string{"4"}) + fetchObserver.AssertFetchCount(t, 4) +} From b25220326bdeb5b06ac12c0b720686dcc9e9137c Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 12:55:29 +0100 Subject: [PATCH 05/32] WIP --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 877c5fe..1ac2378 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ ![sturdyC-fn-2](https://github.com/user-attachments/assets/2def120a-ad2b-4590-bef0-83c461af1b07) +> *A sturdy gopher shielding data sources from rapidly incoming requests # `sturdyc`: a caching library for building sturdy systems From 8a985807c90943d6634a7906f481151731770f28 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 12:55:57 +0100 Subject: [PATCH 06/32] WIP --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1ac2378..187bb71 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![sturdyC-fn-2](https://github.com/user-attachments/assets/2def120a-ad2b-4590-bef0-83c461af1b07) -> *A sturdy gopher shielding data sources from rapidly incoming requests +> A sturdy gopher shielding data sources from rapidly incoming requests # `sturdyc`: a caching library for building sturdy systems From e401972ee48427c844f88c2d5d64d9e57ac01931 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 12:57:34 +0100 Subject: [PATCH 07/32] WIP --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 187bb71..452ab4a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![sturdyC-fn-2](https://github.com/user-attachments/assets/2def120a-ad2b-4590-bef0-83c461af1b07) -> A sturdy gopher shielding data sources from rapidly 
incoming requests +> A sturdy gopher shielding data sources from rapidly incoming requests. # `sturdyc`: a caching library for building sturdy systems From af0a074de367157f822fd4a57cd97e814c2d94ce Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 13:15:12 +0100 Subject: [PATCH 08/32] WIP --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 452ab4a..e8a7b77 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,13 @@ It has all the functionality you would expect from a caching library, but what **sets it apart** are the features designed to make I/O heavy applications both _robust_ and _highly performant_. +The API is very simple to use. If you’re currently retrieving your data from a +distributed cache, database, or API, you should be able to add this package to +your application for a significant performance boost without losing data +freshness -- provided you configure your cache client correctly. As you will see +below, there are many options, and I encourage you to read through this README +and experiment with the examples. + # Installing ```sh From 249aa593849e044c471a86d35e079fc2bf50c88f Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 13:21:48 +0100 Subject: [PATCH 09/32] WIP --- README.md | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e8a7b77..057f0be 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,18 @@ _robust_ and _highly performant_. The API is very simple to use. If you’re currently retrieving your data from a distributed cache, database, or API, you should be able to add this package to your application for a significant performance boost without losing data -freshness -- provided you configure your cache client correctly. As you will see -below, there are many options, and I encourage you to read through this README -and experiment with the examples. 
+freshness -- provided you configure your cache client correctly. As you will +see below, there are many options, and I encourage you to read through this +README and experiment with the examples to get an understanding of how it +works. Below is a screenshot showing the P95 latency improvements we've +observed after using this package in front of our distributed key-value store: + +  +Screenshot 2024-05-10 at 10 15 18 +  + +In addition to this, we were also able to reduce our number of outgoing +requests by more than 90% after enabling the refresh coalescing option. # Installing @@ -90,18 +99,6 @@ based on recency. The eviction algorithm uses time complexity without requiring write locks on reads to update a recency list. -### Latency improvements - -Below is a screenshot showing the latency improvements we've observed after -replacing our old cache with this package: - -  -Screenshot 2024-05-10 at 10 15 18 -  - -In addition to this, we've seen our number of outgoing requests decrease by -more than 90% after enabling refresh coalescing. - # Adding `sturdyc` to your application: I have tried to design the API in a way that should make it effortless to add From 7f4af7e8e8f208473eff22d160c0f30f0277e830 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 13:24:19 +0100 Subject: [PATCH 10/32] WIP --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 057f0be..a40aa49 100644 --- a/README.md +++ b/README.md @@ -23,18 +23,18 @@ _robust_ and _highly performant_. The API is very simple to use. If you’re currently retrieving your data from a distributed cache, database, or API, you should be able to add this package to your application for a significant performance boost without losing data -freshness -- provided you configure your cache client correctly. 
As you will -see below, there are many options, and I encourage you to read through this -README and experiment with the examples to get an understanding of how it -works. Below is a screenshot showing the P95 latency improvements we've -observed after using this package in front of our distributed key-value store: +freshness — provided you configure your cache client correctly. As you will see +below, there are many options, and I encourage you to read through this README +and experiment with the examples to get an understanding of how it works. Below +is a screenshot showing the P95 latency improvements we've observed after adding +this package in front of our distributed key-value store:   Screenshot 2024-05-10 at 10 15 18   In addition to this, we were also able to reduce our number of outgoing -requests by more than 90% after enabling the refresh coalescing option. +requests by more than 90% after enabling the _refresh coalescing_ option. # Installing From cd94f5f3188d6b9ae7a1a80f5fe661cec66e7c63 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 2 Jan 2025 15:43:43 +0100 Subject: [PATCH 11/32] WIP --- README.md | 255 ++++++++++++++++++++++++------------- examples/refreshes/main.go | 20 +-- 2 files changed, 174 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index a40aa49..3cb495d 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,9 @@ distributed cache, database, or API, you should be able to add this package to your application for a significant performance boost without losing data freshness — provided you configure your cache client correctly. As you will see below, there are many options, and I encourage you to read through this README -and experiment with the examples to get an understanding of how it works. Below -is a screenshot showing the P95 latency improvements we've observed after adding +and experiment with the examples to get an understanding of how it works. 
+ +Here is a screenshot showing the P95 latency improvements we've observed after adding this package in front of our distributed key-value store:   @@ -42,6 +43,26 @@ requests by more than 90% after enabling the _refresh coalescing_ option. go get github.com/viccon/sturdyc ``` +# Table of contents + +Below is the table of contents for what this README is going to cover. However, +if this is your first time using this package, I encourage you to **read these +examples in the order they appear**. Most of them build on each other, and many +share configurations. + +- [**creating a cache client**](https://github.com/viccon/sturdyc?tab=readme-ov-file#creating-a-cache-client) +- [**stampede protection**](https://github.com/viccon/sturdyc?tab=readme-ov-file#stampede-protection) +- [**early refreshes**](https://github.com/viccon/sturdyc?tab=readme-ov-file#early-refreshes) +- [**deletions**](https://github.com/viccon/sturdyc?tab=readme-ov-file#deletions) +- [**caching non-existent records**](https://github.com/viccon/sturdyc?tab=readme-ov-file#non-existent-records) +- [**caching batch endpoints per record**](https://github.com/viccon/sturdyc?tab=readme-ov-file#batch-endpoints) +- [**cache key permutations**](https://github.com/viccon/sturdyc?tab=readme-ov-file#cache-key-permutations) +- [**refresh coalescing**](https://github.com/viccon/sturdyc?tab=readme-ov-file#refresh-coalescing) +- [**request passthrough**](https://github.com/viccon/sturdyc?tab=readme-ov-file#passthrough) +- [**distributed storage**](https://github.com/viccon/sturdyc?tab=readme-ov-file#distributed-storage) +- [**custom metrics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#custom-metrics) +- [**generics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#generics) + # At a glance ### Deduplication @@ -183,25 +204,6 @@ retrieves the data has all of the values it needs. Next, we'll look at how to configure the cache in more detail. 
-# Table of contents - -I've included examples that cover the entire API, and I encourage you to **read -these examples in the order they appear**. Most of them build on each other, -and many share configurations. Here is a brief overview of what the examples -are going to cover: - -- [**creating a cache client**](https://github.com/viccon/sturdyc?tab=readme-ov-file#creating-a-cache-client) -- [**stampede protection**](https://github.com/viccon/sturdyc?tab=readme-ov-file#stampede-protection) -- [**early refreshes**](https://github.com/viccon/sturdyc?tab=readme-ov-file#early-refreshes) -- [**caching non-existent records**](https://github.com/viccon/sturdyc?tab=readme-ov-file#non-existent-records) -- [**caching batch endpoints per record**](https://github.com/viccon/sturdyc?tab=readme-ov-file#batch-endpoints) -- [**cache key permutations**](https://github.com/viccon/sturdyc?tab=readme-ov-file#cache-key-permutations) -- [**refresh coalescing**](https://github.com/viccon/sturdyc?tab=readme-ov-file#refresh-coalescing) -- [**request passthrough**](https://github.com/viccon/sturdyc?tab=readme-ov-file#passthrough) -- [**distributed storage**](https://github.com/viccon/sturdyc?tab=readme-ov-file#distributed-storage) -- [**custom metrics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#custom-metrics) -- [**generics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#generics) - # Creating a cache client The first thing you will have to do is to create a cache client to hold your @@ -241,12 +243,12 @@ cache, come in at once. Preventing this has been one of the key objectives for this package. We do not want to cause a significant load on an underlying data source every time one of -our keys expires. +our keys expires. To address this, `sturdyc` performs _in-flight_ tracking for +every key. -The `GetOrFetch` function takes a key and a function for retrieving the data if -it's not in the cache. 
The cache is going to ensure that we never have more -than a single request per key. It achieves this by tracking all of the -in-flight requests: +We can demonstrate this using the `GetOrFetch` function which takes a key, and +a function for retrieving the data if it's not in the cache. The cache is going +to ensure that we never have more than a single request per key: ```go var count atomic.Int32 @@ -273,7 +275,7 @@ in-flight requests: ``` -Running this program we'll see that our requests for "key2" were deduplicated, +Running this program we'll see that our requests for "key2" got deduplicated, and that the fetchFn only got called once: ```sh @@ -287,14 +289,21 @@ and that the fetchFn only got called once: 2024/05/21 08:06:29 1337 true ``` -For data sources that supports batching, we're able to use the -`GetOrFetchBatch` function. To demonstrate this, I'll create a mock function -that sleeps for `5` seconds, and then returns a map with a numerical value for -every ID: +The in-flight tracking works for batch operations too. The cache is able to +deduplicate a batch of cache misses, and then assemble the response by picking +records from multiple in-flight requests. + +To demonstrate this, we'll use the `GetOrFetchBatch` function, which can be +used to retrieve data from a data source capable of handling requests for +multiple records at once. + +We'll start by creating a mock function that sleeps for `5` seconds, and then +returns a map with a numerical value for every ID: ```go var count atomic.Int32 fetchFn := func(_ context.Context, ids []string) (map[string]int, error) { + // Increment the counter so that we can assert how many times this function was called. count.Add(1) time.Sleep(time.Second * 5) @@ -319,10 +328,14 @@ IDs each: } ``` -IDs can often be fetched from multiple data sources. Hence, we'll want to -prefix the ID in order to make the cache key unique. 
The package provides more -functionality for this that we'll see later on, but for now we'll use the most -simple version which adds a string prefix to every ID: +IDs can often be used to fetch data from multiple data sources. As an example, +we might use an userId to fetch orders, payments, shipment options, etc. Hence, +if we're using the cache with an API client, we'll want to prefix this user ID +with the actual endpoint we're using in order to make the cache key unique. + +The package provides more functionality for this that we'll see later on, but +for now we'll use the most simple version which adds a string prefix to every +ID: ```go keyPrefixFn := cacheClient.BatchKeyFn("my-data-source") @@ -338,7 +351,8 @@ We can now request each batch in a separate goroutine: }() } - // Give the goroutines above a chance to run to ensure that the batches are in-flight. + // Sleep to give the goroutines above a chance to run. + // This ensures that the batches are in-flight. time.Sleep(time.Second * 3) ``` @@ -347,7 +361,12 @@ this, we'll test the stampede protection by launching another five goroutines. Each goroutine is going to request two random IDs from our batches: ```go - // Launch another 5 goroutines that are going to pick two random IDs from any of the batches. + // Launch another 5 goroutines that are going to pick two random IDs from any of our in-flight batches. + // e.g: + // [1,8] + // [4,11] + // [14,2] + // [6,15] var wg sync.WaitGroup for i := 0; i < 5; i++ { wg.Add(1) @@ -364,7 +383,8 @@ Each goroutine is going to request two random IDs from our batches: ``` Running this program, and looking at the logs, we'll see that the cache is able -to pick IDs from different batches: +resolve all of these values without generating any additional outgoing requests +even though the IDs are picked from different batches: ```sh ❯ go run . 
@@ -379,37 +399,60 @@ to pick IDs from different batches: 2024/05/21 09:14:23 fetchFn was called 3 times <---- NOTE: We only generated 3 outgoing requests. ``` -And on the last line, we can see that the additional calls didn't generate any -further outgoing requests. The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/basic) - -## Early refreshes - -Being able to prevent your most frequently used records from ever expiring can -have a significant impact on your application's latency. Therefore, the package -provides a `WithEarlyRefreshes` option, which instructs the cache to -continuously refresh these records in the background before they expire. - -A refresh gets scheduled if a key is **requested again** after a configurable -amount of time has passed. This is an important distinction because it means -that the cache doesn't just naively refresh every key it's ever seen. Instead, -it only refreshes the records that are actually in active rotation, while -allowing unused keys to be deleted once their TTL expires. +The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/basic) + +# Early refreshes + +Serving data from memory is typically one to two orders of magnitude faster +than reading from disk, and if you have to retrieve the data across a network +the difference can grow even larger. Consequently, we're often able to +significantly improve our applications performance by adding an in-memory +cache. + +However, one has to be aware of the usual trade-offs. Suppose we use a TTL of +10 seconds. That means the cached data can be up to 10 seconds old. In many +applications this may be acceptable, but in others it can introduce stale +reads. Additionally, once the cached value expires, the first request after +expiration must refresh the cache, resulting in a longer response time for that +user. 
This can make the average latency look very different from the P90–P99
+tail latencies, since those percentiles capture the refresh delays. This can
+make it difficult to configure appropriate alarms for your application's
+response times.
+
+`sturdyc` aims to give you a lot of control over these choices when you enable
+the early refreshes functionality. It will prevent your most frequently used
+records from ever expiring by continuously refreshing them in the background.
+This has a significant impact on your application's latency. We've seen the P99
+of some of our applications go from 50ms down to 1ms.
+
+One thing to note about these background refreshes is that they are scheduled
+if a key is **requested again** after a configurable amount of time has passed.
+This is an important distinction because it means that the cache doesn't just
+naively refresh every key it's ever seen. Instead, it only refreshes the
+records that are actually in active rotation, while allowing unused keys to be
+deleted once their TTL expires. This also means that the request that gets
+chosen to refresh the value won’t retrieve the updated data right away. To
+address this, you can provide a synchronous refresh time, where you essentially
+say, "If the data is older than x, I want the refresh to be blocking."

Below is an example configuration that you can use to enable this functionality:

```go
func main() {
- // Set a minimum and maximum refresh delay for the record. This is
- // used to spread out the refreshes of our entries evenly over time.
- // We don't want our outgoing requests graph to look like a comb that
- // sends a spike of refreshes every 30 ms.
+ // Set a minimum and maximum refresh delay for the records. This is used to
+ // spread out the refreshes of our records evenly over time. If we're running
+ // our application across 100 containers, we don't want to send a spike of
+ // refreshes from every container every 30 ms.
Instead, we'll use some + // randomization to spread them out evenly between 10 and 30 ms. minRefreshDelay := time.Millisecond * 10 maxRefreshDelay := time.Millisecond * 30 - // The base used for exponential backoff when retrying a refresh. Most of the - // time, we perform refreshes well in advance of the records expiry time. - // Hence, we can use this to make it easier for a system that is having - // trouble to get back on it's feet by making fewer refreshes when we're - // seeing a lot of errors. Once we receive a successful response, the + // Set a synchronous refresh delay for when we want a refresh to happen synchronously. + synchronousRefreshDelay := time.Second * 30 + // The base used for exponential backoff when retrying a background refresh. + // Most of the time, we perform refreshes well in advance of the records + // expiry time. Hence, we can use this to make it easier for a system that + // is having trouble to get back on it's feet by making fewer refreshes when + // we're seeing a lot of errors. Once we receive a successful response, the // refreshes return to their original frequency. You can set this to 0 // if you don't want this behavior. retryBaseDelay := time.Millisecond * 10 @@ -421,8 +464,8 @@ func main() { } ``` -And to get a feeling for how this works, we'll use the configuration above and -then create a simple API client which embedds the cache: +And to get a better feeling for how this works, we'll use the configuration +above, and then we'll create a simple API client which embedds the cache: ```go type API struct { @@ -474,6 +517,7 @@ Running this program, we're going to see that the value gets refreshed once every 2-3 retrievals: ```sh +cd examples/refreshes go run . 2024/04/07 09:05:29 Fetching value for key: key 2024/04/07 09:05:29 Value: value @@ -486,26 +530,51 @@ go run . ... 
``` -This is going to reduce your response times significantly because none of your -users will have to wait for the I/O operation that refreshes the data. It's -always performed in the background as long as the key is being continuously -requested. Being afraid that the record might get too stale if users stop -requesting it is an indication of a TTL that is set too high. Remember, even if -the TTL is exceeded and the key expires, you'll still get deduplication if it's -suddenly requested in a burst again. The only difference is that the users will -have to wait for the I/O operation that retrieves it. - -Additionally, to provide a degraded experience when an upstream system -encounters issues, you can set a high TTL and a low refresh time. When -everything is working as expected, the records will be refreshed continuously. -However, if the upstream system encounters issues and stops responding, you can -fall back to cached records for the duration of the TTL. +If this was a real application it would have reduced our response times +significantly because none of our users would have to wait for the I/O +operation that refreshes the data. It's always performed in the background as +long as the key is being continuously requested. + +We don't have to be afraid that the data for infrequently used keys gets stale +either, given that we set the synchronous refresh delay like this: + +```go + synchronousRefreshDelay := time.Second * 30 +``` + +If a key isn't requested again within 30 seconds, the cache will make the +refresh synchronous. Even if a minute has passed and 1,000 requests suddenly +come in for this key, the stampede protection will kick in and make the refresh +synchronous for all of them, while also ensuring that only a single request is +made to the underlying data source. + +I also like to use this feature to provide a degraded experience when an +upstream system encounters issues. 
For this, I choose a high TTL and a low
+refresh time, so that when everything is working as expected, the records are
+refreshed continuously. However, if the upstream system stops responding, I can
+rely on cached records for the entire duration of the TTL.
+
+One important note is that the synchronous refresh time isn’t affected by the
+exponential backoff. The number of background refreshes is going to get reduced
+if an upstream system is experiencing errors. However, if we reach a point where
+all of the records are older than the synchronous refresh time, we're going to
+send a steady stream of outgoing requests. That is because I think of the
+synchronous refresh time as "I really don’t want the data to be older than
+this," so if a synchronous refresh fails, I want the very next request to
+attempt another refresh, because the data is now older than I’d like it to be.
+
+Also, if you don't want this functionality you could just set a short TTL. The
+cache will never return a record where the TTL has expired.
+
+The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/refreshes)
+
+# Deletions
 
 What if the record was deleted? Our cache might use a 2-hour-long TTL, and we
 definitely don't want it to take that long for the deletion to propagate.
-However, if we were to modify our client so that it returns an error after the
-first request:
+However, if we were to modify our client from the previous example so that it
+returns an error after the first request:
 
 ```go
 type API struct {
@@ -533,7 +602,7 @@ func (a *API) Get(ctx context.Context, key string) (string, error) {
 and then run the program again:
 
 ```sh
-cd examples/stampede
+cd examples/refreshes
 go run .
 ```
 
@@ -577,7 +646,7 @@ for every refresh, but the value is still being printed:
 
 This is a bit tricky because how you determine if a record has been deleted is
 going to vary based on your data source.
It could be a status code, zero value, -empty list, specific error message, etc. There is no way for the cache to +empty list, specific error message, etc. There is no easy way for the cache to figure this out implicitly. It couldn't simply delete a record every time it receives an error. If an @@ -599,7 +668,7 @@ fetchFn := func(_ context.Context) (string, error) { } ``` -This tell's the cache that the record is no longer available at the underlying data source. +This tells the cache that the record is no longer available at the underlying data source. Therefore, if this record is being fetched as a background refresh, the cache will quickly see if it has a record for this key, and subsequently delete it. @@ -644,16 +713,16 @@ just a single ID wasn't found: and then have the cache swallow that error and return nil, felt much less intuitive. -The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/refreshes) +This code is based on the example available [here.](https://github.com/viccon/sturdyc/tree/main/examples/refreshes) # Non-existent records In the example above, we could see that once we delete the key, the following -iterations lead to a continuous stream of outgoing requests. This will happen -for every ID that doesn't exist at the data source. If we can't retrieve it, we -can't cache it. If we can't cache it, we can't serve it from memory. If this -happens frequently, we'll experience a lot of I/O operations, which will -significantly increase our system's latency. +iterations lead to a continuous stream of outgoing requests. This would also +happen for every ID that doesn't exist at the underlying data source. If we +can't retrieve it, we can't cache it. If we can't cache it, we can't serve it +from memory. If this happens frequently, we'll experience a lot of I/O +operations, which will significantly increase our system's latency. The reasons why someone might request IDs that don't exist can vary. 
It could be due to a faulty CMS configuration, or perhaps it's caused by a slow @@ -662,9 +731,11 @@ distributed system. Regardless, this will negatively impact our systems performance. To address this issue, we can instruct the cache to mark these IDs as missing -records. Missing records are refreshed at the same frequency as regular -records. Hence, if an ID is continuously requested, and the upstream eventually -returns a valid response, we'll see it propagate to our cache. +records. If you're using this functionality in combination with the +`WithEarlyRefreshes` option, they are going to get refreshed at the same +frequency as regular records. Hence, if an ID is continuously requested, and +the upstream eventually returns a valid response, we'll see it propagate to our +cache. To illustrate, I'll make some small modifications to the code from the previous example. The only thing I'm going to change is to make the API client return a diff --git a/examples/refreshes/main.go b/examples/refreshes/main.go index 19f7722..c58d145 100644 --- a/examples/refreshes/main.go +++ b/examples/refreshes/main.go @@ -42,20 +42,22 @@ func main() { evictionPercentage := 10 // =========================================================== - // =================== Background refreshes ================== + // ===================== Early refreshes ==================== // =========================================================== - // Set a minimum and maximum refresh delay for the record. This is - // used to spread out the refreshes of our entries evenly over time. - // We don't want our outgoing requests graph to look like a comb. + // Set a minimum and maximum refresh delay for the records. This is used to + // spread out the refreshes of our records evenly over time. If we're running + // our application across 100 containers, we don't want to send a spike of + // refreshes from every container every 30 ms. 
Instead, we'll use some + // randomization to spread them out evenly between 10 and 30 ms. minRefreshDelay := time.Millisecond * 10 maxRefreshDelay := time.Millisecond * 30 // Set a synchronous refresh delay for when we want a refresh to happen synchronously. synchronousRefreshDelay := time.Second * 30 - // The base used for exponential backoff when retrying a refresh. Most of the - // time, we perform refreshes well in advance of the records expiry time. - // Hence, we can use this to make it easier for a system that is having - // trouble to get back on it's feet by making fewer refreshes when we're - // seeing a lot of errors. Once we receive a successful response, the + // The base used for exponential backoff when retrying a background refresh. + // Most of the time, we perform refreshes well in advance of the records + // expiry time. Hence, we can use this to make it easier for a system that + // is having trouble to get back on it's feet by making fewer refreshes when + // we're seeing a lot of errors. Once we receive a successful response, the // refreshes return to their original frequency. You can set this to 0 // if you don't want this behavior. retryBaseDelay := time.Millisecond * 10 From 1f27fc1c02ca7763cd6b6f95ae7bfa5d579d93ae Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 13:36:57 +0100 Subject: [PATCH 12/32] WIP --- README.md | 589 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 379 insertions(+), 210 deletions(-) diff --git a/README.md b/README.md index 3cb495d..1d3f2a9 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,8 @@ examples in the order they appear**. Most of them build on each other, and many share configurations. 
- [**creating a cache client**](https://github.com/viccon/sturdyc?tab=readme-ov-file#creating-a-cache-client) +- [**evictions**](https://github.com/viccon/sturdyc?tab=readme-ov-file#evictions) +- [**get or fetch**](https://github.com/viccon/sturdyc?tab=readme-ov-file#get-or-fetch) - [**stampede protection**](https://github.com/viccon/sturdyc?tab=readme-ov-file#stampede-protection) - [**early refreshes**](https://github.com/viccon/sturdyc?tab=readme-ov-file#early-refreshes) - [**deletions**](https://github.com/viccon/sturdyc?tab=readme-ov-file#deletions) @@ -63,51 +65,37 @@ share configurations. - [**custom metrics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#custom-metrics) - [**generics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#generics) -# At a glance - -### Deduplication - -`sturdyc` performs _in-flight_ tracking for every key. This also works for -batch operations, where it can deduplicate a batch of cache misses and then -assemble the response by picking records from multiple in-flight requests. - -### Early refreshes +# Creating a cache client -There is also a lot of extra functionality you can enable, one being _early -refreshes_ which instructs the cache to refresh the keys which are in active -rotation, thereby preventing them from ever expiring. This can have a huge -impact on an applications latency as you're able to continiously serve the most -frequently used data from memory: +The first thing you will have to do is to create a cache client to hold your +configuration: ```go -sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, exponentialBackOff) -``` - -### Batching + // Maximum number of entries in the cache. Exceeding this number will trigger + // an eviction (as long as the "evictionPercentage" is greater than 0). + capacity := 10000 + // Number of shards to use. Increasing this number will reduce write lock collisions. + numShards := 10 + // Time-to-live for cache entries. 
+ ttl := 2 * time.Hour + // Percentage of entries to evict when the cache reaches its capacity. Setting this + // to 0 will make writes a no-op until an item has either expired or been deleted. + evictionPercentage := 10 -When the cache retrieves data from a batchable source, it will disassemble the -response and then cache each record individually based on the permutations of -the options with which it was fetched. + // Create a cache client with the specified configuration. + cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage) -This can be used to **significantly reduce** the application's outgoing -requests by also enabling _refresh coalescing_. Internally, `sturdyc` -creates a buffer for each unique option set and gathers IDs until the -`idealBatchSize` is reached or the `batchBufferTimeout` expires: + cacheClient.Set("key1", 99) + log.Println(cacheClient.Size()) + log.Println(cacheClient.Get("key1")) -```go -sturdyc.WithRefreshCoalescing(idealBatchSize, batchBufferTimeout) + cacheClient.Delete("key1") + log.Println(cacheClient.Size()) + log.Println(cacheClient.Get("key1")) ``` -### Distributed key-value store - -You can also configure `sturdyc` to synchronize its in-memory cache with a -**distributed key-value store** of your choosing: - -```go -sturdyc.WithDistributedStorage(storage), -``` +# Evictions -### Evictions The cache runs a background job which continuously evicts expired records from each shard. However, there are options to both tweak the interval and disable the functionality altogether. This is can give you a slight performance boost @@ -120,14 +108,18 @@ based on recency. The eviction algorithm uses time complexity without requiring write locks on reads to update a recency list. -# Adding `sturdyc` to your application: +Next, we'll start to look at some of the more _advanced features_. + +# Get or fetch I have tried to design the API in a way that should make it effortless to add -`sturdyc` to an existing application. 
We'll use the following two methods of an -API client as examples: +`sturdyc` to an existing application. To take advantage of the more advanced +functionality that we'll see in the sections below you'll essentially just be +interacting with two functions: `GetOrFetch` and `GetOrFetchBatch`. + +All you would have to do is to take your existing code: ```go -// Order retrieves a single order by ID. func (c *Client) Order(ctx context.Context, id string) (Order, error) { timeoutCtx, cancel := context.WithTimeout(ctx, c.timeout) defer cancel() @@ -140,25 +132,10 @@ func (c *Client) Order(ctx context.Context, id string) (Order, error) { return response, err } - -// Orders retrieves a batch of orders by their IDs. -func (c *Client) Orders(ctx context.Context, ids []string) (map[string]Order, error) { - timeoutCtx, cancel := context.WithTimeout(ctx, c.timeout) - defer cancel() - - var response map[string]Order - err := requests.URL(c.orderURL). - Path("/orders"). - Param("ids", strings.Join(ids, ",")). - ToJSON(&response). - Fetch(timeoutCtx) - - return response, err -} ``` -All we have to do is wrap the code that retrieves the data in a function, and -then hand it over to our cache client: +and wrap the actual lines that retrieves the data in a function, and then hand +that over to our cache client: ```go func (c *Client) Order(ctx context.Context, id string) (Order, error) { @@ -175,65 +152,18 @@ func (c *Client) Order(ctx context.Context, id string) (Order, error) { return response, err } - return c.cache.GetOrFetch(ctx, "order-"+id, fetchFunc) -} - -func (c *Client) Orders(ctx context.Context, ids []string) (map[string]Order, error) { - fetchFunc := func(ctx context.Context, cacheMisses []string) (map[string]Order, error) { - timeoutCtx, cancel := context.WithTimeout(ctx, c.timeout) - defer cancel() - - var response map[string]Order - err := requests.URL(c.orderURL). - Path("/orders"). - Param("ids", strings.Join(cacheMisses, ",")). - ToJSON(&response). 
- Fetch(timeoutCtx) - - return response, err - } - - return c.cache.GetOrFetchBatch(ctx, ids, c.persistentCache.BatchKeyFn("orders"), fetchFunc) + return c.cache.GetOrFetch(ctx, id, fetchFunc) } ``` -The example above retrieves the data from an HTTP API, but it's just as easy to -wrap a database query, a remote procedure call, a disk read, or any other I/O -operation. We can also use closures to make sure that the function that -retrieves the data has all of the values it needs. - -Next, we'll look at how to configure the cache in more detail. - -# Creating a cache client +The cache is then going to return the value from the cache if it's available, +and otherwise it will call the `fetchFn` to retrieve the data from the +underlying data source. -The first thing you will have to do is to create a cache client to hold your -configuration: - -```go - // Maximum number of entries in the cache. Exceeding this number will trigger - // an eviction (as long as the "evictionPercentage" is greater than 0). - capacity := 10000 - // Number of shards to use. Increasing this number will reduce write lock collisions. - numShards := 10 - // Time-to-live for cache entries. - ttl := 2 * time.Hour - // Percentage of entries to evict when the cache reaches its capacity. Setting this - // to 0 will make writes a no-op until an item has either expired or been deleted. - evictionPercentage := 10 - - // Create a cache client with the specified configuration. - cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage) - - cacheClient.Set("key1", 99) - log.Println(cacheClient.Size()) - log.Println(cacheClient.Get("key1")) - - cacheClient.Delete("key1") - log.Println(cacheClient.Size()) - log.Println(cacheClient.Get("key1")) -``` - -Next, we'll look at some of the more _advanced features_. +Most of our examples are going to be retrieving data from HTTP APIs, but it's +just as easy to wrap a database query, a remote procedure call, a disk read, or +any other I/O operation. 
We'll also see how we can use closures to pass query +parameters and other options. # Stampede protection @@ -246,9 +176,10 @@ want to cause a significant load on an underlying data source every time one of our keys expires. To address this, `sturdyc` performs _in-flight_ tracking for every key. -We can demonstrate this using the `GetOrFetch` function which takes a key, and -a function for retrieving the data if it's not in the cache. The cache is going -to ensure that we never have more than a single request per key: +We can demonstrate this using the `GetOrFetch` function which, as I mentioned +before, takes a key, and a function for retrieving the data if it's not in the +cache. The cache is going to ensure that we never have more than a single +request per key: ```go var count atomic.Int32 @@ -403,9 +334,9 @@ The entire example is available [here.](https://github.com/viccon/sturdyc/tree/m # Early refreshes -Serving data from memory is typically one to two orders of magnitude faster -than reading from disk, and if you have to retrieve the data across a network -the difference can grow even larger. Consequently, we're often able to +Serving data from memory is typically at least one to two orders of magnitude +faster than reading from disk, and if you have to retrieve the data across a +network the difference can grow even larger. Consequently, we're often able to significantly improve our applications performance by adding an in-memory cache. @@ -836,7 +767,7 @@ The entire example is available [here.](https://github.com/viccon/sturdyc/tree/m # Batch endpoints One challenge with caching batchable endpoints is that you have to find a way -to reduce the number of keys. To illustrate, let's say that we have 10 000 +to reduce the number of cache keys. To illustrate, let's say that we have 10 000 records, and an endpoint for fetching them that allows for batches of 20. 
The IDs for the batch are supplied as query parameters, for example, `https://example.com?ids=1,2,3,4,5,...20`. If we were to use this as the cache @@ -857,17 +788,40 @@ and this is if we're sending perfect batches of 20. If we were to do 1 to 20 IDs (not just exactly 20 each time) the total number of combinations would be the sum of combinations for each k from 1 to 20. -At this point, we would essentially just be paying for extra RAM, as the hit -rate for each key would be so low that we'd have better odds of winning the -lottery. +At this point, the hit rate for each key would be so low that we'd have better +odds of winning the lottery. To prevent this, `sturdyc` pulls the response apart and caches each record individually. This effectively prevents super-polynomial growth in the number -of cache keys because the batch itself is never going to be inlcuded in the +of cache keys because the batch itself is never going to be included in the key. -To get a feeling for how this works, let's once again build a small example -application. This time, we'll start with the API client: +To get a better feeling for how this works, we can look at the function signature +for the `GetOrFetchBatch` function: + +```go +func (c *Client[T]) GetOrFetchBatch(ctx context.Context, ids []string, keyFn KeyFn, fetchFn BatchFetchFn[T]) (map[string]T, error) {} +``` + +What the cache does is that it takes the IDs, applies the `keyFn` to them, and +then checks each key individually if it's present in the cache. The keys that +aren't present will be fetched using the `fetchFn`. + +The `fetchFn` is going to have this signature where it returns a map where the ID is the key: + +```go +type BatchFetchFn[T any] func(ctx context.Context, ids []string) (map[string]T, error) +``` + +The cache can use this to iterate through the response map, again apply the +`keyFn` to each ID, and then store each record individually in the cache. 
+ +Sometimes, the function signature for the `BatchFetchFn` can feel too limited. +You may need additional options and not just the IDs to retrieve the data. But +don't worry, we'll look at how to solve this in the next section! + +For now, to get some code to play around with, let's once again build a small +example application. This time, we'll start with the API client: ```go type API struct { @@ -954,29 +908,87 @@ The entire example is available [here.](https://github.com/viccon/sturdyc/tree/m # Cache key permutations -If you're attempting to cache data from an upstream system, the ID alone may be -insufficient to uniquely identify the record in your cache. The endpoint you're -calling might accept a variety of options that transform the data in different -ways. +As I mentioned in the previous section, the function signature for the +`BatchFetchFn`, which the `GetOrFetchBatch` function uses, can feel too limited: -Consider this: +```go +type BatchFetchFn[T any] func(ctx context.Context, ids []string) (map[string]T, error) +``` -```sh -curl https://movie-api/movies?ids=1,2,3&filterUpcoming=true&includeTrailers=false -curl https://movie-api/movies?ids=1,2,3&filterUpcoming=false&includeTrailers=true +What if you're fetching data from some endpoint that accepts a variety of query +parameters? Or perhaps you're doing a database query and want to apply some +ordering and filtering to the data? + +We can easily get around this by using closures. 
Let's illustrate this by +looking at an actual API client I've written: + +```go +const moviesByIDsCacheKeyPrefix = "movies-by-ids" + +type MoviesByIDsOpts struct { + IncludeUpcoming bool + IncludeUpsell bool +} + +func (c *Client) MoviesByIDs(ctx context.Context, ids []string, opts MoviesByIDsOpts) (map[string]Movie, error) { + cacheKeyFunc := c.cache.PermutatedBatchKeyFn(moviesByIDsCacheKeyPrefix, opts) + fetchFunc := func(ctx context.Context, cacheMisses []string) (map[string]Movie, error) { + timeoutCtx, cancel := context.WithTimeout(ctx, c.timeout) + defer cancel() + + var response map[string]Movie + err := requests.URL(c.baseURL). + Path("/movies"). + Param("ids", strings.Join(cacheMisses, ",")). + Param("include_upcoming", strconv.FormatBool(opts.IncludeUpcoming)). + Param("include_upsell", strconv.FormatBool(opts.IncludeUpsell)). + ToJSON(&response). + Fetch(timeoutCtx) + return response, err + } + return sturdyc.GetOrFetchBatch(ctx, c.cache, ids, cacheKeyFunc, fetchFunc) +} ``` -The IDs might be enough to uniquely identify these records in a database. -However, when you're consuming them through another system, they will probably -appear completely different as transformations are applied based on the options -you pass it. Hence, it's important that we store these records once for each -unique option set. +The API clients `MoviesByIDs` function calls an external API to fetch movies by +IDs, and the `BatchFetchFn` that we're passing to `sturdyc` uses a closure to +provide the query parameters we need. -The options does not have to be query parameters either. The data source you're -consuming could still be a database, and the options that you want to make part -of the cache key could be different types of filters. +However, one **important** thing to note here is that the ID is _no longer_ +enough to _uniquely_ identify a record in our cache. The query parameters will +most likely be used by the system we're calling to transform the data in +various ways. 
Hence, we should cache each movie once for each permutation of +our options: -Below is a small example application to showcase this functionality: +``` +IncludeUpcoming: true IncludeUpsell: true +IncludeUpcoming: false IncludeUpsell: false +IncludeUpcoming: true IncludeUpsell: false +IncludeUpcoming: false IncludeUpsell: true +``` + +This is what the `PermutatedBatchKeyFn` is used for. It takes a prefix and a +struct which internally it uses reflection on in order to concatenate the +**exported** fields to form a unique cache key that would look something like +this: + +``` +// movies-by-ids is our prefix that we passed as the +// first argument to the PermutatedBatchKeyFn function. +movies-by-ids-true-true-ID-1 +movies-by-ids-false-false-ID-1 +movies-by-ids-true-false-ID-1 +movies-by-ids-false-true-ID-1 +``` + +Please note that the struct should be flat without nesting. The fields can be +`time.Time` values, as well as any basic types, pointers to these types, and +slices containing them. + +Once again, I'll provide a small example application that you can play around +with to get a deeper understanding of this functionality. We're essentially +going to use the same API client as before, but this time we're going to use +the `PermutatedBatchKeyFn` rather than the `BatchKeyFn`: ```go type OrderOptions struct { @@ -1012,15 +1024,6 @@ func (a *OrderAPI) OrderStatus(ctx context.Context, ids []string, opts OrderOpti } ``` -The main difference from the previous example is that we're using -`PermutatedBatchKeyFn` instead of `BatchKeyFn`. Internally, the cache will use -reflection to extract the names and values of every **exported** field in the -`opts` struct, and then include them when it constructs the cache keys. - -The struct should be flat without nesting. The fields can be `time.Time` -values, as well as any basic types, pointers to these types, and slices -containing them. 
- Now, let's try to use this client: ```go @@ -1055,9 +1058,9 @@ At this point, the cache has stored each record individually for each option set. We can imagine that the keys would look something like this: ``` -FEDEX-2024-04-06-id1 -DHL-2024-04-07-id1 -UPS-2024-04-08-id1 +FEDEX-2024-04-06-ID-1 +DHL-2024-04-07-ID-1 +UPS-2024-04-08-ID-1 etc.. ``` @@ -1109,14 +1112,65 @@ The entire example is available [here.](https://github.com/viccon/sturdyc/tree/m # Refresh coalescing -As seen in the example above, we're storing the records once for every set of -options. However, we're not really utilizing the fact that the endpoint is -batchable when we're performing the refreshes. +As you may recall, our client is using the `WithEarlyRefreshes` option to +refresh the records in the background whenever their keys are requested again +after a certain amount of time has passed. And as seen in the example above, +we're successfully storing the records once for every permutation of the +options we use to retrieve it. However, we're not really utilizing the fact +that the endpoint is batchable when we're performing the refreshes. To make this more efficient, we can enable the **refresh coalescing** -functionality. Internally, the cache is going to create a buffer for every -cache key permutation. It is then going to collect ids until it reaches a -certain size, or exceeds a time-based threshold. +functionality, but before we'll update our example to use it let's just take a +moment to understand how it works. + +To start, we need to understand what determines whether two IDs can be +coalesced for a refresh: *the options*. E.g, do we want to perform the same +data transformations for both IDs? If so, they can be sent in the same batch. +This applies when we use the cache in front of a database too. Do we want to +use the same filters, sorting, etc? 
+
+If we look at the movie example from before, you can see that I've extracted
+these options into a struct:
+
+```go
+const moviesByIDsCacheKeyPrefix = "movies-by-ids"
+
+type MoviesByIDsOpts struct {
+	IncludeUpcoming bool
+	IncludeUpsell   bool
+}
+
+func (c *Client) MoviesByIDs(ctx context.Context, ids []string, opts MoviesByIDsOpts) (map[string]Movie, error) {
+	cacheKeyFunc := c.cache.PermutatedBatchKeyFn(moviesByIDsCacheKeyPrefix, opts)
+	fetchFunc := func(ctx context.Context, cacheMisses []string) (map[string]Movie, error) {
+		// ...
+		defer cancel()
+	}
+	return sturdyc.GetOrFetchBatch(ctx, c.cache, ids, cacheKeyFunc, fetchFunc)
+}
+```
+
+And as I mentioned before, the `PermutatedBatchKeyFn` is going to perform
+reflection on this struct to create cache keys that look something like this:
+
+```
+movies-by-ids-true-true-ID-1
+movies-by-ids-false-false-ID-1
+movies-by-ids-true-false-ID-1
+movies-by-ids-false-true-ID-1
+```
+
+What the refresh coalescing functionality then does is that it removes the ID
+but keeps the permutation string and uses it to create a uniquely
+identifiable buffer where it can gather IDs that should be refreshed with the
+same options:
+
+```
+movies-by-ids-true-true
+movies-by-ids-false-false
+movies-by-ids-true-false
+movies-by-ids-false-true
+```
 
 The only change we have to make to the previous example is to enable this
 feature:
 
@@ -1140,8 +1194,33 @@ func main() {
 }
 ```
 
-and now we can see that the cache performs the refreshes in batches per
-permutation of our query params:
+So now we're saying that we want to coalesce the refreshes for each
+permutation, and try to process them in batches of 3. However, if it's not able
+to reach that size within 30 seconds we want the refresh to happen anyway.
+
+And if you recall the output from our last run of this example code where the
+refreshes happened one by one:
+
+```sh
+go run .
+2024/04/07 13:33:56 Filling the cache with all IDs for all option sets +2024/04/07 13:33:56 Fetching: [id1 id2 id3], carrier: FEDEX, delivery time: 2024-04-06 +2024/04/07 13:33:56 Fetching: [id1 id2 id3], carrier: DHL, delivery time: 2024-04-07 +2024/04/07 13:33:56 Fetching: [id1 id2 id3], carrier: UPS, delivery time: 2024-04-08 +2024/04/07 13:33:56 Cache filled +2024/04/07 13:33:58 Fetching: [id1], carrier: FEDEX, delivery time: 2024-04-06 +2024/04/07 13:33:58 Fetching: [id1], carrier: UPS, delivery time: 2024-04-08 +2024/04/07 13:33:58 Fetching: [id1], carrier: DHL, delivery time: 2024-04-07 +2024/04/07 13:33:58 Fetching: [id2], carrier: UPS, delivery time: 2024-04-08 +2024/04/07 13:33:58 Fetching: [id2], carrier: FEDEX, delivery time: 2024-04-06 +2024/04/07 13:33:58 Fetching: [id2], carrier: DHL, delivery time: 2024-04-07 +2024/04/07 13:33:58 Fetching: [id3], carrier: FEDEX, delivery time: 2024-04-06 +2024/04/07 13:33:58 Fetching: [id3], carrier: UPS, delivery time: 2024-04-08 +2024/04/07 13:33:58 Fetching: [id3], carrier: DHL, delivery time: 2024-04-07 +``` + +We'll now try to run this code again, but with the `WithRefreshCoalescing` +option enabled: ```sh go run . @@ -1155,17 +1234,68 @@ go run . 2024/04/07 13:45:44 Fetching: [id1 id2 id3], carrier: UPS, delivery time: 2024-04-08 ``` -The number of outgoing requests for the refreshes went from **9** to **3**. -Imagine what a batch size of 50 would do for your applications performance! +The number of refreshes went from **9** to **3**. Imagine what a batch size of +50 would could do for your applications performance! + +There is more information about this in the section about metrics, but for our +production applications we're also using the caches `WithMetrics` option so +that we can monitor how well our refreshes are performing: + +Screenshot 2024-05-04 at 12 38 04 +> This chart shows the batch sizes for our coalesced refreshes. 
+ +Screenshot 2024-05-04 at 12 38 20 +> This chart shows the average batch size of our refreshes for two different data sources The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/buffering) +Another point to note is how effectively the options we've seen so far can be +combined to create high-performing, flexible, and robust caching solutions: + +```go + capacity := 10000 + numShards := 10 + ttl := 2 * time.Hour + evictionPercentage := 10 + minRefreshDelay := time.Second + maxRefreshDelay := time.Second * 2 + synchronousRefreshDelay := time.Second * 120 // 2 minutes. + retryBaseDelay := time.Millisecond * 10 + batchSize := 10 + batchBufferTimeout := time.Second * 15 + + cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), + sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), + ) +``` + +With the configuration above, the keys in active rotation are going to be +scheduled for a refresh every 1-2 seconds. For batchable data sources, where we +are making use of the `GetOrFetchBatch` function, we'll ask the cache (using +the `WithRefreshCoalescing` option) to delay them for up to 15 seconds or until +a batch size of 10 is reached. + +What if a key that hasn't been refreshed in the last 120 seconds is suddenly +requested? Given the `synchronousRefreshDelay` passed to the +`WithEarlyRefreshes` option, the cache will skip any background refresh and +instead perform a synchronous refresh to ensure that the data is fresh. Did +1000 requests suddenly arrive for this key? No problem, the in-flight tracking +makes sure that we only make **one** request to the underlying data source. +This works for refreshes too by the way. If 1000 requests arrived for a key +that was 3 seconds old (greater than our `maxRefreshDelay`) we'd only schedule +a single refresh for it. 
+ +Is the underlying data source experiencing downtime? With our TTL of two-hours +we'll be able to provide a degraded experience to our users by serving stale +data from our cache while continuously trying to refresh it in the background. + # Passthrough There are times when you want to always retrieve the latest data from the -source and only use the in-memory cache as a fallback. In such scenarios, you +source and only use the in-memory cache as a _fallback_. In such scenarios, you can use the `Passthrough` and `PassthroughBatch` functions. The cache will -still perform in-flight request tracking and deduplicate your requests. +still perform in-flight tracking and deduplicate your requests. # Distributed storage @@ -1173,22 +1303,51 @@ I think it's important to read the previous sections before jumping here in order to understand all the heavy lifting `sturdyc` does when it comes to creating cache keys, tracking in-flight requests, refreshing records in the background to improve latency, and buffering/coalescing requests to minimize -the number of round trips to underlying data sources. +the number of round trips to underlying data sources. Because, as you’ll soon +see, we’ll leverage these features when adding distributed storage to our cache +as well. + +However, let's first try and understand when this functionality could be +useful. I like to use this feature when I'm building an application that is +able to achieve a high cache hit rate, while also being subject to large bursts +of traffic. + +To provide a real life example example of this, I've used this in production +for a large streaming application. The content was fairly static; new movies, +series, and episodes were only ingested a couple of times an hour. That meant +that we could achieve a very high hit rate for our data sources. However, +during the evenings, when a popular football match or TV show was about to +start, our traffic could spike by a factor of 20 within less than a minute. 
+ +To illustrate the problem further, let’s say the hit rate for our in-memory +cache was 99.8%. Then, when we received that large burst of traffic, our +auto-scaling would begin provisioning new containers. These containers would +obviously be brand new, with an initial hit rate of 0%. This would cause a +significant load on our underlying data sources as soon as they came online, +because every request they received led to an outgoing request to the data +source. And these data sources had gotten used to being shielded from most of +the traffic by the older containers high hit-rate and refresh coalescing usage. +Hence, what was a 20x spike for us could become a 200x spike for them until our +new containers had warmed their cache. + +Therefore, I decided to add the ability to have the containers sync their +in-memory cache with a distributed key-value store that would have an easier +time to absorb these bursts. Adding distributed storage to the cache is, from the package's point of view, essentially just another data source with a higher priority. Hence, we're still able to take great advantage of all the features we've seen so far, and these -efficiency gains will hopefully allow you to use a much cheaper cluster. +efficiency gains will hopefully allow us to use a much cheaper cluster. -Slightly simplified, we can think of the cache's interaction with the +A bit simplified, we can think of the cache's interaction with the distributed storage like this: ```go -// NOTE: This is an example. The cache has this functionality internally. +// NOTE: This is an example. The cache has similar functionality internally. func (o *OrderAPI) OrderStatus(ctx context.Context, id string) (string, error) { cacheKey := "order-status-" + id fetchFn := func(ctx context.Context) (string, error) { - // Check redis cache first. + // Check Redis cache first. 
if orderStatus, ok := o.redisClient.Get(cacheKey); ok { return orderStatus, nil } @@ -1203,7 +1362,7 @@ func (o *OrderAPI) OrderStatus(ctx context.Context, id string) (string, error) { return "", err } - // Add the order status to the redis cache. + // Add the order status to the Redis cache so that it becomes available for the other containers. go func() { o.RedisClient.Set(cacheKey, response.OrderStatus, time.Hour) }() return response.OrderStatus, nil @@ -1213,15 +1372,9 @@ func (o *OrderAPI) OrderStatus(ctx context.Context, id string) (string, error) { } ``` -Syncing the keys and values to a distributed storage like this can be highly -beneficial, especially when we're deploying new containers where the in-memory -cache will be empty, as it prevents sudden bursts of traffic to the underlying -data sources. - -Keeping the in-memory caches in sync with a distributed storage requires a bit -more work though. `sturdyc` has therefore been designed to work with an -abstraction that could represent any key-value store of your choosing, all you -have to do is implement this interface: +The real implementation interacts with the distributed storage through an +abstraction so that you're able to use any key-value store you want. All you +would have to do is implement this interface: ```go type DistributedStorage interface { @@ -1237,13 +1390,19 @@ cache client: ```go cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + // Other options... sturdyc.WithDistributedStorage(storage), ) ``` **Please note** that you are responsible for configuring the TTL and eviction -policies of this storage. `sturdyc` will only make sure that it's being kept -up-to-date with the data it has in-memory. +policies of this storage. `sturdyc` will only make sure that it queries this +data source first, and then writes the keys and values to this storage as soon +as it has gone out to an underlying data source and refreshed them. 
Therefore, +I'd advice you touse the configuration above with short TTLs for the +distributed storage, or things might get too stale. I mostly think it's useful +if you're consuming data sources that don't handle bursts from new containers +very well. I've included an example to showcase this functionality [here.](https://github.com/viccon/sturdyc/tree/main/examples/distribution) @@ -1274,18 +1433,20 @@ this: ``` Above we can see that the underlying data source was only visited **once**, and -the in-memory cache performed a background refresh from the distributed storage -every 2 to 3 retrievals to ensure that it's being kept up-to-date. - -This sequence of events will repeat once the TTL expires. +that the remaining background refreshes that the in-memory cache performed went +only went to the distributed storage. # Distributed storage early refreshes -Similar to the in-memory cache, we're also able to use a distributed storage -where the data is refreshed before the TTL expires. +As I mentioned before, the configuration from the section above works well as +long as you're using short TTLs for the distributed key-value store. However, +I've also built systems where I wanted to leverage the distributed storage as +an additional robustness feature with long TTLs. That way, if an upstream +system goes down, newly provisioned containers could still retrieve the latest +data that the old containers had cached from something like a Redis. -This would also allow us to serve stale data if an upstream was to experience -any downtime: +If you have a similar use case, you could use the following +configuration instead: ```go cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, @@ -1293,11 +1454,12 @@ cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, ) ``` -With the configuration above, we're essentially saying that we'd prefer if the -data was refreshed once it's more than a minute old. 
However, if you're writing -records with a 60 minute TTL, the cache will continously fallback to these if -the refreshes were to fail, so the interaction with the distributed storage -would look something like this: +With a configuration like this, I would usually set the TTL for the distributed +storage to something like an hour. However, if the cache queries the +distributed storage and finds that a record is older than 1 minute (the second +argument to the function), it will refresh the record from the underlying data +source, and then write the updated value back to it. So the interaction with +the distributed storage would look something like this: - Start by trying to retrieve the key from the distributeted storage. If the data is fresh, it's returned immediately and written to the in-memory cache. @@ -1319,9 +1481,10 @@ type DistributedStorageEarlyRefreshes interface { ``` These delete methods will be called when a refresh occurs, and the cache -notices that it can no longer find the key at the underlying data source. This -indicates that the key has been deleted, and we will want this change to -propagate to the distributed key-value store +notices that it can no longer retrieve the key at the underlying data source. +This indicates that the key has been deleted, and we will want this change to +propagate to the distributed key-value store as soon as possible, and not have +to wait for the TTL to expire. **Please note** that you are still responsible for setting the TTL and eviction policies for the distributed store. 
The cache will only invoke the delete @@ -1339,11 +1502,14 @@ The cache can be configured to report custom metrics for: - Size of the cache - Cache hits - Cache misses +- Background refreshes +- Synchronous refreshes +- Missing records - Evictions - Forced evictions - The number of entries evicted - Shard distribution -- The size of the refresh buckets +- The batch size of a coalesced refresh There are also distributed metrics if you're using the cache with a _distributed storage_, which adds the following metrics in addition to what @@ -1351,14 +1517,19 @@ we've seen above: - Distributed cache hits - Distributed cache misses +- Distributed refreshes +- Distributed missing records - Distributed stale fallback All you have to do is implement one of these interfaces: ```go type MetricsRecorder interface { + CacheHit() CacheMiss() - Eviction() + BackgroundRefresh() + SynchronousRefresh() + MissingRecord() ForcedEviction() EntriesEvicted(int) ShardIndex(int) @@ -1370,9 +1541,10 @@ type DistributedMetricsRecorder interface { MetricsRecorder DistributedCacheHit() DistributedCacheMiss() + DistributedRefresh() + DistributedMissingRecord() DistributedFallback() } - ``` and pass it as an option when you create the client: @@ -1396,22 +1568,19 @@ cacheDistributedMetrics := sturdyc.New[any]( ) ``` -Below are a few images where these metrics have been visualized in Grafana: +Below are a few images where some of these metrics have been visualized in Grafana: Screenshot 2024-05-04 at 12 36 43 -Here we can how often we're able to serve from memory. +> Here we can how often we're able to serve from memory. Screenshot 2024-05-04 at 12 37 39 -This image displays the number of items we have cached. +> This image displays the number of items we have cached. Screenshot 2024-05-04 at 12 38 04 -This chart shows the batch sizes for the buffered refreshes. +> This chart shows the batch sizes for the buffered refreshes. 
Screenshot 2024-05-04 at 12 38 20 -And lastly, we can see the average batch size of our refreshes for two different data sources. - -You are also able to visualize evictions, forced evictions which occur when the -cache has reached its capacity, as well as the distribution between the shards. +> And lastly, we can see the average batch size of our refreshes for two different data sources. # Generics From d3145ab873702c6770db9387a7e9fad0eb73bccc Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 13:45:30 +0100 Subject: [PATCH 13/32] WIP --- README.md | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1d3f2a9..0683bdb 100644 --- a/README.md +++ b/README.md @@ -37,11 +37,6 @@ this package in front of our distributed key-value store: In addition to this, we were also able to reduce our number of outgoing requests by more than 90% after enabling the _refresh coalescing_ option. -# Installing - -```sh -go get github.com/viccon/sturdyc -``` # Table of contents @@ -50,6 +45,7 @@ if this is your first time using this package, I encourage you to **read these examples in the order they appear**. Most of them build on each other, and many share configurations. +- [**installing**](https://github.com/viccon/sturdyc?tab=readme-ov-file#installing) - [**creating a cache client**](https://github.com/viccon/sturdyc?tab=readme-ov-file#creating-a-cache-client) - [**evictions**](https://github.com/viccon/sturdyc?tab=readme-ov-file#evictions) - [**get or fetch**](https://github.com/viccon/sturdyc?tab=readme-ov-file#get-or-fetch) @@ -65,6 +61,12 @@ share configurations. 
- [**custom metrics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#custom-metrics) - [**generics**](https://github.com/viccon/sturdyc?tab=readme-ov-file#generics) +# Installing + +```sh +go get github.com/viccon/sturdyc +``` + # Creating a cache client The first thing you will have to do is to create a cache client to hold your @@ -94,12 +96,30 @@ configuration: log.Println(cacheClient.Get("key1")) ``` +We're also able to provide a vast set of additional options which we are going +to explore in the sections below. + # Evictions The cache runs a background job which continuously evicts expired records from -each shard. However, there are options to both tweak the interval and disable -the functionality altogether. This is can give you a slight performance boost -in situations where you're unlikely to exceed any memory limits. +each shard. However, there are options to both tweak the interval: + +```go + cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithEvictionInterval(time.Second), + ) +``` + +and disable the functionality altogether: + +```go + cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions() + ) +``` + +The latter can give you a slight performance boost in situations where you're +unlikely to exceed any memory limits.. When the cache reaches its capacity, a fallback eviction is triggered. 
This process performs evictions on a per-shard basis, selecting records for removal From 9a45032c942017b2d6ecd4cfb60a27a624d47e1e Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 13:47:14 +0100 Subject: [PATCH 14/32] WIP --- README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0683bdb..09bdcf3 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,11 @@ It has all the functionality you would expect from a caching library, but what **sets it apart** are the features designed to make I/O heavy applications both _robust_ and _highly performant_. -The API is very simple to use. If you’re currently retrieving your data from a -distributed cache, database, or API, you should be able to add this package to -your application for a significant performance boost without losing data -freshness — provided you configure your cache client correctly. As you will see -below, there are many options, and I encourage you to read through this README -and experiment with the examples to get an understanding of how it works. +If you’re currently retrieving your data from a distributed cache, database, or +API, you should be able to add this package to your application for a +significant performance boost. As you will see below, there are many options, +and I encourage you to read through this README and experiment with the +examples to get an understanding of how it works. 
Here is a screenshot showing the P95 latency improvements we've observed after adding this package in front of our distributed key-value store: From 7b5c393233eec3e0b3f214c9f94465f6ed257909 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 13:48:46 +0100 Subject: [PATCH 15/32] WIP --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 09bdcf3..001b27a 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ It has all the functionality you would expect from a caching library, but what _robust_ and _highly performant_. If you’re currently retrieving your data from a distributed cache, database, or -API, you should be able to add this package to your application for a -significant performance boost. As you will see below, there are many options, -and I encourage you to read through this README and experiment with the -examples to get an understanding of how it works. +API, you're probably able to add this package to your application for a +significant performance boost. As you will see below, there are many ways to +configure this package, and I encourage you to read through this README and +experiment with the examples to get an understanding of how it works. 
Here is a screenshot showing the P95 latency improvements we've observed after adding this package in front of our distributed key-value store: From cd71519355dd6d5ddfeb249cc8f43a0e6dd1534c Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 13:54:33 +0100 Subject: [PATCH 16/32] WIP --- README.md | 58 +++++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 001b27a..ee2af7a 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,7 @@ this package in front of our distributed key-value store:   In addition to this, we were also able to reduce our number of outgoing -requests by more than 90% after enabling the _refresh coalescing_ option. - +requests by more than 90% after enabling the _refresh coalescing_ functionality. # Table of contents @@ -72,31 +71,32 @@ The first thing you will have to do is to create a cache client to hold your configuration: ```go - // Maximum number of entries in the cache. Exceeding this number will trigger - // an eviction (as long as the "evictionPercentage" is greater than 0). - capacity := 10000 - // Number of shards to use. Increasing this number will reduce write lock collisions. - numShards := 10 - // Time-to-live for cache entries. - ttl := 2 * time.Hour - // Percentage of entries to evict when the cache reaches its capacity. Setting this - // to 0 will make writes a no-op until an item has either expired or been deleted. - evictionPercentage := 10 +// Maximum number of entries in the cache. Exceeding this number will trigger +// an eviction (as long as the "evictionPercentage" is greater than 0). +capacity := 10000 +// Number of shards to use. Increasing this number will reduce write lock collisions. +numShards := 10 +// Time-to-live for cache entries. +ttl := 2 * time.Hour +// Percentage of entries to evict when the cache reaches its capacity. 
Setting this +// to 0 will make writes a no-op until an item has either expired or been deleted. +evictionPercentage := 10 - // Create a cache client with the specified configuration. - cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage) +// Create a cache client with the specified configuration. +cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage) - cacheClient.Set("key1", 99) - log.Println(cacheClient.Size()) - log.Println(cacheClient.Get("key1")) +cacheClient.Set("key1", 99) +log.Println(cacheClient.Size()) +log.Println(cacheClient.Get("key1")) - cacheClient.Delete("key1") - log.Println(cacheClient.Size()) - log.Println(cacheClient.Get("key1")) +cacheClient.Delete("key1") +log.Println(cacheClient.Size()) +log.Println(cacheClient.Get("key1")) ``` -We're also able to provide a vast set of additional options which we are going -to explore in the sections below. +As the final argument to the `New` function, we're also able to provide a wide +range of additional options, which we will explore in detail in the sections +to follow. # Evictions @@ -104,21 +104,21 @@ The cache runs a background job which continuously evicts expired records from each shard. However, there are options to both tweak the interval: ```go - cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEvictionInterval(time.Second), - ) +cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithEvictionInterval(time.Second), +) ``` and disable the functionality altogether: ```go - cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithNoContinuousEvictions() - ) +cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithNoContinuousEvictions() +) ``` The latter can give you a slight performance boost in situations where you're -unlikely to exceed any memory limits.. +unlikely to exceed any memory limits. 
When the cache reaches its capacity, a fallback eviction is triggered. This process performs evictions on a per-shard basis, selecting records for removal From c7e40615996d0a270eb95d73ea6aa6d893a70ff0 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 14:28:02 +0100 Subject: [PATCH 17/32] WIP --- README.md | 17 ++++++++++++++--- examples/basic/main.go | 6 ++++-- examples/batch/main.go | 6 ++++-- examples/buffering/main.go | 2 +- examples/permutations/main.go | 2 +- keys.go | 29 +++++++++++++++++++++++++++-- 6 files changed, 51 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ee2af7a..78043d7 100644 --- a/README.md +++ b/README.md @@ -291,6 +291,14 @@ ID: keyPrefixFn := cacheClient.BatchKeyFn("my-data-source") ``` +This will result in cache keys like this: + +``` +my-data-source-ID-1 +my-data-source-ID-2 +my-data-source-ID-3 +``` + We can now request each batch in a separate goroutine: ```go @@ -852,11 +860,14 @@ func NewAPI(c *sturdyc.Client[string]) *API { } func (a *API) GetBatch(ctx context.Context, ids []string) (map[string]string, error) { - // We are going to use a cache a key function that prefixes each id. - // This makes it possible to save the same id for different data sources. + // We are going to pass a cache a key function that prefixes each id with + // the string "some-prefix", and adds an -ID- separator before the actual + // id. This makes it possible to save the same id for different data + // sources as the keys would look something like this: some-prefix-ID-1234 cacheKeyFn := a.BatchKeyFn("some-prefix") - // The fetchFn is only going to retrieve the IDs that are not in the cache. + // The fetchFn is only going to retrieve the IDs that are not in the cache. Please + // note that the cacheMisses is going to contain the actual IDs, not the cache keys. fetchFn := func(_ context.Context, cacheMisses []string) (map[string]string, error) { log.Printf("Cache miss. 
Fetching ids: %s\n", strings.Join(cacheMisses, ", ")) // Batch functions should return a map where the key is the id of the record. diff --git a/examples/basic/main.go b/examples/basic/main.go index b60c138..69bbfac 100644 --- a/examples/basic/main.go +++ b/examples/basic/main.go @@ -60,8 +60,10 @@ func demonstrateGetOrFetchBatch(cacheClient *sturdyc.Client[int]) { {"11", "12", "13", "14", "15"}, } - // We'll use a cache key function to add a prefix to the IDs. If we only used - // the IDs, we wouldn't be able to fetch the same IDs from multiple data sources. + // We are going to pass a cache a key function that prefixes each id with + // the string "my-data-source", and adds an -ID- separator before the actual + // id. This makes it possible to save the same id for different data + // sources as the keys would look something like this: my-data-source-ID-1 keyPrefixFn := cacheClient.BatchKeyFn("my-data-source") // Request the keys for each batch. diff --git a/examples/batch/main.go b/examples/batch/main.go index 79ca1c6..061bd92 100644 --- a/examples/batch/main.go +++ b/examples/batch/main.go @@ -20,8 +20,10 @@ func NewAPI(c *sturdyc.Client[string]) *API { } func (a *API) GetBatch(ctx context.Context, ids []string) (map[string]string, error) { - // We are going to pass the cache a key function that prefixes each id. - // This makes it possible to save the same id for different data sources. + // We are going to pass a cache a key function that prefixes each id with + // the string "some-prefix", and adds an -ID- separator before the actual + // id. This makes it possible to save the same id for different data + // sources as the keys would look something like this: some-prefix-ID-1 cacheKeyFn := a.BatchKeyFn("some-prefix") // The fetchFn is only going to retrieve the IDs that are not in the cache. 
diff --git a/examples/buffering/main.go b/examples/buffering/main.go index 25c2d67..ad0049b 100644 --- a/examples/buffering/main.go +++ b/examples/buffering/main.go @@ -23,7 +23,7 @@ func NewOrderAPI(client *sturdyc.Client[string]) *OrderAPI { } func (a *OrderAPI) OrderStatus(ctx context.Context, ids []string, opts OrderOptions) (map[string]string, error) { - // We use the PermutedBatchKeyFn when an ID isn't enough to uniquely identify a + // We use the PermutedBatchKeyFn when an ID isn't enough to uniquely identify a // record. The cache is going to store each id once per set of options. In a more // realistic scenario, the opts would be query params or arguments to a DB query. cacheKeyFn := a.PermutatedBatchKeyFn("key", opts) diff --git a/examples/permutations/main.go b/examples/permutations/main.go index dae93ea..50b231f 100644 --- a/examples/permutations/main.go +++ b/examples/permutations/main.go @@ -23,7 +23,7 @@ func NewOrderAPI(c *sturdyc.Client[string]) *OrderAPI { } func (a *OrderAPI) OrderStatus(ctx context.Context, ids []string, opts OrderOptions) (map[string]string, error) { - // We use the PermutedBatchKeyFn when an ID isn't enough to uniquely identify a + // We use the PermutedBatchKeyFn when an ID isn't enough to uniquely identify a // record. The cache is going to store each id once per set of options. In a more // realistic scenario, the opts would be query params or arguments to a DB query. cacheKeyFn := a.PermutatedBatchKeyFn("key", opts) diff --git a/keys.go b/keys.go index fd41153..c7e6ddf 100644 --- a/keys.go +++ b/keys.go @@ -92,6 +92,15 @@ func (c *Client[T]) handleTime(v reflect.Value) string { // Returns: // // A string to be used as the cache key. 
+// +// Example usage: +// +// type queryParams struct { +// City string +// Country string +// } +// params := queryParams{"Stockholm", "Sweden"} +// key := c.PermutatedKey("prefix",, params) // prefix-Stockholm-Sweden-1 func (c *Client[T]) PermutatedKey(prefix string, permutationStruct interface{}) string { var sb strings.Builder sb.WriteString(prefix) @@ -158,8 +167,9 @@ func (c *Client[T]) PermutatedKey(prefix string, permutationStruct interface{}) } // BatchKeyFn provides a function that can be used in conjunction with -// "GetOrFetchBatch". It takes in a prefix and returns a function that will -// append the ID as a suffix for each item. +// "GetOrFetchBatch". It takes in a prefix and returns a function that will use +// the prefix, add a -ID- separator, and then append the ID as a suffix for +// each item. // // Parameters: // @@ -168,6 +178,11 @@ func (c *Client[T]) PermutatedKey(prefix string, permutationStruct interface{}) // Returns: // // A function that takes an ID and returns a cache key string with the given prefix and ID. +// +// Example usage: +// +// fn := c.BatchKeyFn("some-prefix") +// key := fn("1234") // some-prefix-ID-1234 func (c *Client[T]) BatchKeyFn(prefix string) KeyFn { return func(id string) string { return fmt.Sprintf("%s-ID-%s", prefix, id) @@ -190,6 +205,16 @@ func (c *Client[T]) BatchKeyFn(prefix string) KeyFn { // Returns: // // A function that takes an ID and returns a cache key string with the given prefix, permutation struct fields, and ID. 
+// +// Example usage: +// +// type queryParams struct { +// City string +// Country string +// } +// params := queryParams{"Stockholm", "Sweden"} +// cacheKeyFunc := c.PermutatedBatchKeyFn("prefix", params) +// key := cacheKeyFunc("1") // prefix-Stockholm-Sweden-ID-1 func (c *Client[T]) PermutatedBatchKeyFn(prefix string, permutationStruct interface{}) KeyFn { return func(id string) string { key := c.PermutatedKey(prefix, permutationStruct) From 033d5216cf1e8bf688bd5252257ebe71897bd193 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Fri, 3 Jan 2025 14:40:30 +0100 Subject: [PATCH 18/32] WIP --- README.md | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 78043d7..a9dda50 100644 --- a/README.md +++ b/README.md @@ -133,8 +133,8 @@ Next, we'll start to look at some of the more _advanced features_. I have tried to design the API in a way that should make it effortless to add `sturdyc` to an existing application. To take advantage of the more advanced -functionality that we'll see in the sections below you'll essentially just be -interacting with two functions: `GetOrFetch` and `GetOrFetchBatch`. +functionality you'll essentially just be interacting with two functions: +`GetOrFetch` and `GetOrFetchBatch`. All you would have to do is to take your existing code: @@ -153,7 +153,7 @@ func (c *Client) Order(ctx context.Context, id string) (Order, error) { } ``` -and wrap the actual lines that retrieves the data in a function, and then hand +and wrap the lines of code that retrieves the data in a function, and then hand that over to our cache client: ```go @@ -181,8 +181,10 @@ underlying data source. Most of our examples are going to be retrieving data from HTTP APIs, but it's just as easy to wrap a database query, a remote procedure call, a disk read, or -any other I/O operation. We'll also see how we can use closures to pass query -parameters and other options. +any other I/O operation. 
+ +We'll also see how we can use closures to pass query parameters and other +options. # Stampede protection @@ -253,7 +255,7 @@ returns a map with a numerical value for every ID: ```go var count atomic.Int32 fetchFn := func(_ context.Context, ids []string) (map[string]int, error) { - // Increment the counter so that we can assert how many times this function was called. + // Increment the counter so that we can assert how many times this function was called. count.Add(1) time.Sleep(time.Second * 5) @@ -279,9 +281,10 @@ IDs each: ``` IDs can often be used to fetch data from multiple data sources. As an example, -we might use an userId to fetch orders, payments, shipment options, etc. Hence, -if we're using the cache with an API client, we'll want to prefix this user ID -with the actual endpoint we're using in order to make the cache key unique. +we might use an id to fetch a users orders, payments, shipment options, etc. +Hence, if we're using the cache with an API client, we'll want to prefix this +user id with the actual endpoint we're consuming in order to make the cache key +unique. The package provides more functionality for this that we'll see later on, but for now we'll use the most simple version which adds a string prefix to every @@ -291,7 +294,7 @@ ID: keyPrefixFn := cacheClient.BatchKeyFn("my-data-source") ``` -This will result in cache keys like this: +This will result in cache keys of this format: ``` my-data-source-ID-1 @@ -309,8 +312,8 @@ We can now request each batch in a separate goroutine: }() } - // Sleep to give the goroutines above a chance to run. - // This ensures that the batches are in-flight. + // Sleep to give the goroutines above a chance to run. + // This ensures that the batches are in-flight. time.Sleep(time.Second * 3) ``` @@ -320,11 +323,11 @@ Each goroutine is going to request two random IDs from our batches: ```go // Launch another 5 goroutines that are going to pick two random IDs from any of our in-flight batches. 
- // e.g: - // [1,8] - // [4,11] - // [14,2] - // [6,15] + // e.g: + // [1,8] + // [4,11] + // [14,2] + // [6,15] var wg sync.WaitGroup for i := 0; i < 5; i++ { wg.Add(1) From 6d0bb1c1dea9d5e96066b03a7e2367f61c71aae0 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Sun, 5 Jan 2025 23:44:54 +0100 Subject: [PATCH 19/32] WIP --- README.md | 106 +++++++++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index a9dda50..c0a4436 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ It has all the functionality you would expect from a caching library, but what _robust_ and _highly performant_. If you’re currently retrieving your data from a distributed cache, database, or -API, you're probably able to add this package to your application for a -significant performance boost. As you will see below, there are many ways to -configure this package, and I encourage you to read through this README and -experiment with the examples to get an understanding of how it works. +API, you could probably consume it through this package for a significant +performance boost. As you will see below, there are many ways to configure this +package, and I encourage you to read through this README and experiment with +the examples to get an understanding of how it works. Here is a screenshot showing the P95 latency improvements we've observed after adding this package in front of our distributed key-value store: @@ -94,14 +94,15 @@ log.Println(cacheClient.Size()) log.Println(cacheClient.Get("key1")) ``` -As the final argument to the `New` function, we're also able to provide a wide -range of additional options, which we will explore in detail in the sections +As the final argument to the `New` function, we're also able to provide a large +number of additional options, which we will explore in detail in the sections to follow. 
# Evictions -The cache runs a background job which continuously evicts expired records from -each shard. However, there are options to both tweak the interval: +The cache has two eviction strategies. One is a run a background job which +continuously evicts expired records from each shard. However, there are options +to both tweak the interval at which the job runs: ```go cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, @@ -109,7 +110,7 @@ cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, ) ``` -and disable the functionality altogether: +as well as disabling the functionality altogether: ```go cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, @@ -118,25 +119,25 @@ cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, ``` The latter can give you a slight performance boost in situations where you're -unlikely to exceed any memory limits. +unlikely to ever exceed the capacity of your cache. -When the cache reaches its capacity, a fallback eviction is triggered. This -process performs evictions on a per-shard basis, selecting records for removal -based on recency. The eviction algorithm uses +However, when the cache capacity is reached, the second eviction strategy is +triggered. This process performs evictions on a per-shard basis, selecting +records for removal based on recency. The eviction algorithm uses [quickselect](https://en.wikipedia.org/wiki/Quickselect), which has an O(N) -time complexity without requiring write locks on reads to update a recency -list. +time complexity without the overhead of requiring write locks on reads to +update a recency list, as many LRU caches do. Next, we'll start to look at some of the more _advanced features_. # Get or fetch -I have tried to design the API in a way that should make it effortless to add -`sturdyc` to an existing application. 
To take advantage of the more advanced -functionality you'll essentially just be interacting with two functions: -`GetOrFetch` and `GetOrFetchBatch`. +I have tried to design the API in a way that should make it effortless to start +consuming your applications data through `sturdyc`. To take advantage of all +the more advanced functionality and configurations you'll essentially just be +interacting with two functions: `GetOrFetch` and `GetOrFetchBatch`. -All you would have to do is to take your existing code: +Let's say that we had the following code for fetching orders: ```go func (c *Client) Order(ctx context.Context, id string) (Order, error) { @@ -153,8 +154,8 @@ func (c *Client) Order(ctx context.Context, id string) (Order, error) { } ``` -and wrap the lines of code that retrieves the data in a function, and then hand -that over to our cache client: +All we would have to do is wrap the lines of code that retrieves the data in a +function, and then hand that over to our cache client: ```go func (c *Client) Order(ctx context.Context, id string) (Order, error) { @@ -175,9 +176,9 @@ func (c *Client) Order(ctx context.Context, id string) (Order, error) { } ``` -The cache is then going to return the value from the cache if it's available, -and otherwise it will call the `fetchFn` to retrieve the data from the -underlying data source. +The cache is then going to return the value from memory if it's available, and +otherwise it will call the `fetchFn` to retrieve the data from the underlying +data source. Most of our examples are going to be retrieving data from HTTP APIs, but it's just as easy to wrap a database query, a remote procedure call, a disk read, or @@ -188,19 +189,18 @@ options. # Stampede protection -Cache stampedes (also known as thundering herd) occur when many requests for a -particular piece of data, which has just expired or been evicted from the -cache, come in at once. - -Preventing this has been one of the key objectives for this package. 
We do not -want to cause a significant load on an underlying data source every time one of -our keys expires. To address this, `sturdyc` performs _in-flight_ tracking for -every key. +When we're consuming data through `sturdyc` we'll get automatic protection +against cache stampedes. Cache stampades (also known as thundering herd) occur +when many requests for a particular piece of data, which has just expired or +been evicted from the cache, come in at once. Preventing this has been one of +the key objectives. We do not want to cause a significant load on an underlying +data source every time one of our keys expires. To address this, `sturdyc` +performs _in-flight_ tracking for every key. We can demonstrate this using the `GetOrFetch` function which, as I mentioned -before, takes a key, and a function for retrieving the data if it's not in the +earlier, takes a key, and a function for retrieving the data if it's not in the cache. The cache is going to ensure that we never have more than a single -request per key: +in-flight request per key: ```go var count atomic.Int32 @@ -243,7 +243,7 @@ and that the fetchFn only got called once: The in-flight tracking works for batch operations too. The cache is able to deduplicate a batch of cache misses, and then assemble the response by picking -records from multiple in-flight requests. +records from _multiple_ in-flight requests. To demonstrate this, we'll use the `GetOrFetchBatch` function, which can be used to retrieve data from a data source capable of handling requests for @@ -282,9 +282,16 @@ IDs each: IDs can often be used to fetch data from multiple data sources. As an example, we might use an id to fetch a users orders, payments, shipment options, etc. -Hence, if we're using the cache with an API client, we'll want to prefix this -user id with the actual endpoint we're consuming in order to make the cache key -unique. 
+Hence, if we're using the cache with an API client that is capable of calling +different endpoints, we'll want to prefix this user id with something in order +to avoid key collisions for different data types, e.g: + +```sh +// 1234 is our user id +orders-1234 +payments-1234 +shipments-1234 +``` The package provides more functionality for this that we'll see later on, but for now we'll use the most simple version which adds a string prefix to every @@ -317,9 +324,10 @@ We can now request each batch in a separate goroutine: time.Sleep(time.Second * 3) ``` -At this point, the cache should have in-flight requests for IDs 1-15. Knowing -this, we'll test the stampede protection by launching another five goroutines. -Each goroutine is going to request two random IDs from our batches: +At this point, the cache should have 3 in-flight requests for IDs 1-15. Knowing +this. Let's now test the stampede protection by launching another five +goroutines. Each of these goroutines are going to request two random IDs from +our previous batches: ```go // Launch another 5 goroutines that are going to pick two random IDs from any of our in-flight batches. @@ -344,8 +352,8 @@ Each goroutine is going to request two random IDs from our batches: ``` Running this program, and looking at the logs, we'll see that the cache is able -resolve all of these values without generating any additional outgoing requests -even though the IDs are picked from different batches: +resolve all of the keys from these new goroutines without generating any +additional requests even though we're picking IDs from different batches: ```sh ❯ go run . @@ -392,11 +400,13 @@ This is an important distinction because it means that the cache doesn't just naively refresh every key it's ever seen. Instead, it only refreshes the records that are actually in active rotation, while allowing unused keys to be deleted once their TTL expires. 
This also means that the request that gets -chosen to refresh the value won’t retrieve the updated data right away. To -address this, you can provide a synchronous refresh time, where you essentially -say, "If the data is older than x, I want the refresh to be blocking." +chosen to refresh the value won’t retrieve the updated data right away. +However, there is also a synchronous refresh time that you can provide, where +you essentially say, "If the data is older than x, I want the refresh to be +blocking." -Below is an example configuration that you can use to enable this functionality: +Below is an example configuration that you can use to enable this +functionality: ```go func main() { From 822e7f6fbb4711b619f8bb13717aa10ee9b9e7ca Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Tue, 7 Jan 2025 08:52:51 +0100 Subject: [PATCH 20/32] WIP --- metrics.go | 12 ++++++------ sturdyc_test.go | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metrics.go b/metrics.go index b9d9449..faaba65 100644 --- a/metrics.go +++ b/metrics.go @@ -5,8 +5,8 @@ type MetricsRecorder interface { CacheHit() // CacheMiss is called for every key that results in a cache miss. CacheMiss() - // Refresh is called when a get operation results in a refresh. - BackgroundRefresh() + // AsynchronousRefresh is called when a get operation results in an asynchronous refresh. + AsynchronousRefresh() // SynchronousRefresh is called when a get operation results in a synchronous refresh. SynchronousRefresh() // MissingRecord is called every time the cache is asked to @@ -73,7 +73,7 @@ func (s *shard[T]) reportEntriesEvicted(n int) { } // reportCacheHits is used to report cache hits and misses to the metrics recorder. 
-func (c *Client[T]) reportCacheHits(cacheHit, missingRecord, backgroundRefresh, synchronousRefresh bool) { +func (c *Client[T]) reportCacheHits(cacheHit, missingRecord, asyncRefresh, syncRefresh bool) { if c.metricsRecorder == nil { return } @@ -82,11 +82,11 @@ func (c *Client[T]) reportCacheHits(cacheHit, missingRecord, backgroundRefresh, c.metricsRecorder.MissingRecord() } - if backgroundRefresh { - c.metricsRecorder.BackgroundRefresh() + if asyncRefresh { + c.metricsRecorder.AsynchronousRefresh() } - if synchronousRefresh { + if syncRefresh { c.metricsRecorder.SynchronousRefresh() } diff --git a/sturdyc_test.go b/sturdyc_test.go index f5b4103..9e3927d 100644 --- a/sturdyc_test.go +++ b/sturdyc_test.go @@ -55,7 +55,7 @@ func (r *TestMetricsRecorder) CacheMiss() { r.cacheMisses++ } -func (r *TestMetricsRecorder) BackgroundRefresh() { +func (r *TestMetricsRecorder) AsynchronousRefresh() { r.Lock() defer r.Unlock() r.backgroundRefreshes++ From 3379129fd744db6fe475e76bc6052a30d6af3275 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Tue, 7 Jan 2025 11:08:44 +0100 Subject: [PATCH 21/32] WIP --- README.md | 2 +- cache.go | 6 +++--- distribution_test.go | 25 +++++++++++++------------ fetch.go | 2 +- options.go | 26 +++++++++++++++----------- shard.go | 8 ++++---- 6 files changed, 37 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index c0a4436..ab47159 100644 --- a/README.md +++ b/README.md @@ -1570,7 +1570,7 @@ All you have to do is implement one of these interfaces: type MetricsRecorder interface { CacheHit() CacheMiss() - BackgroundRefresh() + AsynchronousRefresh() SynchronousRefresh() MissingRecord() ForcedEviction() diff --git a/cache.go b/cache.go index e63a5e5..53a523f 100644 --- a/cache.go +++ b/cache.go @@ -30,9 +30,9 @@ type Config struct { log Logger earlyRefreshes bool - minRefreshTime time.Duration - maxRefreshTime time.Duration - synchronousRefreshTime time.Duration + minAsyncRefreshTime time.Duration + maxAsyncRefreshTime 
time.Duration + syncRefreshTime time.Duration retryBaseDelay time.Duration storeMissingRecords bool diff --git a/distribution_test.go b/distribution_test.go index 49fe989..c0c69b5 100644 --- a/distribution_test.go +++ b/distribution_test.go @@ -156,7 +156,7 @@ func TestDistributedStorage(t *testing.T) { fetchObserver.AssertFetchCount(t, 1) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecord(t, key) distributedStorage.assertGetCount(t, 1) @@ -177,7 +177,7 @@ func TestDistributedStorage(t *testing.T) { t.Errorf("expected valuekey1, got %s", res) } - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) fetchObserver.AssertFetchCount(t, 1) distributedStorage.assertGetCount(t, 2) @@ -411,7 +411,7 @@ func TestDistributedStorageBatch(t *testing.T) { fetchObserver.AssertFetchCount(t, 1) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, firstBatchOfIDs, keyFn) distributedStorage.assertGetCount(t, 1) @@ -444,7 +444,7 @@ func TestDistributedStorageBatch(t *testing.T) { fetchObserver.AssertRequestedRecords(t, []string{"4", "5", "6"}) fetchObserver.AssertFetchCount(t, 2) - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. 
time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, secondBatchOfIDs, keyFn) distributedStorage.assertGetCount(t, 2) @@ -480,7 +480,7 @@ func TestDistributedStaleStorageBatch(t *testing.T) { fetchObserver.AssertFetchCount(t, 1) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, firstBatchOfIDs, keyFn) distributedStorage.assertGetCount(t, 1) @@ -546,7 +546,7 @@ func TestDistributedStorageBatchDeletes(t *testing.T) { fetchObserver.AssertFetchCount(t, 1) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, batchOfIDs, keyFn) distributedStorage.assertGetCount(t, 1) @@ -578,7 +578,7 @@ func TestDistributedStorageBatchDeletes(t *testing.T) { fetchObserver.AssertRequestedRecords(t, batchOfIDs) fetchObserver.AssertFetchCount(t, 2) - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, []string{"1", "2"}, keyFn) distributedStorage.assertGetCount(t, 2) @@ -615,7 +615,7 @@ func TestDistributedStorageBatchConvertsToMissingRecord(t *testing.T) { fetchObserver.AssertFetchCount(t, 1) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. 
time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, batchOfIDs, keyFn) distributedStorage.assertGetCount(t, 1) @@ -648,7 +648,7 @@ func TestDistributedStorageBatchConvertsToMissingRecord(t *testing.T) { fetchObserver.AssertFetchCount(t, 2) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, []string{"1", "2"}, keyFn) distributedStorage.assertGetCount(t, 2) @@ -675,7 +675,8 @@ func TestDistributedStorageBatchConvertsToMissingRecord(t *testing.T) { fetchObserver.AssertRequestedRecords(t, batchOfIDs) fetchObserver.AssertFetchCount(t, 3) - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. + time.Sleep(100 * time.Millisecond) time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, batchOfIDs, keyFn) distributedStorage.assertGetCount(t, 3) @@ -727,7 +728,7 @@ func TestDistributedStorageDoesNotCachePartialResponseAsMissingRecords(t *testin fetchObserver.AssertFetchCount(t, 1) fetchObserver.Clear() - // The keys are written asynchonously to the distributed storage. + // The keys are written asynchronously to the distributed storage. time.Sleep(100 * time.Millisecond) distributedStorage.assertRecords(t, batchOfIDs, keyFn) distributedStorage.assertGetCount(t, 1) @@ -817,7 +818,7 @@ func TestPartialResponseForRefreshesDoesNotResultInMissingRecords(t *testing.T) fetchObserver.AssertRequestedRecords(t, ids) fetchObserver.Clear() - // We need to add a sleep because the keys are written asynchonously to the + // We need to add a sleep because the keys are written asynchronously to the // distributed storage. 
We expect that the distributed storage was queried // for the ids before we went to the underlying data source, and then written // to when it resulted in a cache miss and the data was in fact fetched. diff --git a/fetch.go b/fetch.go index b70412d..93b1740 100644 --- a/fetch.go +++ b/fetch.go @@ -88,7 +88,7 @@ func getFetch[V, T any](ctx context.Context, c *Client[T], key string, fetchFn F // GetOrFetch attempts to retrieve the specified key from the cache. If the value // is absent, it invokes the fetchFn function to obtain it and then stores the result. -// Additionally, when background refreshes are enabled, GetOrFetch determines if the record +// Additionally, when early refreshes are enabled, GetOrFetch determines if the record // needs refreshing and, if necessary, schedules this task for background execution. // // Parameters: diff --git a/options.go b/options.go index 9ddbb48..c8a7b91 100644 --- a/options.go +++ b/options.go @@ -52,16 +52,20 @@ func WithMissingRecordStorage() Option { // WithEarlyRefreshes instructs the cache to refresh the keys that are in // active rotation, thereby preventing them from ever expiring. This can have a // significant impact on your application's latency as you're able to -// continuously serve frequently used keys from memory. The background refresh -// gets scheduled when the key is requested again after a random time between -// minRefreshTime and maxRefreshTime. This is an important distinction because -// it means that the cache won't just naively refresh every key it's ever seen. -func WithEarlyRefreshes(minRefreshTime, maxRefreshTime, synchronousRefresthTime, retryBaseDelay time.Duration) Option { +// continuously serve frequently used keys from memory. An asynchronous +// background refresh gets scheduled when a key is requested again after a +// random time between minRefreshTime and maxRefreshTime has passed. 
This is an +// important distinction because it means that the cache won't just naively +// refresh every key it's ever seen. The third argument to this function will +// also allow you to provide a duration for when a refresh should become +// synchronous. If any of the refreshes were to fail, you'll get the latest +// data from the cache for the duration of the TTL. +func WithEarlyRefreshes(minAsyncRefreshTime, maxAsyncRefreshTime, syncRefreshTime, retryBaseDelay time.Duration) Option { return func(c *Config) { c.earlyRefreshes = true - c.minRefreshTime = minRefreshTime - c.maxRefreshTime = maxRefreshTime - c.synchronousRefreshTime = synchronousRefresthTime + c.minAsyncRefreshTime = minAsyncRefreshTime + c.maxAsyncRefreshTime = maxAsyncRefreshTime + c.syncRefreshTime = syncRefreshTime c.retryBaseDelay = retryBaseDelay } } @@ -163,7 +167,7 @@ func validateConfig(capacity, numShards int, ttl time.Duration, evictionPercenta } if !cfg.earlyRefreshes && cfg.bufferRefreshes { - panic("refresh buffering requires background refreshes to be enabled") + panic("refresh buffering requires early refreshes to be enabled") } if cfg.bufferRefreshes && cfg.bufferSize < 1 { @@ -178,11 +182,11 @@ func validateConfig(capacity, numShards int, ttl time.Duration, evictionPercenta panic("evictionInterval must be greater than 0") } - if cfg.minRefreshTime > cfg.maxRefreshTime { + if cfg.minAsyncRefreshTime > cfg.maxAsyncRefreshTime { panic("minRefreshTime must be less than or equal to maxRefreshTime") } - if cfg.maxRefreshTime > cfg.synchronousRefreshTime { + if cfg.maxAsyncRefreshTime > cfg.syncRefreshTime { panic("maxRefreshTime must be less than or equal to synchronousRefreshTime") } diff --git a/shard.go b/shard.go index bb512b7..520f7a0 100644 --- a/shard.go +++ b/shard.go @@ -169,11 +169,11 @@ func (s *shard[T]) set(key string, value T, isMissingRecord bool) bool { // If there is a difference between the min- and maxRefreshTime we'll use that to // set a random padding so that the 
refreshes get spread out evenly over time. var padding time.Duration - if s.minRefreshTime != s.maxRefreshTime { - padding = time.Duration(rand.Int64N(int64(s.maxRefreshTime - s.minRefreshTime))) + if s.minAsyncRefreshTime != s.maxAsyncRefreshTime { + padding = time.Duration(rand.Int64N(int64(s.maxAsyncRefreshTime - s.minAsyncRefreshTime))) } - newEntry.backgroundRefreshAt = now.Add(s.minRefreshTime + padding) - newEntry.synchronousRefreshAt = now.Add(s.synchronousRefreshTime) + newEntry.backgroundRefreshAt = now.Add(s.minAsyncRefreshTime + padding) + newEntry.synchronousRefreshAt = now.Add(s.syncRefreshTime) newEntry.numOfRefreshRetries = 0 } From 22664d7c1facc19b849e433fcf593f989af2add4 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Tue, 7 Jan 2025 11:22:42 +0100 Subject: [PATCH 22/32] WIP --- README.md | 10 +++++----- cache.go | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ab47159..dbabc9e 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,11 @@ It has all the functionality you would expect from a caching library, but what **sets it apart** are the features designed to make I/O heavy applications both _robust_ and _highly performant_. -If you’re currently retrieving your data from a distributed cache, database, or -API, you could probably consume it through this package for a significant -performance boost. As you will see below, there are many ways to configure this -package, and I encourage you to read through this README and experiment with -the examples to get an understanding of how it works. +We have been using it in production to enhance both the performance and +reliability of our services that retrieve data from distributed caches, +databases, and APIs. While the API surface of this package is tiny, it offers +extensive configuration options. I encourage you to read through this README +and experiment with the examples in order to understand its full capabilities. 
Here is a screenshot showing the P95 latency improvements we've observed after adding this package in front of our distributed key-value store: diff --git a/cache.go b/cache.go index 53a523f..8d5aba1 100644 --- a/cache.go +++ b/cache.go @@ -29,12 +29,12 @@ type Config struct { metricsRecorder DistributedMetricsRecorder log Logger - earlyRefreshes bool - minAsyncRefreshTime time.Duration - maxAsyncRefreshTime time.Duration - syncRefreshTime time.Duration - retryBaseDelay time.Duration - storeMissingRecords bool + earlyRefreshes bool + minAsyncRefreshTime time.Duration + maxAsyncRefreshTime time.Duration + syncRefreshTime time.Duration + retryBaseDelay time.Duration + storeMissingRecords bool bufferRefreshes bool batchMutex sync.Mutex From afff7de409c68f818c842a6a921617e38e2bf4c8 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Tue, 7 Jan 2025 15:25:49 +0100 Subject: [PATCH 23/32] WIP --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index dbabc9e..e50de33 100644 --- a/README.md +++ b/README.md @@ -20,14 +20,14 @@ It has all the functionality you would expect from a caching library, but what **sets it apart** are the features designed to make I/O heavy applications both _robust_ and _highly performant_. -We have been using it in production to enhance both the performance and -reliability of our services that retrieve data from distributed caches, -databases, and APIs. While the API surface of this package is tiny, it offers +We have been using this package in production to enhance both the performance +and reliability of our services that retrieve data from distributed caches, +databases, and APIs. While the API surface of sturdyc is tiny, it offers extensive configuration options. I encourage you to read through this README -and experiment with the examples in order to understand its full capabilities. +and experiment with the examples to understand its full capabilities. 
-Here is a screenshot showing the P95 latency improvements we've observed after adding -this package in front of our distributed key-value store: +Here is a screenshot showing the P95 latency improvements we observed after +adding this package in front of a distributed key-value store:   Screenshot 2024-05-10 at 10 15 18 @@ -94,9 +94,10 @@ log.Println(cacheClient.Size()) log.Println(cacheClient.Get("key1")) ``` -As the final argument to the `New` function, we're also able to provide a large -number of additional options, which we will explore in detail in the sections -to follow. + +The `New` function is variadic, and as the final argument we're also able to +provide a wide range of configuration options, which we will explore in detail +in the sections to follow. # Evictions From 772f74ee478cd895a9d609307709e1aba9101548 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 08:25:04 +0100 Subject: [PATCH 24/32] WIP --- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e50de33..4a149fe 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ in the sections to follow. # Evictions -The cache has two eviction strategies. One is a run a background job which +The cache has two eviction strategies. One is a background job which continuously evicts expired records from each shard. However, there are options to both tweak the interval at which the job runs: @@ -120,7 +120,7 @@ cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, ``` The latter can give you a slight performance boost in situations where you're -unlikely to ever exceed the capacity of your cache. +unlikely to ever exceed the capacity you've assigned to your cache. However, when the cache capacity is reached, the second eviction strategy is triggered. This process performs evictions on a per-shard basis, selecting @@ -133,12 +133,11 @@ Next, we'll start to look at some of the more _advanced features_. 
# Get or fetch -I have tried to design the API in a way that should make it effortless to start -consuming your applications data through `sturdyc`. To take advantage of all -the more advanced functionality and configurations you'll essentially just be -interacting with two functions: `GetOrFetch` and `GetOrFetchBatch`. +The API has been designed to make the process of integrating `sturdyc` with any +data source as straightforward as possible. The more advanced functionality is +accessed through just two core functions: `GetOrFetch` and `GetOrFetchBatch` -Let's say that we had the following code for fetching orders: +As an example, let's say that we had the following code for fetching orders: ```go func (c *Client) Order(ctx context.Context, id string) (Order, error) { From dae632638ac83e4a1b9cebca4411dbe6b61121a9 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 08:32:12 +0100 Subject: [PATCH 25/32] WIP --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4a149fe..c0f24e3 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,10 @@ adding this package in front of a distributed key-value store: Screenshot 2024-05-10 at 10 15 18   -In addition to this, we were also able to reduce our number of outgoing -requests by more than 90% after enabling the _refresh coalescing_ functionality. +In addition to this, we were able to reduce our outgoing requests by more than +90% after utilizing both the in-flight tracking of cache keys and refresh +coalescing functionality, which in turn has allowed us to use fewer containers +and much cheaper clusters. 
# Table of contents From a89cc59f3c0c0ca8ae8a2156056e5c75a09f138c Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 10:54:41 +0100 Subject: [PATCH 26/32] WIP --- README.md | 270 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 168 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index c0f24e3..44110c5 100644 --- a/README.md +++ b/README.md @@ -135,11 +135,14 @@ Next, we'll start to look at some of the more _advanced features_. # Get or fetch -The API has been designed to make the process of integrating `sturdyc` with any -data source as straightforward as possible. The more advanced functionality is -accessed through just two core functions: `GetOrFetch` and `GetOrFetchBatch` +I have designed the API in a way that should make the process of integrating +`sturdyc` with any data source as straightforward as possible. While it +provides the basic get/set methods you would expect from a cache, the advanced +functionality is accessed through just two core functions: `GetOrFetch` and +`GetOrFetchBatch` -As an example, let's say that we had the following code for fetching orders: +As an example, let's say that we had the following code for fetching orders +from an API: ```go func (c *Client) Order(ctx context.Context, id string) (Order, error) { @@ -178,26 +181,109 @@ func (c *Client) Order(ctx context.Context, id string) (Order, error) { } ``` -The cache is then going to return the value from memory if it's available, and -otherwise it will call the `fetchFn` to retrieve the data from the underlying -data source. +The cache is going to return the value from memory if it's available, and +otherwise will call the `fetchFn` to retrieve the data from the underlying data +source. Most of our examples are going to be retrieving data from HTTP APIs, but it's just as easy to wrap a database query, a remote procedure call, a disk read, or any other I/O operation. 
-We'll also see how we can use closures to pass query parameters and other -options. +The `fetchFn` that we pass to `GetOrFetch` has the following function +signature: + +```go +type FetchFn[T any] func(ctx context.Context) (T, error) +``` + +For data sources capable of handling requests for multiple records at once, +we'll use `GetOrFetchBatch`: + +```go +type KeyFn func(id string) string + +type BatchFetchFn[T any] func(ctx context.Context, ids []string) (map[string]T, error) + +func (c *Client[T]) GetOrFetchBatch(ctx context.Context, ids []string, keyFn KeyFn, fetchFn BatchFetchFn[T]) (map[string]T, error) { + // ... +} +``` + +There are a few things to unpack here, so let's start with the `KeyFn`. When +adding an in-memory cache to an API client capable of calling multiple +endpoints, it's highly unlikely that an ID alone is going to be enough to +uniquely identify a record. + +To illustrate, let's say that we're building a Github client and want to use +this package to get around their rate limit. The username itself wouldn't make +for a good cache key because we could use it to fetch gists, commits, +repositories, etc. Therefore, `GetOrFetchBatch` takes a `KeyFn` that prefixes +each ID with something to identify the data source so that we don't get cache +key collisions: + +```go +gistPrefixFn := cacheClient.BatchKeyFn("gists") +commitPrefixFn := cacheClient.BatchKeyFn("commits") +gists, err := cacheClient.GetOrFetchBatch(ctx, userIDs, gistPrefixFn, fetchGists) +commits, err := cacheClient.GetOrFetchBatch(ctx, userIDs, commitPrefixFn, fetchCommits) +``` + +We're now able to use the same cache for multiple data sources, and internally +we'd get cache keys of this format: + +``` +gists-ID-viccon +gists-ID-some-other-user +commits-ID-viccon +commits-ID-some-other-user +``` + +Now, let's use a bit of our imagination because Github doesn't actually allow +us to fetch gists from multiple users at once. 
However, if they did, our client +would probably look something like this: + +```go +func (client *GithubClient) Gists(ctx context.Context, usernames []string) (map[string]Gist, error) { + cacheKeyFn := client.cache.BatchKeyFn("gists") + fetchFunc := func(ctx context.Context, cacheMisses []string) (map[string]Gist, error) { + timeoutCtx, cancel := context.WithTimeout(ctx, client.timeout) + defer cancel() + + var response map[string]Gist + err := requests.URL(c.baseURL). + Path("/gists"). + Param("usernames", strings.Join(cacheMisses, ",")). + ToJSON(&response). + Fetch(timeoutCtx) + return response, err + } + return sturdyc.GetOrFetchBatch(ctx, client.cache, usernames, cacheKeyFn, fetchFunc) +} +``` + +In the example above, the fetchFunc would get called for users who don't have +their gists in our cache, and the cacheMisses slice would contain their actual +usernames (without the prefix from the keyFn). + +The map that we return from our `fetchFunc` should have the IDs (in this case the +usernames) as keys, and the actual data that we want to cache (the gist) as the +value. + +Later, we'll see how we can use closures to pass query parameters and options +to our fetch functions, as well as how to use the PermutatedBatchKeyFn to +create unique cache keys for each permutation of them. # Stampede protection When we're consuming data through `sturdyc` we'll get automatic protection -against cache stampedes. Cache stampades (also known as thundering herd) occur -when many requests for a particular piece of data, which has just expired or -been evicted from the cache, come in at once. Preventing this has been one of -the key objectives. We do not want to cause a significant load on an underlying -data source every time one of our keys expires. To address this, `sturdyc` -performs _in-flight_ tracking for every key. +against cache stampedes. 
If you're not familiar with the term, a cache stampede
+(also known as thundering herd) occurs when many requests for a particular
+piece of data, which has just expired or been evicted from the cache, come in
+at once.
+
+Preventing this has been one of the key objectives. We do not want to cause a
+significant load on an underlying data source every time one of our keys
+expires. To address this, `sturdyc` performs _in-flight_ tracking for every key.
 
 We can demonstrate this using the `GetOrFetch` function which, as I mentioned
 earlier, takes a key, and a function for retrieving the data if it's not in the
@@ -207,16 +293,18 @@ in-flight request per key:
 ```go
 	var count atomic.Int32
 	fetchFn := func(_ context.Context) (int, error) {
+		// Increment the count so that we can assert how many times this function was called.
 		count.Add(1)
 		time.Sleep(time.Second)
 		return 1337, nil
 	}
 
+	// Fetch the same key from 5 goroutines.
 	var wg sync.WaitGroup
 	for i := 0; i < 5; i++ {
 		wg.Add(1)
 		go func() {
-			// We can ignore the error given the fetchFn we're using.
+			// We'll ignore the error here for brevity.
 			val, _ := cacheClient.GetOrFetch(context.Background(), "key2", fetchFn)
 			log.Printf("got value: %d\n", val)
 			wg.Done()
@@ -229,8 +317,8 @@ in-flight request per key:
 ```
 
-Running this program we'll see that our requests for "key2" got deduplicated,
-and that the fetchFn only got called once:
+Running this program we can see that we were able to retrieve the value for all
+5 goroutines, and that the fetchFn only got called once:
 
 ```sh
 ❯ go run .
@@ -243,114 +331,92 @@ and that the fetchFn only got called once:
 2024/05/21 08:06:29 1337 true
 ```
 
-The in-flight tracking works for batch operations too. The cache is able to
-deduplicate a batch of cache misses, and then assemble the response by picking
-records from _multiple_ in-flight requests.
+The in-flight tracking works for batch operations too where the cache is able +to deduplicate a batch of cache misses, and then assemble the response by +picking records from **multiple** in-flight requests. -To demonstrate this, we'll use the `GetOrFetchBatch` function, which can be -used to retrieve data from a data source capable of handling requests for -multiple records at once. +To demonstrate this, we'll use the `GetOrFetchBatch` function, which as mentioned +earlier, can be used to retrieve data from a data source capable of handling +requests for multiple records at once. We'll start by creating a mock function that sleeps for `5` seconds, and then returns a map with a numerical value for every ID: ```go - var count atomic.Int32 - fetchFn := func(_ context.Context, ids []string) (map[string]int, error) { - // Increment the counter so that we can assert how many times this function was called. - count.Add(1) - time.Sleep(time.Second * 5) - - response := make(map[string]int, len(ids)) - for _, id := range ids { - num, _ := strconv.Atoi(id) - response[id] = num - } - - return response, nil +var count atomic.Int32 +fetchFn := func(_ context.Context, ids []string) (map[string]int, error) { + // Increment the counter so that we can assert how many times this function was called. + count.Add(1) + time.Sleep(time.Second * 5) + + response := make(map[string]int, len(ids)) + for _, id := range ids { + num, _ := strconv.Atoi(id) + response[id] = num } -``` -Next, we'll need some batches to test with, so I created three batches with 5 -IDs each: - -```go - batches := [][]string{ - {"1", "2", "3", "4", "5"}, - {"6", "7", "8", "9", "10"}, - {"11", "12", "13", "14", "15"}, - } + return response, nil +} ``` -IDs can often be used to fetch data from multiple data sources. As an example, -we might use an id to fetch a users orders, payments, shipment options, etc. 
-Hence, if we're using the cache with an API client that is capable of calling -different endpoints, we'll want to prefix this user id with something in order -to avoid key collisions for different data types, e.g: +Next, we'll need some batches to test with, so here I've created three batches +with 5 IDs each: -```sh -// 1234 is our user id -orders-1234 -payments-1234 -shipments-1234 +```go +batches := [][]string{ + {"1", "2", "3", "4", "5"}, + {"6", "7", "8", "9", "10"}, + {"11", "12", "13", "14", "15"}, +} ``` -The package provides more functionality for this that we'll see later on, but -for now we'll use the most simple version which adds a string prefix to every -ID: +and we can now request each batch in a separate goroutine: ```go - keyPrefixFn := cacheClient.BatchKeyFn("my-data-source") -``` - -This will result in cache keys of this format: +for _, batch := range batches { + go func() { + res, _ := cacheClient.GetOrFetchBatch(context.Background(), batch, keyPrefixFn, fetchFn) + log.Printf("got batch: %v\n", res) + }() +} -``` -my-data-source-ID-1 -my-data-source-ID-2 -my-data-source-ID-3 +// Just to ensure that these batches are in fact in-flight, we'll sleep to give the goroutines a chance to run. +time.Sleep(time.Second * 2) ``` -We can now request each batch in a separate goroutine: +At this point, the cache should have 3 in-flight requests for IDs 1-15: -```go - for _, batch := range batches { - go func() { - res, _ := cacheClient.GetOrFetchBatch(context.Background(), batch, keyPrefixFn, fetchFn) - log.Printf("got batch: %v\n", res) - }() - } - - // Sleep to give the goroutines above a chance to run. - // This ensures that the batches are in-flight. - time.Sleep(time.Second * 3) +```sh +[1,2,3,4,5] => REQUEST 1 (IN-FLIGHT) +[6,7,8,9,10] => REQUEST 2 (IN-FLIGHT) +[11,12,13,14,15] => REQUEST 3 (IN-FLIGHT) ``` -At this point, the cache should have 3 in-flight requests for IDs 1-15. Knowing -this. 
Let's now test the stampede protection by launching another five -goroutines. Each of these goroutines are going to request two random IDs from -our previous batches: +Knowing this, let's test the stampede protection by launching another five +goroutines. Each of these goroutines will request two random IDs from our +previous batches. For example, they could request one ID from the first +request, and another from the second or third. ```go - // Launch another 5 goroutines that are going to pick two random IDs from any of our in-flight batches. - // e.g: - // [1,8] - // [4,11] - // [14,2] - // [6,15] - var wg sync.WaitGroup - for i := 0; i < 5; i++ { - wg.Add(1) - go func() { - ids := []string{batches[rand.IntN(2)][rand.IntN(4)], batches[rand.IntN(2)][rand.IntN(4)]} - res, _ := cacheClient.GetOrFetchBatch(context.Background(), ids, keyPrefixFn, fetchFn) - log.Printf("got batch: %v\n", res) - wg.Done() - }() - } +// Launch another 5 goroutines that are going to pick two random IDs from any of our in-flight batches. 
+// e.g: +// [1,8] +// [4,11] +// [14,2] +// [6,15] +var wg sync.WaitGroup +for i := 0; i < 5; i++ { + wg.Add(1) + go func() { + ids := []string{batches[rand.IntN(2)][rand.IntN(4)], batches[rand.IntN(2)][rand.IntN(4)]} + res, _ := cacheClient.GetOrFetchBatch(context.Background(), ids, keyPrefixFn, fetchFn) + log.Printf("got batch: %v\n", res) + wg.Done() + }() +} - wg.Wait() - log.Printf("fetchFn was called %d times\n", count.Load()) +wg.Wait() +log.Printf("fetchFn was called %d times\n", count.Load()) ``` Running this program, and looking at the logs, we'll see that the cache is able From 5e2c6faf6a148604293a1a90d3933fab15e87d08 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 13:22:22 +0100 Subject: [PATCH 27/32] WIP --- README.md | 193 +++++++++++++++++++++++++++++------------------------- 1 file changed, 104 insertions(+), 89 deletions(-) diff --git a/README.md b/README.md index 44110c5..b3f3957 100644 --- a/README.md +++ b/README.md @@ -420,7 +420,7 @@ log.Printf("fetchFn was called %d times\n", count.Load()) ``` Running this program, and looking at the logs, we'll see that the cache is able -resolve all of the keys from these new goroutines without generating any +resolve all of the ids from these new goroutines without generating any additional requests even though we're picking IDs from different batches: ```sh @@ -452,15 +452,15 @@ applications this may be acceptable, but in others it can introduce stale reads. Additionally, once the cached value expires, the first request after expiration must refresh the cache, resulting in a longer response time for that user. This can make the average latency look very different from the P90–P99 -tail latencies, since those percentiles capture the refresh delays. This can -make it difficult to configure appropriate alarms for your applications -response times. +tail latencies, since those percentiles capture the delays of having to go to +the actual data source in order to refresh the cache. 
This in turn can make it
+difficult to configure appropriate alarms for your application's response times.
 
 `sturdyc` aims to give you a lot of control over these choices when you enable
 the early refreshes functionality. It will prevent your most frequently used
 records from ever expiring by continuously refreshing them in the background.
-This has a significant impact on your applications latency. We've seen the P99
-of some of our applications go from 50ms down to 1.
+This can have a significant impact on your application's latency. We've seen the
+P99 of some of our applications go from 50ms down to 1.
 
 One thing to note about these background refreshes is that they are scheduled
 if a key is **requested again** after a configurable amount of time has passed.
@@ -468,10 +474,16 @@ This is an important distinction because it means that the cache doesn't just
 naively refresh every key it's ever seen. Instead, it only refreshes the
 records that are actually in active rotation, while allowing unused keys to be
 deleted once their TTL expires. This also means that the request that gets
-chosen to refresh the value won’t retrieve the updated data right away.
-However, there is also a synchronous refresh time that you can provide, where
-you essentially say, "If the data is older than x, I want the refresh to be
-blocking."
+chosen to refresh the value won’t retrieve the updated data right away as the
+refresh happens asynchronously.
+
+However, asynchronous refreshes can be problematic. What if some keys only get
+requested very infrequently? If the refreshes are done in the background the
+latency will be low, but the data itself might be stale.
+
+To solve this, you also get to provide a synchronous refresh time. This
+essentially tells the cache: "If the data is older than x, I want the refresh
+to be blocking and wait for the response."
Below is an example configuration that you can use to enable this functionality: @@ -587,30 +593,33 @@ come in for this key, the stampede protection will kick in and make the refresh synchronous for all of them, while also ensuring that only a single request is made to the underlying data source. -I also like to use this feature to provide a degraded experience when an +Sometimes I like to use this feature to provide a degraded experience when an upstream system encounters issues. For this, I choose a high TTL and a low refresh time, so that when everything is working as expected, the records are refreshed continuously. However, if the upstream system stops responding, I can rely on cached records for the entire duration of the TTL. -One important note is that the synchronous refresh time isn’t affected by the -exponential backoff. The number of background refreshes is going to get reduced -if an upsteam system is experiencing errors. However, if we reach a point where -all of the records are older than the synchronous refresh time, we're going to -send a steady stream of outgoing requests. That is because I think of the -synchronous refresh time as "I really don’t want the data to be older than -this," so if a synchronous refresh fails, I want the very next request to -attempt another refresh, because the data is now older than I’d like it to be. +This also brings us to the final argument of the `WithEarlyRefreshes` function +which is the retry base delay. This delay is used to create an exponential +backoff for our background requests if a data source starts to return errors. +Please note that this **only** applies to background refreshes. If we reach a +point where all of the records are older than the synchronous refresh time, +we're going to send a steady stream of outgoing requests. 
That is because I +think of the synchronous refresh time as "I really don’t want the data to be +older than this, but I want the possibility of using an even higher TTL in +order to serve stale." Therefore, if a synchronous refresh fails, I want the +very next request for that key to attempt another refresh. -Also, if you don't want this functionality you could just set a short TTL. The -cache will never return a record where the TTL has expired. +Also, if you don't want any of this serve stale functionality you could just +use short TTLs. The cache will never return a record where the TTL has expired. The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/refreshes) # Deletions -What if the record was deleted? Our cache might use a 2-hour-long TTL, and we -definitely don't want it to take that long for the deletion to propagate. +What if a record gets deleted at the underlying data source? Our cache might +use a 2-hour-long TTL, and we definitely don't want it to take that long for +the deletion to propagate. However, if we were to modify our client from the previous example so that it returns an error after the first request: @@ -645,8 +654,9 @@ cd examples/refreshes go run . ``` -We'll see that the exponential backoff kicks in, resulting in more iterations -for every refresh, but the value is still being printed: +We'll see that the exponential backoff kicks in, delaying our background +refreshes which results in more iterations for every refresh, but the value is +still being printed: ```sh 2024/05/09 13:22:03 Fetching value for key: key @@ -689,7 +699,7 @@ empty list, specific error message, etc. There is no easy way for the cache to figure this out implicitly. It couldn't simply delete a record every time it receives an error. 
If an -upstream system goes down, we want to be able to serve stale data for the +upstream system goes down, we want to be able to serve the data for the duration of the TTL, while reducing the frequency of our refreshes to make it easier for them to recover. @@ -708,8 +718,6 @@ fetchFn := func(_ context.Context) (string, error) { ``` This tells the cache that the record is no longer available at the underlying data source. -Therefore, if this record is being fetched as a background refresh, the cache will quickly see -if it has a record for this key, and subsequently delete it. If we run this application again we'll see that it works, and that we're no longer getting any cache hits. This leads to outgoing requests for every @@ -749,8 +757,8 @@ just a single ID wasn't found: } ``` -and then have the cache swallow that error and return nil, felt much less -intuitive. +and then have the cache either swallow that error and return nil, or return the +map with the error, felt much less intuitive. This code is based on the example available [here.](https://github.com/viccon/sturdyc/tree/main/examples/refreshes) @@ -922,7 +930,7 @@ type BatchFetchFn[T any] func(ctx context.Context, ids []string) (map[string]T, ``` The cache can use this to iterate through the response map, again apply the -`keyFn` to each ID, and then store each record individually in the cache. +`keyFn` to each ID, and then store each record individually. Sometimes, the function signature for the `BatchFetchFn` can feel too limited. You may need additional options and not just the IDs to retrieve the data. But @@ -1061,15 +1069,26 @@ func (c *Client) MoviesByIDs(ctx context.Context, ids []string, opts MoviesByIDs } ``` -The API clients `MoviesByIDs` function calls an external API to fetch movies by -IDs, and the `BatchFetchFn` that we're passing to `sturdyc` uses a closure to -provide the query parameters we need. 
+The API clients `MoviesByIDs` method calls an external API to fetch movies by +IDs, and the `BatchFetchFn` that we're passing to `sturdyc` has a closure over +the query parameters we need. However, one **important** thing to note here is that the ID is _no longer_ -enough to _uniquely_ identify a record in our cache. The query parameters will -most likely be used by the system we're calling to transform the data in -various ways. Hence, we should cache each movie once for each permutation of -our options: +enough to _uniquely_ identify a record in our cache even with the basic prefix +function we've used before. It will no longer work to just have cache keys that +looks like this: + +``` +movies-ID-1 +movies-ID-2 +movies-ID-3 +``` + +Now why is that? If you think about it, the query parameters will most likely +be used by the system we're calling to transform the data in various ways. +Hence, we need to store a movie not only once per ID, but also once per +transformation. In other terms, we should cache each movie once for each +permutation of our options: ``` IncludeUpcoming: true IncludeUpsell: true @@ -1080,8 +1099,7 @@ IncludeUpcoming: false IncludeUpsell: true This is what the `PermutatedBatchKeyFn` is used for. It takes a prefix and a struct which internally it uses reflection on in order to concatenate the -**exported** fields to form a unique cache key that would look something like -this: +**exported** fields to form a unique cache key that would look like this: ``` // movies-by-ids is our prefix that we passed as the @@ -1166,7 +1184,7 @@ func main() { ``` At this point, the cache has stored each record individually for each option -set. We can imagine that the keys would look something like this: +set. The keys would look something like this: ``` FEDEX-2024-04-06-ID-1 @@ -1346,17 +1364,17 @@ go run . ``` The number of refreshes went from **9** to **3**. Imagine what a batch size of -50 would could do for your applications performance! 
+50 could do for your applications performance! There is more information about this in the section about metrics, but for our -production applications we're also using the caches `WithMetrics` option so -that we can monitor how well our refreshes are performing: +production applications we're also using the `WithMetrics` option so that we +can monitor how well our refreshes are performing: Screenshot 2024-05-04 at 12 38 04 -> This chart shows the batch sizes for our coalesced refreshes. +This chart shows the batch sizes for our coalesced refreshes. Screenshot 2024-05-04 at 12 38 20 -> This chart shows the average batch size of our refreshes for two different data sources +This chart shows the average batch size of our refreshes for two different data sources The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/buffering) @@ -1387,19 +1405,19 @@ are making use of the `GetOrFetchBatch` function, we'll ask the cache (using the `WithRefreshCoalescing` option) to delay them for up to 15 seconds or until a batch size of 10 is reached. -What if a key that hasn't been refreshed in the last 120 seconds is suddenly -requested? Given the `synchronousRefreshDelay` passed to the -`WithEarlyRefreshes` option, the cache will skip any background refresh and -instead perform a synchronous refresh to ensure that the data is fresh. Did -1000 requests suddenly arrive for this key? No problem, the in-flight tracking -makes sure that we only make **one** request to the underlying data source. -This works for refreshes too by the way. If 1000 requests arrived for a key -that was 3 seconds old (greater than our `maxRefreshDelay`) we'd only schedule -a single refresh for it. +What if we get a request for a key that hasn't been refreshed in the last 120 +seconds? 
Given the `synchronousRefreshDelay` passed to the `WithEarlyRefreshes` +option, the cache will skip any background refresh and instead perform a +synchronous refresh to ensure that the data is fresh. Did 1000 requests +suddenly arrive for this key? No problem, the in-flight tracking makes sure +that we only make **one** request to the underlying data source. This works for +refreshes too by the way. If 1000 requests arrived for a key that was 3 seconds +old (greater than our `maxRefreshDelay`) we would only schedule a single +refresh for it. Is the underlying data source experiencing downtime? With our TTL of two-hours -we'll be able to provide a degraded experience to our users by serving stale -data from our cache while continuously trying to refresh it in the background. +we'll be able to provide a degraded experience to our users by serving the data +we have in our cache. # Passthrough @@ -1410,25 +1428,22 @@ still perform in-flight tracking and deduplicate your requests. # Distributed storage -I think it's important to read the previous sections before jumping here in -order to understand all the heavy lifting `sturdyc` does when it comes to -creating cache keys, tracking in-flight requests, refreshing records in the -background to improve latency, and buffering/coalescing requests to minimize -the number of round trips to underlying data sources. Because, as you’ll soon -see, we’ll leverage these features when adding distributed storage to our cache -as well. - -However, let's first try and understand when this functionality could be -useful. I like to use this feature when I'm building an application that is -able to achieve a high cache hit rate, while also being subject to large bursts -of traffic. - -To provide a real life example example of this, I've used this in production -for a large streaming application. The content was fairly static; new movies, -series, and episodes were only ingested a couple of times an hour. 
That meant -that we could achieve a very high hit rate for our data sources. However, -during the evenings, when a popular football match or TV show was about to -start, our traffic could spike by a factor of 20 within less than a minute. +It's important to read the previous sections before jumping here in order to +understand how `sturdyc` works when it comes to creating cache keys, tracking +in-flight requests, refreshing records in the background, and +buffering/coalescing requests to minimize the number of round trips we have to +make to an underlying data source. As you'll soon see, we'll leverage all of +these features when we're adding distributed storage. + +However, let's first understand when this functionality can be useful. This +feature is particularly valuable when building applications that can achieve a +high cache hit rate while also being subject to large bursts of requests. As an +example, I've used this in production for a large streaming application. The +content was fairly static - new movies, series, and episodes were only ingested +a couple of times an hour. This meant that we could achieve a very high hit +rate for our data sources. However, during the evenings, when a popular +football match or TV show was about to start, our traffic could spike by a +factor of 20 within less than a minute. To illustrate the problem further, let’s say the hit rate for our in-memory cache was 99.8%. Then, when we received that large burst of traffic, our @@ -1438,12 +1453,12 @@ significant load on our underlying data sources as soon as they came online, because every request they received led to an outgoing request to the data source. And these data sources had gotten used to being shielded from most of the traffic by the older containers high hit-rate and refresh coalescing usage. -Hence, what was a 20x spike for us could become a 200x spike for them until our -new containers had warmed their cache. 
+Hence, what was a 20x spike for us could easily become a 200x spike for them +until our new containers had warmed their cache. Therefore, I decided to add the ability to have the containers sync their in-memory cache with a distributed key-value store that would have an easier -time to absorb these bursts. +time absorbing these bursts. Adding distributed storage to the cache is, from the package's point of view, essentially just another data source with a higher priority. Hence, we're still @@ -1510,7 +1525,7 @@ cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, policies of this storage. `sturdyc` will only make sure that it queries this data source first, and then writes the keys and values to this storage as soon as it has gone out to an underlying data source and refreshed them. Therefore, -I'd advice you touse the configuration above with short TTLs for the +I'd advice you to use the configuration above with short TTLs for the distributed storage, or things might get too stale. I mostly think it's useful if you're consuming data sources that don't handle bursts from new containers very well. @@ -1544,8 +1559,8 @@ this: ``` Above we can see that the underlying data source was only visited **once**, and -that the remaining background refreshes that the in-memory cache performed went -only went to the distributed storage. +that the remaining background refreshes that the in-memory cache performed only +went to the distributed storage. # Distributed storage early refreshes @@ -1566,7 +1581,7 @@ cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, ``` With a configuration like this, I would usually set the TTL for the distributed -storage to something like an hour. However, if the cache queries the +storage to something like an hour. 
However, if `sturdyc` queries the
 distributed storage and finds that a record is older than 1 minute (the second
 argument to the function), it will refresh the record from the underlying data
 source, and then write the updated value back to it. So the interaction with
@@ -1580,8 +1595,8 @@ the distributed storage would look something like this:
 - If the call to refresh the data failed, the cache will use the value from
   the distributed storage as a fallback.
 
-However, there is one more scenario we must cover that requires two additional
-methods to be implemented:
+However, there is one more scenario we must cover now that requires two
+additional methods to be implemented:
 
 ```go
 type DistributedStorageEarlyRefreshes interface {
@@ -1682,16 +1697,16 @@ cacheDistributedMetrics := sturdyc.New[any](
 Below are a few images where some of these metrics have been visualized in Grafana:
 
 Screenshot 2024-05-04 at 12 36 43
-> Here we can how often we're able to serve from memory.
+Here we can see how often we're able to serve from memory.
 
 Screenshot 2024-05-04 at 12 37 39
-> This image displays the number of items we have cached.
+This image displays the number of items we have cached.
 
 Screenshot 2024-05-04 at 12 38 04
-> This chart shows the batch sizes for the buffered refreshes.
+This chart shows the batch sizes for the buffered refreshes.
 
 Screenshot 2024-05-04 at 12 38 20
-> And lastly, we can see the average batch size of our refreshes for two different data sources.
+And lastly, we can see the average batch size of our refreshes for two different data sources.
# Generics From ccf8c30d0588efa6b10c6e1943ecf4b92453c19e Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 13:23:35 +0100 Subject: [PATCH 28/32] WIP --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index b3f3957..7a7e18e 100644 --- a/README.md +++ b/README.md @@ -1382,21 +1382,21 @@ Another point to note is how effectively the options we've seen so far can be combined to create high-performing, flexible, and robust caching solutions: ```go - capacity := 10000 - numShards := 10 - ttl := 2 * time.Hour - evictionPercentage := 10 - minRefreshDelay := time.Second - maxRefreshDelay := time.Second * 2 - synchronousRefreshDelay := time.Second * 120 // 2 minutes. - retryBaseDelay := time.Millisecond * 10 - batchSize := 10 - batchBufferTimeout := time.Second * 15 +capacity := 10000 +numShards := 10 +ttl := 2 * time.Hour +evictionPercentage := 10 +minRefreshDelay := time.Second +maxRefreshDelay := time.Second * 2 +synchronousRefreshDelay := time.Second * 120 // 2 minutes. 
+retryBaseDelay := time.Millisecond * 10 +batchSize := 10 +batchBufferTimeout := time.Second * 15 - cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, - sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), - sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), - ) +cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage, + sturdyc.WithEarlyRefreshes(minRefreshDelay, maxRefreshDelay, synchronousRefreshDelay, retryBaseDelay), + sturdyc.WithRefreshCoalescing(batchSize, batchBufferTimeout), +) ``` With the configuration above, the keys in active rotation are going to be From 765d55fe7ee7415f1cf4aa1e2182f90384eef6d1 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 14:27:00 +0100 Subject: [PATCH 29/32] WIP --- README.md | 99 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 7a7e18e..ee194e2 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,18 @@ [![Test](https://github.com/viccon/sturdyc/actions/workflows/main.yml/badge.svg)](https://github.com/viccon/sturdyc/actions/workflows/main.yml) [![codecov](https://codecov.io/gh/viccon/sturdyc/graph/badge.svg?token=CYSKW3Z7E6)](https://codecov.io/gh/viccon/sturdyc) -`Sturdyc` is an in-memory cache that supports **non-blocking reads** and has a -configurable number of shards that makes it possible to achieve writes -**without any lock contention**. The + +`sturdyc` eliminates cache stampedes and can minimize data source load in +high-throughput systems through features such as request coalescing and +asynchronous refreshes. It combines the speed of in-memory caching with +granular control over data freshness. At its core, `sturdyc` provides +**non-blocking reads** and sharded writes for minimal lock contention. 
The [xxhash](https://github.com/cespare/xxhash) algorithm is used for efficient key distribution. It has all the functionality you would expect from a caching library, but what -**sets it apart** are the features designed to make I/O heavy applications both -_robust_ and _highly performant_. +**sets it apart** are the flexible configurations that have been designed to +make I/O heavy applications both _robust_ and _highly performant_. We have been using this package in production to enhance both the performance and reliability of our services that retrieve data from distributed caches, @@ -33,10 +36,10 @@ adding this package in front of a distributed key-value store: Screenshot 2024-05-10 at 10 15 18   -In addition to this, we were able to reduce our outgoing requests by more than -90% after utilizing both the in-flight tracking of cache keys and refresh -coalescing functionality, which in turn has allowed us to use fewer containers -and much cheaper clusters. +And through a combination of inflight-tracking, asynchronous refreshes, and +refresh coalescing, we reduced load on underlying data sources by more than +90%. This reduction in outgoing requests has enabled us to operate with fewer +containers and significantly cheaper database clusters. # Table of contents @@ -135,11 +138,11 @@ Next, we'll start to look at some of the more _advanced features_. # Get or fetch -I have designed the API in a way that should make the process of integrating -`sturdyc` with any data source as straightforward as possible. While it -provides the basic get/set methods you would expect from a cache, the advanced -functionality is accessed through just two core functions: `GetOrFetch` and -`GetOrFetchBatch` +I have tried to design the API in a way that should make the process of +integrating `sturdyc` with any data source as straightforward as possible. 
+While it provides the basic get/set methods you would expect from a cache, the +advanced functionality is accessed through just two core functions: +`GetOrFetch` and `GetOrFetchBatch` As an example, let's say that we had the following code for fetching orders from an API: @@ -270,14 +273,13 @@ usernames) as keys, and the actual data that we want to cache (the gist) as the value. Later, we'll see how we can use closures to pass query parameters and options -to our fetch functions, as well as how to use the PermutatedBatchKeyFn to +to our fetch functions, as well as how to use the `PermutatedBatchKeyFn` to create unique cache keys for each permutation of them. # Stampede protection -When we're consuming data through `sturdyc` we'll get automatic protection -against cache stampedes. If you're not familiar with the term, a cache stampade -(also known as thundering herd) occurs when many requests for a particular +`sturdyc` provides automatic protection against cache stampedes (also known as +thundering herd) - a situation that occurs when many requests for a particular piece of data, which has just expired or been evicted from the cache, come in at once. @@ -360,7 +362,7 @@ fetchFn := func(_ context.Context, ids []string) (map[string]int, error) { ``` Next, we'll need some batches to test with, so here I've created three batches -with 5 IDs each: +with five IDs each: ```go batches := [][]string{ @@ -421,7 +423,8 @@ log.Printf("fetchFn was called %d times\n", count.Load()) Running this program, and looking at the logs, we'll see that the cache is able resolve all of the ids from these new goroutines without generating any -additional requests even though we're picking IDs from different batches: +additional requests even though we're picking IDs from different in-flight +batches: ```sh ❯ go run . @@ -471,13 +474,14 @@ deleted once their TTL expires. 
This also means that the request that gets chosen to refresh the value won’t retrieve the updated data right away as the refresh happens asynchronously. -However, asynchronous refreshes can be problematic. What if some keys only get -requested very infrequently? If the refreshes are done in the background the -latency will be low, but the data itself might be stale. +Asynchronous refreshes present challenges with infrequently requested keys. +When the refreshes are done in the background the latency will be low, but the +data might feel flaky or stale if we're not asking for the key again soon after +it has been refreshed. To solve this, you also get to provide a synchronous refresh time. This essentially tells the cache: "If the data is older than x, I want the refresh -to be blocking and wait for the response." +to be blocking and have the user wait for the response." Below is an example configuration that you can use to enable this functionality: @@ -509,8 +513,7 @@ func main() { } ``` -And to get a better feeling for how this works, we'll use the configuration -above, and then we'll create a simple API client which embedds the cache: +Let's build a simple API client that embeds the cache using our configuration: ```go type API struct { @@ -558,8 +561,8 @@ func main() { } ``` -Running this program, we're going to see that the value gets refreshed once -every 2-3 retrievals: +Running this program, we're going to see that the value gets refreshed +asynchronously once every 2-3 retrievals: ```sh cd examples/refreshes @@ -587,11 +590,11 @@ either, given that we set the synchronous refresh delay like this: synchronousRefreshDelay := time.Second * 30 ``` -If a key isn't requested again within 30 seconds, the cache will make the -refresh synchronous. 
Even if a minute has passed and 1,000 requests suddenly -come in for this key, the stampede protection will kick in and make the refresh -synchronous for all of them, while also ensuring that only a single request is -made to the underlying data source. +Which means that if a key isn't requested again within 30 seconds, the cache +will make the refresh synchronous. Even if a minute has passed and 1,000 +requests suddenly come in for this key, the stampede protection will kick in +and make the refresh synchronous for all of them, while also ensuring that only +a single request is made to the underlying data source. Sometimes I like to use this feature to provide a degraded experience when an upstream system encounters issues. For this, I choose a high TTL and a low @@ -693,8 +696,8 @@ still being printed: 2024/05/09 13:22:04 Fetching value for key: key ``` -This is a bit tricky because how you determine if a record has been deleted is -going to vary based on your data source. It could be a status code, zero value, +This is a bit tricky because how you determine if a record has been deleted +could vary based on your data source. It could be a status code, zero value, empty list, specific error message, etc. There is no easy way for the cache to figure this out implicitly. @@ -750,8 +753,8 @@ just a single ID wasn't found: for _, id := range cacheMisses { // NOTE: Don't do this, it's just an example. if response[id]; !id { - return response, sturdyc.ErrNotFound - } + return response, sturdyc.ErrNotFound + } } return response, nil } @@ -883,12 +886,11 @@ The entire example is available [here.](https://github.com/viccon/sturdyc/tree/m # Batch endpoints One challenge with caching batchable endpoints is that you have to find a way -to reduce the number of cache keys. To illustrate, let's say that we have 10 000 -records, and an endpoint for fetching them that allows for batches of 20. 
-The IDs for the batch are supplied as query parameters, for example, -`https://example.com?ids=1,2,3,4,5,...20`. If we were to use this as the cache -key, the way many CDNs would do, we could quickly calculate the number of keys -we would generate like this: +to reduce the number of cache keys. Consider an endpoint that allows fetching +10,000 records in batches of 20. The IDs for the batch are supplied as query +parameters, for example, `https://example.com?ids=1,2,3,4,5,...20`. If we were +to use this as the cache key, the way many CDNs would do, we could quickly +calculate the number of keys we would generate like this: $$ C(n, k) = \binom{n}{k} = \frac{n!}{k!(n-k)!} $$ @@ -1038,8 +1040,8 @@ What if you're fetching data from some endpoint that accepts a variety of query parameters? Or perhaps you're doing a database query and want to apply some ordering and filtering to the data? -We can easily get around this by using closures. Let's illustrate this by -looking at an actual API client I've written: +Closures provide an elegant solution to this limitation. Let's illustrate this +by looking at an actual API client I've written: ```go const moviesByIDsCacheKeyPrefix = "movies-by-ids" @@ -1245,8 +1247,8 @@ As you may recall, our client is using the `WithEarlyRefreshes` option to refresh the records in the background whenever their keys are requested again after a certain amount of time has passed. And as seen in the example above, we're successfully storing the records once for every permutation of the -options we use to retrieve it. However, we're not really utilizing the fact -that the endpoint is batchable when we're performing the refreshes. +options we use to retrieve it. However, we're not taking advantage of the +endpoint's batch capabilities. 
To make this more efficient, we can enable the **refresh coalescing** functionality, but before we'll update our example to use it let's just take a @@ -1327,8 +1329,7 @@ So now we're saying that we want to coalesce the refreshes for each permutation, and try to process them in batches of 3. However, if it's not able to reach that size within 30 seconds we want the refresh to happen anyway. -And if you recall the output from our last run of this example code where the -refreshes happened one by one: +The previous output revealed that the refreshes happened one by one: ```sh go run . From ce03383c355a0aee8f3624eac2620cebd221bf55 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 14:31:31 +0100 Subject: [PATCH 30/32] WIP --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ee194e2..b045177 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ high-throughput systems through features such as request coalescing and asynchronous refreshes. It combines the speed of in-memory caching with granular control over data freshness. At its core, `sturdyc` provides -**non-blocking reads** and sharded writes for minimal lock contention. The +**non-blocking reads** and **sharded writes** for minimal lock contention. The [xxhash](https://github.com/cespare/xxhash) algorithm is used for efficient key distribution. @@ -127,7 +127,7 @@ cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, The latter can give you a slight performance boost in situations where you're unlikely to ever exceed the capacity you've assigned to your cache. -However, when the cache capacity is reached, the second eviction strategy is +However, if the cache capacity is reached, the second eviction strategy is triggered. This process performs evictions on a per-shard basis, selecting records for removal based on recency. 
The eviction algorithm uses [quickselect](https://en.wikipedia.org/wiki/Quickselect), which has an O(N) From ff44475d59ca29bc8a61c76a16e981482a292585 Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Wed, 8 Jan 2025 15:09:26 +0100 Subject: [PATCH 31/32] WIP --- README.md | 127 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/README.md b/README.md index b045177..488d9e7 100644 --- a/README.md +++ b/README.md @@ -25,9 +25,10 @@ make I/O heavy applications both _robust_ and _highly performant_. We have been using this package in production to enhance both the performance and reliability of our services that retrieve data from distributed caches, -databases, and APIs. While the API surface of sturdyc is tiny, it offers -extensive configuration options. I encourage you to read through this README -and experiment with the examples to understand its full capabilities. +databases, and external APIs. While the API surface of sturdyc is tiny, it +offers extensive configuration options. I encourage you to read through this +README and experiment with the examples in order to understand its full +capabilities. Here is a screenshot showing the P95 latency improvements we observed after adding this package in front of a distributed key-value store: @@ -127,9 +128,9 @@ cacheClient := sturdyc.New[int](capacity, numShards, ttl, evictionPercentage, The latter can give you a slight performance boost in situations where you're unlikely to ever exceed the capacity you've assigned to your cache. -However, if the cache capacity is reached, the second eviction strategy is -triggered. This process performs evictions on a per-shard basis, selecting -records for removal based on recency. The eviction algorithm uses +However, if the capacity is reached, the second eviction strategy is triggered. +This process performs evictions on a per-shard basis, selecting records for +removal based on recency. 
The eviction algorithm uses [quickselect](https://en.wikipedia.org/wiki/Quickselect), which has an O(N) time complexity without the overhead of requiring write locks on reads to update a recency list, as many LRU caches do. @@ -221,8 +222,8 @@ To illustrate, let's say that we're building a Github client and want to use this package to get around their rate limit. The username itself wouldn't make for a good cache key because we could use it to fetch gists, commits, repositories, etc. Therefore, `GetOrFetchBatch` takes a `KeyFn` that prefixes -each ID with something to identify the data source so that we don't get cache -key collisions: +each ID with something to identify the data source so that we don't end up with +cache key collisions: ```go gistPrefixFn := cacheClient.BatchKeyFn("gists") @@ -231,8 +232,8 @@ gists, err := cacheClient.GetOrFetchBatch(ctx, userIDs, gistPrefixFn, fetchGists commits, err := cacheClient.GetOrFetchBatch(ctx, userIDs, commitPrefixFn, fetchCommits) ``` -We're now able to use the same cache for multiple data sources, and internally -we'd get cache keys of this format: +We're now able to use the _same_ cache for _multiple_ data sources, and +internally we'd get cache keys of this format: ``` gists-ID-viccon @@ -264,9 +265,9 @@ func (client *GithubClient) Gists(ctx context.Context, usernames []string) (map[ } ``` -In the example above, the fetchFunc would get called for users who don't have -their gists in our cache, and the cacheMisses slice would contain their actual -usernames (without the prefix from the keyFn). +In the example above, the `fetchFunc` would get called for users where we don't +have their gists in our cache, and the cacheMisses slice would contain their +actual usernames (without the prefix from the keyFn). 
The map that we return from our `fetchFunc` should have the IDs (in this case the usernames) as keys, and the actual data that we want to cache (the gist) as the @@ -389,8 +390,8 @@ time.Sleep(time.Second * 2) At this point, the cache should have 3 in-flight requests for IDs 1-15: ```sh -[1,2,3,4,5] => REQUEST 1 (IN-FLIGHT) -[6,7,8,9,10] => REQUEST 2 (IN-FLIGHT) +[1,2,3,4,5] => REQUEST 1 (IN-FLIGHT) +[6,7,8,9,10] => REQUEST 2 (IN-FLIGHT) [11,12,13,14,15] => REQUEST 3 (IN-FLIGHT) ``` @@ -422,9 +423,9 @@ log.Printf("fetchFn was called %d times\n", count.Load()) ``` Running this program, and looking at the logs, we'll see that the cache is able -resolve all of the ids from these new goroutines without generating any +to resolve all of the ids from these new goroutines without generating any additional requests even though we're picking IDs from different in-flight -batches: +requests: ```sh ❯ go run . @@ -460,10 +461,10 @@ the actual data source in order to refresh the cache. This in turn can make it difficult to configure appropriate alarms for your applications response times. `sturdyc` aims to give you a lot of control over these choices when you enable -the early refreshes functionality. It will prevent your most frequently used -records from ever expiring by continuously refreshing them in the background. -This can have a significant impact on your applications latency. We've seen the -P99 of some of our applications go from 50ms down to 1. +the **early refreshes** functionality. It will prevent your most frequently +used records from ever expiring by continuously refreshing them in the +background. This can have a significant impact on your applications latency. +We've seen the P99 of some of our applications go from 50ms down to 1. One thing to note about these background refreshes is that they are scheduled if a key is **requested again** after a configurable amount of time has passed. @@ -474,10 +475,10 @@ deleted once their TTL expires. 
This also means that the request that gets chosen to refresh the value won’t retrieve the updated data right away as the refresh happens asynchronously. -Asynchronous refreshes present challenges with infrequently requested keys. -When the refreshes are done in the background the latency will be low, but the -data might feel flaky or stale if we're not asking for the key again soon after -it has been refreshed. +However, asynchronous refreshes present challenges with infrequently requested +keys. When the refreshes are done in the background the latency will be low, +but the data might feel flaky or stale if we're not asking for the key again +soon after so that it is being continuously refreshed. To solve this, you also get to provide a synchronous refresh time. This essentially tells the cache: "If the data is older than x, I want the refresh @@ -526,7 +527,7 @@ func NewAPI(c *sturdyc.Client[string]) *API { func (a *API) Get(ctx context.Context, key string) (string, error) { // This could be an API call, a database query, etc. - fetchFn := func(_ context.Context) (string, error) { + fetchFn := func(_ context.Context) (string, error) { log.Printf("Fetching value for key: %s\n", key) return "value", nil } @@ -615,6 +616,7 @@ very next request for that key to attempt another refresh. Also, if you don't want any of this serve stale functionality you could just use short TTLs. The cache will never return a record where the TTL has expired. +I'm just trying to showcase some different ways to leverage this functionality! The entire example is available [here.](https://github.com/viccon/sturdyc/tree/main/examples/refreshes) @@ -788,8 +790,8 @@ the upstream eventually returns a valid response, we'll see it propagate to our cache. To illustrate, I'll make some small modifications to the code from the previous -example. The only thing I'm going to change is to make the API client return a -`ErrNotFound` for the first three requests: +example. 
I'm going to make the API client return an `ErrNotFound` for the
+first three requests:

```go
type API struct {
@@ -861,7 +863,7 @@ refreshes, and then transitions into having a value:
 2024/05/09 21:25:28 Record does not exist.
 2024/05/09 21:25:28 Record does not exist.
 2024/05/09 21:25:28 Fetching value for key: key
-2024/05/09 21:25:28 Value: value
+2024/05/09 21:25:28 Value: value // Look, the value exists now!
 2024/05/09 21:25:28 Value: value
 2024/05/09 21:25:28 Value: value
 2024/05/09 21:25:28 Fetching value for key: key
@@ -889,7 +891,7 @@ One challenge with caching batchable endpoints is that you have to find a way
 to reduce the number of cache keys. Consider an endpoint that allows fetching
 10,000 records in batches of 20. The IDs for the batch are supplied as query
 parameters, for example, `https://example.com?ids=1,2,3,4,5,...20`. If we were
-to use this as the cache key, the way many CDNs would do, we could quickly
+to use this as the cache key, the way many CDNs would, we could quickly
 calculate the number of keys we would generate like this:

$$ C(n, k) = \binom{n}{k} = \frac{n!}{k!(n-k)!} $$
@@ -923,9 +925,10 @@ func (c *Client[T]) GetOrFetchBatch(ctx context.Context, ids []string, keyFn Key

What the cache does is that it takes the IDs, applies the `keyFn` to them, and
then checks each key individually if it's present in the cache. The keys that
-aren't present will be fetched using the `fetchFn`.
+aren't present will be passed to the `fetchFn`.
-The `fetchFn` is going to have this signature where it returns a map where the ID is the key: +The `fetchFn` has this signature where it returns a map where the ID is the +key: ```go type BatchFetchFn[T any] func(ctx context.Context, ids []string) (map[string]T, error) @@ -951,14 +954,14 @@ func NewAPI(c *sturdyc.Client[string]) *API { } func (a *API) GetBatch(ctx context.Context, ids []string) (map[string]string, error) { - // We are going to pass a cache a key function that prefixes each id with - // the string "some-prefix", and adds an -ID- separator before the actual - // id. This makes it possible to save the same id for different data - // sources as the keys would look something like this: some-prefix-ID-1234 + // We are going to pass a cache a key function that prefixes each id with + // the string "some-prefix", and adds an -ID- separator before the actual + // id. This makes it possible to save the same id for different data + // sources as the keys would look something like this: some-prefix-ID-1234 cacheKeyFn := a.BatchKeyFn("some-prefix") // The fetchFn is only going to retrieve the IDs that are not in the cache. Please - // note that the cacheMisses is going to contain the actual IDs, not the cache keys. + // note that the cacheMisses is going to contain the actual IDs, not the cache keys. fetchFn := func(_ context.Context, cacheMisses []string) (map[string]string, error) { log.Printf("Cache miss. Fetching ids: %s\n", strings.Join(cacheMisses, ", ")) // Batch functions should return a map where the key is the id of the record. @@ -1075,7 +1078,7 @@ The API clients `MoviesByIDs` method calls an external API to fetch movies by IDs, and the `BatchFetchFn` that we're passing to `sturdyc` has a closure over the query parameters we need. 
-However, one **important** thing to note here is that the ID is _no longer_ +However, one **important** thing to note here is that the ID is **no longer** enough to _uniquely_ identify a record in our cache even with the basic prefix function we've used before. It will no longer work to just have cache keys that looks like this: @@ -1093,10 +1096,10 @@ transformation. In other terms, we should cache each movie once for each permutation of our options: ``` -IncludeUpcoming: true IncludeUpsell: true -IncludeUpcoming: false IncludeUpsell: false -IncludeUpcoming: true IncludeUpsell: false -IncludeUpcoming: false IncludeUpsell: true +ID 1 IncludeUpcoming: true IncludeUpsell: true +ID 1 IncludeUpcoming: false IncludeUpsell: false +ID 1 IncludeUpcoming: true IncludeUpsell: false +ID 1 IncludeUpcoming: false IncludeUpsell: true ``` This is what the `PermutatedBatchKeyFn` is used for. It takes a prefix and a @@ -1246,9 +1249,9 @@ The entire example is available [here.](https://github.com/viccon/sturdyc/tree/m As you may recall, our client is using the `WithEarlyRefreshes` option to refresh the records in the background whenever their keys are requested again after a certain amount of time has passed. And as seen in the example above, -we're successfully storing the records once for every permutation of the -options we use to retrieve it. However, we're not taking advantage of the -endpoint's batch capabilities. +we're successfully storing and refreshing the records once for every +permutation of the options we used to retrieve it. However, we're not taking +advantage of the endpoint's batch capabilities. 
To make this more efficient, we can enable the **refresh coalescing** functionality, but before we'll update our example to use it let's just take a @@ -1434,28 +1437,30 @@ understand how `sturdyc` works when it comes to creating cache keys, tracking in-flight requests, refreshing records in the background, and buffering/coalescing requests to minimize the number of round trips we have to make to an underlying data source. As you'll soon see, we'll leverage all of -these features when we're adding distributed storage. +these features for the distributed storage too. However, let's first understand when this functionality can be useful. This feature is particularly valuable when building applications that can achieve a -high cache hit rate while also being subject to large bursts of requests. As an -example, I've used this in production for a large streaming application. The -content was fairly static - new movies, series, and episodes were only ingested -a couple of times an hour. This meant that we could achieve a very high hit -rate for our data sources. However, during the evenings, when a popular -football match or TV show was about to start, our traffic could spike by a -factor of 20 within less than a minute. +high cache hit rate while also being subject to large bursts of requests. + +As an example, I've used this in production for a large streaming application. +The content was fairly static - new movies, series, and episodes were only +ingested a couple of times an hour. This meant that we could achieve a very +high hit rate for our data sources. However, during the evenings, when a +popular football match or TV show was about to start, our traffic could spike +by a factor of 20 within less than a minute. To illustrate the problem further, let’s say the hit rate for our in-memory cache was 99.8%. Then, when we received that large burst of traffic, our auto-scaling would begin provisioning new containers. 
These containers would
obviously be brand new, with an initial hit rate of 0%. This would cause a
significant load on our underlying data sources as soon as they came online,
-because every request they received led to an outgoing request to the data
-source. And these data sources had gotten used to being shielded from most of
-the traffic by the older containers high hit-rate and refresh coalescing usage.
-Hence, what was a 20x spike for us could easily become a 200x spike for them
-until our new containers had warmed their cache.
+because every request they received led to a cache miss so that we had to make
+an outgoing request to the data source. And these data sources had gotten used
+to being shielded from most of the traffic by the older containers' high
+hit-rate and refresh coalescing usage. Hence, what was a 20x spike for us could
+easily become a 200x spike for them until our new containers had warmed their
+caches.

Therefore, I decided to add the ability to have the containers sync their
in-memory cache with a distributed key-value store that would have an easier
@@ -1528,8 +1533,8 @@ data source first, and then writes the keys and values to this storage as soon
as it has gone out to an underlying data source and refreshed them. Therefore,
I'd advice you to use the configuration above with short TTLs for the
distributed storage, or things might get too stale. I mostly think it's useful
-if you're consuming data sources that don't handle bursts from new containers
-very well.
+if you're consuming data sources that are rate limited or don't handle brief
+bursts from new containers very well.

I've included an example to showcase this functionality
[here.](https://github.com/viccon/sturdyc/tree/main/examples/distribution)

@@ -1582,7 +1587,7 @@ cacheClient := sturdyc.New[string](capacity, numShards, ttl, evictionPercentage,
```

With a configuration like this, I would usually set the TTL for the distributed
However, if the `sturdyc` queries the +storage to something like an hour. However, if `sturdyc` queries the distributed storage and finds that a record is older than 1 minute (the second argument to the function), it will refresh the record from the underlying data source, and then write the updated value back to it. So the interaction with From 42b3db3ba413ced10cfc1b0834bbb74638138f1b Mon Sep 17 00:00:00 2001 From: Victor Conner Date: Thu, 9 Jan 2025 09:16:44 +0100 Subject: [PATCH 32/32] WIP --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 488d9e7..65d6d18 100644 --- a/README.md +++ b/README.md @@ -476,13 +476,21 @@ chosen to refresh the value won’t retrieve the updated data right away as the refresh happens asynchronously. However, asynchronous refreshes present challenges with infrequently requested -keys. When the refreshes are done in the background the latency will be low, -but the data might feel flaky or stale if we're not asking for the key again -soon after so that it is being continuously refreshed. +keys. While background refreshes keep latency low by serving cached values +during updates, this can lead to perpetually stale data. If a key isn't +requested again before its next scheduled refresh, we remain permanently one +update behind, as each read triggers a refresh that won't be seen until the +next request. This is similar to a burger restaurant that prepares a new burger +after each customer's order - if the next customer arrives too late, they'll +receive a cold burger, despite the restaurant's proactive cooking strategy. To solve this, you also get to provide a synchronous refresh time. This essentially tells the cache: "If the data is older than x, I want the refresh -to be blocking and have the user wait for the response." +to be blocking and have the user wait for the response." 
Or using the burger +analogy: if a burger has been sitting for more than X minutes, the restaurant +starts making a fresh one while the customer waits. Unlike a real restaurant +though, the cache keeps the old value as a fallback - if the refresh fails, +we'll still serve the "cold burger" rather than letting the customer go hungry. Below is an example configuration that you can use to enable this functionality: