3535 * SPDX-License-Identifier: BSD-3-Clause
3636 */
3737
38+ #include <assert.h>
39+
3840#include "server.h"
3941
4042/*-----------------------------------------------------------------------------
@@ -149,6 +151,17 @@ void expireScanCallback(void *privdata, void *entry) {
149151 data -> sampled ++ ;
150152}
151153
154+ int hashTypeExpireEntry (void * db , void * o , void * entry );
155+
156+ void fieldExpireScanCallback (void * privdata , void * volaKey ) {
157+ activeExpireFieldIterator * iter = privdata ;
158+ serverAssert (volaKey );
159+ serverAssert (hashTypeHasVolatileElements (volaKey ));
160+ iter -> current_key = volaKey ;
161+ incrRefCount (iter -> current_key );
162+ assert (hashTypeHasVolatileElements (iter -> current_key ));
163+ }
164+
152165static inline int isExpiryTableValidForSamplingCb (hashtable * ht ) {
153166 long long numkeys = hashtableSize (ht );
154167 unsigned long buckets = hashtableBuckets (ht );
@@ -161,16 +174,116 @@ static inline int isExpiryTableValidForSamplingCb(hashtable *ht) {
161174 return C_OK ;
162175}
163176
164- void activeExpireCycle (int type ) {
/* Reports whether the time budget of the field expiry cycle is used up.
 *
 * iterations: call counter, incremented on every invocation.
 * start_us:   timestamp (microseconds) at which the cycle started.
 * limit_us:   budget in microseconds.
 * now_us:     cached current time; refreshed from ustime() only when the low
 *             four bits of the pre-increment counter are zero (i.e. on every
 *             16th call), otherwise the cached value is reused.
 *
 * Returns non-zero when (*now_us - start_us) >= limit_us. */
static inline int activeExpireFieldsCheckTimeLimitReached(unsigned int *iterations,
                                                          uint64_t start_us,
                                                          uint64_t limit_us,
                                                          uint64_t *now_us) {
    const unsigned int call_index = *iterations;
    *iterations = call_index + 1;

    if ((call_index & 0xf) == 0) *now_us = ustime();

    const uint64_t elapsed_us = *now_us - start_us;
    return elapsed_us >= limit_us;
}
188+
189+
190+ void advanceDb (activeExpireFieldIterator * it ) {
191+ it -> current_db ++ ;
192+ if (it -> current_db >=server .dbnum ) {
193+ it -> current_db = 0 ;
194+ it -> db_cursor = 0 ;
195+ }
196+ it -> current_key = NULL ;
197+ }
198+
199+ static inline int effort (void ) {
200+ return server .active_expire_effort - 1 ;
201+ }
202+
203+ void hashKeyDone (activeExpireFieldIterator * it ) {
204+ serverAssert (it -> current_key );
205+ serverAssert (it -> current_key -> refcount >= 1 );
206+ decrRefCount (it -> current_key );
207+ it -> current_key = NULL ;
208+ }
209+
210+ vset * hashTypeGetVolatileSet (robj * o );
211+
212+ /*
213+ * activeExpireCycleFields
214+ *
215+ * This function incrementally expires hash fields that use field-level TTL
216+ * stored in volatile sets. It traverses all databases, scanning keys
217+ * known to hold volatile fields, and then iterates those fields to reclaim
218+ * memory for logically expired elements that were not accessed by clients.
219+ *
220+ * Field expiry is performed within a strict time budget and an entries-per-loop
221+ * limit to protect latency and CPU usage. An activeExpireFieldIterator tracks
222+ * which key and volatile set are currently being processed. Expired fields are
223+ * removed, and if the hash becomes empty, the parent key is deleted as well.
224+ *
225+ */
226+ void activeExpireCycleFields (int type , unsigned long entries_per_call , long long time_limit_us ) {
227+ if (type != ACTIVE_EXPIRE_CYCLE_SLOW ) return ;
228+ if (!server .active_expire_enabled || !iAmPrimary () || server .dbnum == 0 ) return ;
229+
230+ unsigned int iterations = 0 ;
231+ uint64_t start = ustime ();
232+ uint64_t now = start ;
233+ activeExpireFieldIterator * it = & server .active_expire_field_iterator ;
234+ int dbs_performed = 0 ;
235+
236+ while (dbs_performed < CRON_DBS_PER_CALL && !activeExpireFieldsCheckTimeLimitReached (
237+ & iterations , start , time_limit_us , & now )) {
238+ serverDb * db = server .db [it -> current_db ];
239+ if (!db || kvstoreSize (db -> keys_with_volatile_items ) == 0 ) {
240+ advanceDb (it );
241+ dbs_performed ++ ;
242+ continue ;
243+ }
244+
245+ size_t entries_processed = 0 ;
246+ while (entries_processed < entries_per_call && !activeExpireFieldsCheckTimeLimitReached (
247+ & iterations , start , time_limit_us , & now )) {
248+ if (!it -> current_key ) {
249+ it -> db_cursor = kvstoreScan (db -> keys_with_volatile_items , it -> db_cursor , -1 , fieldExpireScanCallback ,
250+ isExpiryTableValidForSamplingCb , it );
251+ } else if (it -> current_key -> refcount == 1 ) {
252+ hashKeyDone (it );
253+ }
254+
255+ if (it -> current_key ) {
256+ size_t expired = activeExpireFieldProcessKey (it -> current_key , db , (mstime_t ) (now / 1000 ),
257+ entries_per_call );
258+ entries_processed += expired ;
259+ bool hasMore = hashTypeHasVolatileElements (it -> current_key );
260+ if (!hasMore || expired < entries_per_call ) {
261+ hashKeyDone (it );
262+ }
263+ }
264+
265+ if (!it -> current_key && it -> db_cursor == 0 ) {
266+ advanceDb (it );
267+ dbs_performed ++ ;
268+ break ;
269+ }
270+ }
271+ }
272+
273+ if (activeExpireFieldsCheckTimeLimitReached (& iterations , start , time_limit_us , & now )) {
274+ server .stat_expired_time_cap_reached_count ++ ;
275+ }
276+ }
277+
278+
279+ void activeExpireCycleKeys (int type , unsigned long config_keys_per_loop , long long timelimit ) {
165280 /* Adjust the running parameters according to the configured expire
166281 * effort. The default effort is 1, and the maximum configurable effort
167282 * is 10. */
168- unsigned long effort = server .active_expire_effort - 1 , /* Rescale from 0 to 9. */
169- config_keys_per_loop = ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP + ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP / 4 * effort ,
170- config_cycle_fast_duration =
171- ACTIVE_EXPIRE_CYCLE_FAST_DURATION + ACTIVE_EXPIRE_CYCLE_FAST_DURATION / 4 * effort ,
172- config_cycle_slow_time_perc = ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC + 2 * effort ,
173- config_cycle_acceptable_stale = ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE - effort ;
283+
284+ unsigned long config_cycle_fast_duration =
285+ ACTIVE_EXPIRE_CYCLE_FAST_DURATION + ACTIVE_EXPIRE_CYCLE_FAST_DURATION / 4 * effort ();
286+ unsigned long config_cycle_acceptable_stale = ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE - effort ();
174287
175288 /* This function has some global state in order to continue the work
176289 * incrementally across calls. */
@@ -181,7 +294,7 @@ void activeExpireCycle(int type) {
181294 int j , iteration = 0 ;
182295 int dbs_per_call = CRON_DBS_PER_CALL ;
183296 int dbs_performed = 0 ;
184- long long start = ustime (), timelimit , elapsed ;
297+ long long start = ustime (), elapsed ;
185298
186299 /* If 'expire' action is paused, for whatever reason, then don't expire any key.
187300 * Typically, at the end of the pause we will properly expire the key OR we
@@ -209,13 +322,8 @@ void activeExpireCycle(int type) {
209322 * expired keys to use memory for too much time. */
210323 if (dbs_per_call > server .dbnum || timelimit_exit ) dbs_per_call = server .dbnum ;
211324
212- /* We can use at max 'config_cycle_slow_time_perc' percentage of CPU
213- * time per iteration. Since this function gets called with a frequency of
214- * server.hz times per second, the following is the max amount of
215- * microseconds we can spend in this function. */
216- timelimit = config_cycle_slow_time_perc * 1000000 / server .hz / 100 ;
325+
217326 timelimit_exit = 0 ;
218- if (timelimit <= 0 ) timelimit = 1 ;
219327
220328 if (type == ACTIVE_EXPIRE_CYCLE_FAST ) timelimit = config_cycle_fast_duration ; /* in microseconds. */
221329
@@ -376,6 +484,83 @@ void activeExpireCycle(int type) {
376484 server .stat_expired_stale_perc = (current_perc * 0.05 ) + (server .stat_expired_stale_perc * 0.95 );
377485}
378486
487+ /* expiryDriver abstracts expiry routines with a unified signature,
488+ * allowing activeExpireCycle to alternate keys and fields cleanly. */
489+ typedef void expiryDriver (int type , unsigned long entries_per_loop , long long timelimit );
490+
491+ /*
492+ * activeExpireCycle
493+ *
494+ * This function performs active expiration of both normal keys (with TTL)
495+ * and hash fields (with field-level TTL via volatile sets). Its purpose is to
496+ * reclaim memory from logically expired entries.
497+ *
498+ * The expiry is performed incrementally over multiple databases, respecting
499+ * a CPU time budget derived from the configured active-expire-effort.
500+ *
501+ * There are two separate expiry mechanisms for keys and for hash fields
502+ * because their iteration models are fundamentally different:
503+ * - key expiry operates on db->key entries, scanning random keys
504+ * with attached TTL entries.
505+ * - field expiry operates on db->key->volatile_set entries, scanning
506+ * fields within a hash that each have their own TTL.
507+ * This hierarchy and lookup pattern are entirely different, requiring
508+ * separate cursors, iteration logic, and data structure handling.
509+ *
510+ * The function uses an alternating scheme across event loop cycles: on one
511+ * cycle it will prioritize key expiry first, then hash field expiry if time
512+ * permits; on the next cycle, it will prioritize hash field expiry first,
513+ * then key expiry if time permits. This ensures fairness and prevents
514+ * starvation of either mechanism. Since the memory reclaim pace and iteration
515+ * model of keys versus hash fields are different and unpredictable,
516+ * alternating naturally balances the overall expiry effort when both are
517+ * fully consuming their available time budget.
518+ *
519+ * Note that field expiry is only performed during the slow iteration cycles,
520+ * as it is not scheduled to run in the fast cycle.
521+ */
522+ void activeExpireCycle (int type ) {
523+
524+ /* Adjust the running parameters according to the configured expire
525+ * effort. The default effort is 1, and the maximum configurable effort
526+ * is 10. */
527+ unsigned long config_keys_per_loop =
528+ ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP + ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP / 4 * effort ();
529+ unsigned long config_cycle_slow_time_perc = ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC + 2 * effort ();
530+
531+
532+ static int expireCycleStartWithFields = 0 ;
533+
534+ /* We can use at max 'config_cycle_slow_time_perc' percentage of CPU
535+ * time per iteration. Since this function gets called with a frequency of
536+ * server.hz times per second, the following is the max amount of
537+ * microseconds we can spend in this function. */
538+ long long timelimit = config_cycle_slow_time_perc * 1000000 / server .hz / 100 ;
539+
540+ if (timelimit <= 0 ) timelimit = 1 ;
541+
542+ expiryDriver * first , * second ;
543+
544+ if (expireCycleStartWithFields ) {
545+ first = activeExpireCycleFields ;
546+ second = activeExpireCycleKeys ;
547+ } else {
548+ first = activeExpireCycleKeys ;
549+ second = activeExpireCycleFields ;
550+ }
551+
552+ long long start = ustime ();
553+ first (type , config_keys_per_loop , timelimit );
554+ long long elapsed = ustime () - start ;
555+
556+ if (elapsed < timelimit ) {
557+ second (type , config_keys_per_loop , timelimit );
558+ }
559+
560+ expireCycleStartWithFields = !expireCycleStartWithFields ;
561+
562+ }
563+
379564/*-----------------------------------------------------------------------------
380565 * Expires of keys created in writable replicas
381566 *
0 commit comments