From aa836021c8842cb702523ffc0789ea0869d10112 Mon Sep 17 00:00:00 2001 From: Uri Yagelnik Date: Wed, 11 Jun 2025 23:14:04 +0000 Subject: [PATCH 1/5] Read commands offloading to IO threads Signed-off-by: Uri Yagelnik # Conflicts: # src/blocked.c # src/cluster.c # src/cluster_slot_stats.c # src/commands.def # src/io_threads.c # src/io_threads.h # src/memory_prefetch.c # src/networking.c # src/server.c # src/server.h --- src/acl.c | 4 +- src/ae.c | 9 + src/ae.h | 5 + src/ae_epoll.c | 3 +- src/aof.c | 11 +- src/blocked.c | 18 +- src/cluster.c | 7 +- src/cluster_slot_stats.c | 2 +- src/commands.def | 66 +- src/commands/get.json | 3 +- src/commands/hexists.json | 3 +- src/commands/hget.json | 3 +- src/commands/hgetall.json | 3 +- src/commands/hkeys.json | 3 +- src/commands/hlen.json | 3 +- src/commands/hmget.json | 3 +- src/commands/hrandfield.json | 3 +- src/commands/hscan.json | 3 +- src/commands/hstrlen.json | 3 +- src/commands/hvals.json | 3 +- src/commands/lindex.json | 3 +- src/commands/llen.json | 3 +- src/commands/lrange.json | 3 +- src/commands/zcard.json | 3 +- src/commands/zcount.json | 3 +- src/commands/zdiff.json | 3 +- src/commands/zinter.json | 3 +- src/commands/zintercard.json | 3 +- src/commands/zlexcount.json | 3 +- src/commands/zmscore.json | 3 +- src/commands/zrandmember.json | 3 +- src/commands/zrange.json | 3 +- src/commands/zrangebylex.json | 3 +- src/commands/zrangebyscore.json | 3 +- src/commands/zrank.json | 3 +- src/commands/zrevrange.json | 3 +- src/commands/zrevrangebylex.json | 3 +- src/commands/zrevrangebyscore.json | 3 +- src/commands/zrevrank.json | 3 +- src/commands/zscan.json | 3 +- src/commands/zscore.json | 3 +- src/commands/zunion.json | 3 +- src/config.c | 2 + src/db.c | 63 +- src/debug.c | 4 +- src/io_threads.c | 987 +++++++++++++++++++++++++++-- src/io_threads.h | 40 +- src/kvstore.c | 35 +- src/memory_prefetch.c | 55 +- src/memory_prefetch.h | 2 + src/module.c | 16 +- src/networking.c | 396 ++++++------ src/notify.c | 2 +- 
src/object.c | 3 +- src/rdb.c | 8 +- src/server.c | 68 +- src/server.h | 178 ++++-- src/socket.c | 2 +- src/tls.c | 4 +- src/tracking.c | 16 +- tests/unit/io-threads.tcl | 360 +++++++++++ tests/unit/maxmemory.tcl | 5 + tests/unit/moduleapi/misc.tcl | 37 ++ tests/unit/networking.tcl | 158 ----- tests/unit/other.tcl | 12 +- 65 files changed, 2041 insertions(+), 636 deletions(-) create mode 100644 tests/unit/io-threads.tcl diff --git a/src/acl.c b/src/acl.c index c33b5d1ed7..9833454dc6 100644 --- a/src/acl.c +++ b/src/acl.c @@ -498,7 +498,7 @@ void ACLFreeUserAndKillClients(user *u) { clientSetUser(c, DefaultUser, 0); /* We will write replies to this client later, so we can't * close it directly even if async. */ - if (c == server.current_client) { + if (isCurrentClient(c)) { c->flag.close_after_command = 1; } else { freeClientAsync(c); @@ -2635,7 +2635,7 @@ void addACLLogEntry(client *c, int reason, int context, int argpos, sds username } /* if we have a real client from the network, use it (could be missing on module timers) */ - client *realclient = server.current_client ? server.current_client : c; + client *realclient = getCurrentClient() ? getCurrentClient() : c; le->cinfo = catClientInfoString(sdsempty(), realclient, 0); le->context = context; diff --git a/src/ae.c b/src/ae.c index 643ff17070..0e56a0defb 100644 --- a/src/ae.c +++ b/src/ae.c @@ -92,6 +92,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { eventLoop->aftersleep = NULL; eventLoop->custompoll = NULL; eventLoop->flags = 0; + eventLoop->epoll_batch_size = 0; /* Default to 0, meaning use setsize */ /* Initialize the eventloop mutex with PTHREAD_MUTEX_ERRORCHECK type */ pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); @@ -217,6 +218,9 @@ void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask) { * is removed. */ if (mask & AE_WRITABLE) mask |= AE_BARRIER; + /* We want to always remove AE_PREFETCH if set when AE_READABLE is removed. 
*/ + if (mask & AE_READABLE) mask |= AE_PREFETCH; + /* Only remove attached events */ mask = mask & fe->mask; @@ -458,6 +462,7 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags) { if (eventLoop->aftersleep != NULL && flags & AE_CALL_AFTER_SLEEP) eventLoop->aftersleep(eventLoop, numevents); for (j = 0; j < numevents; j++) { + if (numevents > 1 && eventLoop->prefetch) eventLoop->prefetch(eventLoop, j, numevents); int fd = eventLoop->fired[j].fd; aeFileEvent *fe = &eventLoop->events[fd]; int mask = eventLoop->fired[j].mask; @@ -562,6 +567,10 @@ void aeSetCustomPollProc(aeEventLoop *eventLoop, aeCustomPollProc *custompoll) { eventLoop->custompoll = custompoll; } +void aeSetPrefetchProc(aeEventLoop *eventLoop, aePrefetchProc *prefetch) { + eventLoop->prefetch = prefetch; +} + void aeSetPollProtect(aeEventLoop *eventLoop, int protect) { if (protect) { eventLoop->flags |= AE_PROTECT_POLL; diff --git a/src/ae.h b/src/ae.h index 985429cd56..fc4aeb33c1 100644 --- a/src/ae.h +++ b/src/ae.h @@ -47,6 +47,7 @@ loop iteration. Useful when you want to persist \ things to disk before sending replies, and want \ to do that in a group fashion. 
*/ +#define AE_PREFETCH 8 /* With PREFETCH, call prefetch callback for the events */ #define AE_FILE_EVENTS (1 << 0) #define AE_TIME_EVENTS (1 << 1) @@ -72,6 +73,7 @@ typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientDat typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop); typedef void aeAfterSleepProc(struct aeEventLoop *eventLoop, int numevents); typedef int aeCustomPollProc(struct aeEventLoop *eventLoop); +typedef void aePrefetchProc(struct aeEventLoop *eventLoop, int cur_idx, int numevents); /* File event structure */ typedef struct aeFileEvent { @@ -113,8 +115,10 @@ typedef struct aeEventLoop { aeBeforeSleepProc *beforesleep; aeAfterSleepProc *aftersleep; aeCustomPollProc *custompoll; + aePrefetchProc *prefetch; pthread_mutex_t poll_mutex; int flags; + int epoll_batch_size; /* Optional batch size for epoll_wait */ } aeEventLoop; /* Prototypes */ @@ -138,6 +142,7 @@ char *aeGetApiName(void); void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep); void aeSetAfterSleepProc(aeEventLoop *eventLoop, aeAfterSleepProc *aftersleep); void aeSetCustomPollProc(aeEventLoop *eventLoop, aeCustomPollProc *custompoll); +void aeSetPrefetchProc(aeEventLoop *eventLoop, aePrefetchProc *prefetch); void aeSetPollProtect(aeEventLoop *eventLoop, int protect); int aePoll(aeEventLoop *eventLoop, struct timeval *tvp); int aeGetSetSize(aeEventLoop *eventLoop); diff --git a/src/ae_epoll.c b/src/ae_epoll.c index b2410ca29a..e67e01a630 100644 --- a/src/ae_epoll.c +++ b/src/ae_epoll.c @@ -110,8 +110,9 @@ static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) { static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) { aeApiState *state = eventLoop->apidata; int retval, numevents = 0; + int batch_size = eventLoop->epoll_batch_size > 0 ? 
eventLoop->epoll_batch_size : eventLoop->setsize; - retval = epoll_wait(state->epfd, state->events, eventLoop->setsize, + retval = epoll_wait(state->epfd, state->events, batch_size, tvp ? (tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000) : -1); if (retval > 0) { int j; diff --git a/src/aof.c b/src/aof.c index 732a71584f..d891db9bcf 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1429,10 +1429,11 @@ int loadSingleAppendOnlyFile(char *filename) { * to the same file we're about to read. */ server.aof_state = AOF_OFF; - client *old_cur_client = server.current_client; - client *old_exec_client = server.executing_client; + client *old_cur_client = getCurrentClient(); + client *old_exec_client = getExecutingClient(); fakeClient = createAOFClient(); - server.current_client = server.executing_client = fakeClient; + setCurrentClient(fakeClient); + setExecutingClient(fakeClient); /* Check if the AOF file is in RDB format (it may be RDB encoded base AOF * or old style RDB-preamble AOF). In that case we need to load the RDB file @@ -1637,8 +1638,8 @@ int loadSingleAppendOnlyFile(char *filename) { cleanup: if (fakeClient) freeClient(fakeClient); - server.current_client = old_cur_client; - server.executing_client = old_exec_client; + setCurrentClient(old_cur_client); + setExecutingClient(old_exec_client); fclose(fp); sdsfree(aof_filepath); return ret; diff --git a/src/blocked.c b/src/blocked.c index dcdb4fb240..bc85199e3e 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -91,6 +91,8 @@ void initClientBlockingState(client *c) { c->bstate->generic_blocked_list_node = NULL; c->bstate->module_blocked_handle = NULL; c->bstate->async_rm_call_handle = NULL; + c->bstate->slot_pending_list = NULL; + listInitNode(&c->bstate->pending_client_node, c); } void freeClientBlockingState(client *c) { @@ -226,7 +228,7 @@ void unblockClient(client *c, int queue_for_reprocessing) { serverAssert(c->bstate->postponed_list_node); listDelNode(server.postponed_clients, c->bstate->postponed_list_node); 
c->bstate->postponed_list_node = NULL; - } else if (c->bstate->btype == BLOCKED_SHUTDOWN) { + } else if (c->bstate->btype == BLOCKED_SHUTDOWN || c->bstate->btype == BLOCKED_SLOT) { /* No special cleanup. */ } else { serverPanic("Unknown btype in unblockClient()."); @@ -333,7 +335,7 @@ void disconnectAllBlockedClients(void) { * command processing will start from scratch, and the command will * be either executed or rejected. (unlike LIST blocked clients for * which the command is already in progress in a way. */ - if (c->bstate->btype == BLOCKED_POSTPONE) continue; + if (c->bstate->btype == BLOCKED_POSTPONE || c->bstate->btype == BLOCKED_SLOT) continue; unblockClientOnError(c, "-UNBLOCKED force unblock from blocking operation, " "instance state changed (master -> replica?)"); @@ -703,8 +705,8 @@ static void unblockClientOnKey(client *c, robj *key) { * running the command, and exit the execution unit after calling the unblock handler (if exists). * Notice that we also must set the current client so it will be available * when we will try to send the client side caching notification (done on 'afterCommand'). */ - client *old_client = server.current_client; - server.current_client = c; + client *old_client = getCurrentClient(); + setCurrentClient(c); enterExecutionUnit(1, 0); processCommandAndResetClient(c); if (!c->flag.blocked) { @@ -718,7 +720,7 @@ static void unblockClientOnKey(client *c, robj *key) { afterCommand(c); /* Clear the reexecuting_command flag after the proc is executed. */ c->flag.reexecuting_command = 0; - server.current_client = old_client; + setCurrentClient(old_client); } } @@ -728,8 +730,8 @@ static void unblockClientOnKey(client *c, robj *key) { * be processed in moduleHandleBlockedClients. 
*/ static void moduleUnblockClientOnKey(client *c, robj *key) { long long prev_error_replies = server.stat_total_error_replies; - client *old_client = server.current_client; - server.current_client = c; + client *old_client = getCurrentClient(); + setCurrentClient(c); monotime replyTimer; elapsedStart(&replyTimer); @@ -742,7 +744,7 @@ static void moduleUnblockClientOnKey(client *c, robj *key) { * in order to propagate any changes that could have been done inside * moduleTryServeClientBlockedOnKey */ afterCommand(c); - server.current_client = old_client; + setCurrentClient(old_client); } /* Unblock a client which is currently Blocked on and provided a timeout. diff --git a/src/cluster.c b/src/cluster.c index cc4dcc6fa3..bfe92aa526 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -806,12 +806,13 @@ void clusterCommandMyShardId(client *c) { /* When a cluster command is called, we need to decide whether to return TLS info or * non-TLS info by the client's connection type. However if the command is called by * a Lua script or RM_call, there is no connection in the fake client, so we use - * server.current_client here to get the real client if available. And if it is not + * current_client here to get the real client if available. And if it is not * available (modules may call commands without a real client), we return the default * info, which is determined by server.tls_cluster. 
*/ static int shouldReturnTlsInfo(void) { - if (server.current_client && server.current_client->conn) { - return connIsTLS(server.current_client->conn); + client *current_client = getCurrentClient(); + if (current_client && current_client->conn) { + return connIsTLS(current_client->conn); } else { return server.tls_cluster; } diff --git a/src/cluster_slot_stats.c b/src/cluster_slot_stats.c index 15883be54a..a313b2342e 100644 --- a/src/cluster_slot_stats.c +++ b/src/cluster_slot_stats.c @@ -146,7 +146,7 @@ void clusterSlotStatsAddNetworkBytesOutForUserClient(client *c) { /* Accumulates egress bytes upon sending replication stream. This only applies for primary nodes. */ static void clusterSlotStatsUpdateNetworkBytesOutForReplication(long long len) { - client *c = server.current_client; + client *c = getCurrentClient(); if (c == NULL || !clusterSlotStatsEnabled(c->slot)) return; /* We multiply the bytes len by the number of replicas to account for us broadcasting to multiple replicas at once. */ diff --git a/src/commands.def b/src/commands.def index 7585f56f32..516862c5d5 100644 --- a/src/commands.def +++ b/src/commands.def @@ -11229,21 +11229,21 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("geosearchstore","Queries a geospatial index for members inside an area of a box, a circle, or a polygon, optionally stores the result.","O(N+log(M)) where N is the number of elements in the grid-aligned bounding box area around the shape provided as the filter and M is the number of items inside the shape","6.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOSEARCHSTORE_History,2,GEOSEARCHSTORE_Tips,0,geosearchstoreCommand,-8,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEOSEARCHSTORE_Keyspecs,2,NULL,7),.args=GEOSEARCHSTORE_Args}, /* hash */ {MAKE_CMD("hdel","Deletes one or more fields and their values from a hash. 
Deletes the hash if no fields remain.","O(N) where N is the number of fields to be removed.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HDEL_History,1,HDEL_Tips,0,hdelCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HDEL_Keyspecs,1,NULL,2),.args=HDEL_Args}, -{MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args}, -{MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args}, -{MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args}, +{MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args}, +{MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args}, +{MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args}, {MAKE_CMD("hincrby","Increments the integer value 
of a field in a hash by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBY_History,0,HINCRBY_Tips,0,hincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBY_Keyspecs,1,NULL,3),.args=HINCRBY_Args}, {MAKE_CMD("hincrbyfloat","Increments the floating point value of a field by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBYFLOAT_History,0,HINCRBYFLOAT_Tips,0,hincrbyfloatCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBYFLOAT_Keyspecs,1,NULL,3),.args=HINCRBYFLOAT_Args}, -{MAKE_CMD("hkeys","Returns all fields in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HKEYS_History,0,HKEYS_Tips,1,hkeysCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HKEYS_Keyspecs,1,NULL,1),.args=HKEYS_Args}, -{MAKE_CMD("hlen","Returns the number of fields in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HLEN_History,0,HLEN_Tips,0,hlenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HLEN_Keyspecs,1,NULL,1),.args=HLEN_Args}, -{MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args}, +{MAKE_CMD("hkeys","Returns all fields in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HKEYS_History,0,HKEYS_Tips,1,hkeysCommand,2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HKEYS_Keyspecs,1,NULL,1),.args=HKEYS_Args}, +{MAKE_CMD("hlen","Returns the number of fields in a 
hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HLEN_History,0,HLEN_Tips,0,hlenCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HLEN_Keyspecs,1,NULL,1),.args=HLEN_Args}, +{MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args}, {MAKE_CMD("hmset","Sets the values of multiple fields.","O(N) where N is the number of fields being set.","2.0.0",CMD_DOC_DEPRECATED,"`HSET` with multiple field-value pairs","4.0.0","hash",COMMAND_GROUP_HASH,HMSET_History,0,HMSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HMSET_Keyspecs,1,NULL,2),.args=HMSET_Args}, -{MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args}, -{MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. 
N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args}, +{MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args}, +{MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args}, {MAKE_CMD("hset","Creates or modifies the value of a field in a hash.","O(1) for each field/value pair added, so O(N) to add N field/value pairs when the command is called with multiple field/value pairs.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSET_History,1,HSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSET_Keyspecs,1,NULL,2),.args=HSET_Args}, {MAKE_CMD("hsetnx","Sets the value of a field in a hash only when the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSETNX_History,0,HSETNX_Tips,0,hsetnxCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSETNX_Keyspecs,1,NULL,3),.args=HSETNX_Args}, -{MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args}, -{MAKE_CMD("hvals","Returns 
all values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HVALS_History,0,HVALS_Tips,1,hvalsCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HVALS_Keyspecs,1,NULL,1),.args=HVALS_Args}, +{MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args}, +{MAKE_CMD("hvals","Returns all values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HVALS_History,0,HVALS_Tips,1,hvalsCommand,2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HVALS_Keyspecs,1,NULL,1),.args=HVALS_Args}, /* hyperloglog */ {MAKE_CMD("pfadd","Adds elements to a HyperLogLog key. Creates the key if it doesn't exist.","O(1) to add every element.","2.8.9",CMD_DOC_NONE,NULL,NULL,"hyperloglog",COMMAND_GROUP_HYPERLOGLOG,PFADD_History,0,PFADD_Tips,0,pfaddCommand,-2,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HYPERLOGLOG,PFADD_Keyspecs,1,NULL,2),.args=PFADD_Args}, {MAKE_CMD("pfcount","Returns the approximated cardinality of the set(s) observed by the HyperLogLog key(s).","O(1) with a very small average constant time when called with a single key. O(N) with N being the number of keys, and much bigger constant times, when called with multiple keys.","2.8.9",CMD_DOC_NONE,NULL,NULL,"hyperloglog",COMMAND_GROUP_HYPERLOGLOG,PFCOUNT_History,0,PFCOUNT_Tips,0,pfcountCommand,-2,CMD_READONLY|CMD_MAY_REPLICATE,ACL_CATEGORY_HYPERLOGLOG,PFCOUNT_Keyspecs,1,NULL,1),.args=PFCOUNT_Args}, @@ -11256,16 +11256,16 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("blpop","Removes and returns the first element in a list. Blocks until an element is available otherwise. 
Deletes the list if the last element was popped.","O(N) where N is the number of provided keys.","2.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,BLPOP_History,1,BLPOP_Tips,0,blpopCommand,-3,CMD_WRITE|CMD_BLOCKING,ACL_CATEGORY_LIST,BLPOP_Keyspecs,1,NULL,2),.args=BLPOP_Args}, {MAKE_CMD("brpop","Removes and returns the last element in a list. Blocks until an element is available otherwise. Deletes the list if the last element was popped.","O(N) where N is the number of provided keys.","2.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,BRPOP_History,1,BRPOP_Tips,0,brpopCommand,-3,CMD_WRITE|CMD_BLOCKING,ACL_CATEGORY_LIST,BRPOP_Keyspecs,1,NULL,2),.args=BRPOP_Args}, {MAKE_CMD("brpoplpush","Pops an element from a list, pushes it to another list and returns it. Block until an element is available otherwise. Deletes the list if the last element was popped.","O(1)","2.2.0",CMD_DOC_DEPRECATED,"`BLMOVE` with the `RIGHT` and `LEFT` arguments","6.2.0","list",COMMAND_GROUP_LIST,BRPOPLPUSH_History,1,BRPOPLPUSH_Tips,0,brpoplpushCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_BLOCKING,ACL_CATEGORY_LIST,BRPOPLPUSH_Keyspecs,2,NULL,3),.args=BRPOPLPUSH_Args}, -{MAKE_CMD("lindex","Returns an element from a list by its index.","O(N) where N is the number of elements to traverse to get to the element at index. This makes asking for the first or the last element of the list O(1).","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LINDEX_History,0,LINDEX_Tips,0,lindexCommand,3,CMD_READONLY,ACL_CATEGORY_LIST,LINDEX_Keyspecs,1,NULL,2),.args=LINDEX_Args}, +{MAKE_CMD("lindex","Returns an element from a list by its index.","O(N) where N is the number of elements to traverse to get to the element at index. 
This makes asking for the first or the last element of the list O(1).","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LINDEX_History,0,LINDEX_Tips,0,lindexCommand,3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_LIST,LINDEX_Keyspecs,1,NULL,2),.args=LINDEX_Args}, {MAKE_CMD("linsert","Inserts an element before or after another element in a list.","O(N) where N is the number of elements to traverse before seeing the value pivot. This means that inserting somewhere on the left end on the list (head) can be considered O(1) and inserting somewhere on the right end (tail) is O(N).","2.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LINSERT_History,0,LINSERT_Tips,0,linsertCommand,5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_LIST,LINSERT_Keyspecs,1,NULL,4),.args=LINSERT_Args}, -{MAKE_CMD("llen","Returns the length of a list.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LLEN_History,0,LLEN_Tips,0,llenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_LIST,LLEN_Keyspecs,1,NULL,1),.args=LLEN_Args}, +{MAKE_CMD("llen","Returns the length of a list.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LLEN_History,0,LLEN_Tips,0,llenCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_LIST,LLEN_Keyspecs,1,NULL,1),.args=LLEN_Args}, {MAKE_CMD("lmove","Returns an element after popping it from one list and pushing it to another. Deletes the list if the last element was moved.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LMOVE_History,0,LMOVE_Tips,0,lmoveCommand,5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_LIST,LMOVE_Keyspecs,2,NULL,4),.args=LMOVE_Args}, {MAKE_CMD("lmpop","Returns multiple elements from a list after removing them. 
Deletes the list if the last element was popped.","O(N+M) where N is the number of provided keys and M is the number of elements returned.","7.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LMPOP_History,0,LMPOP_Tips,0,lmpopCommand,-4,CMD_WRITE,ACL_CATEGORY_LIST,LMPOP_Keyspecs,1,lmpopGetKeys,4),.args=LMPOP_Args}, {MAKE_CMD("lpop","Returns the first elements in a list after removing it. Deletes the list if the last element was popped.","O(N) where N is the number of elements returned","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPOP_History,1,LPOP_Tips,0,lpopCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_LIST,LPOP_Keyspecs,1,NULL,2),.args=LPOP_Args}, {MAKE_CMD("lpos","Returns the index of matching elements in a list.","O(N) where N is the number of elements in the list, for the average case. When searching for elements near the head or the tail of the list, or when the MAXLEN option is provided, the command may run in constant time.","6.0.6",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPOS_History,0,LPOS_Tips,0,lposCommand,-3,CMD_READONLY,ACL_CATEGORY_LIST,LPOS_Keyspecs,1,NULL,5),.args=LPOS_Args}, {MAKE_CMD("lpush","Prepends one or more elements to a list. 
Creates the key if it doesn't exist.","O(1) for each element added, so O(N) to add N elements when the command is called with multiple arguments.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPUSH_History,1,LPUSH_Tips,0,lpushCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_LIST,LPUSH_Keyspecs,1,NULL,2),.args=LPUSH_Args}, {MAKE_CMD("lpushx","Prepends one or more elements to a list only when the list exists.","O(1) for each element added, so O(N) to add N elements when the command is called with multiple arguments.","2.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPUSHX_History,1,LPUSHX_Tips,0,lpushxCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_LIST,LPUSHX_Keyspecs,1,NULL,2),.args=LPUSHX_Args}, -{MAKE_CMD("lrange","Returns a range of elements from a list.","O(S+N) where S is the distance of start offset from HEAD for small lists, from nearest end (HEAD or TAIL) for large lists; and N is the number of elements in the specified range.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LRANGE_History,0,LRANGE_Tips,0,lrangeCommand,4,CMD_READONLY,ACL_CATEGORY_LIST,LRANGE_Keyspecs,1,NULL,3),.args=LRANGE_Args}, +{MAKE_CMD("lrange","Returns a range of elements from a list.","O(S+N) where S is the distance of start offset from HEAD for small lists, from nearest end (HEAD or TAIL) for large lists; and N is the number of elements in the specified range.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LRANGE_History,0,LRANGE_Tips,0,lrangeCommand,4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_LIST,LRANGE_Keyspecs,1,NULL,3),.args=LRANGE_Args}, {MAKE_CMD("lrem","Removes elements from a list. 
Deletes the list if the last element was removed.","O(N+M) where N is the length of the list and M is the number of elements removed.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LREM_History,0,LREM_Tips,0,lremCommand,4,CMD_WRITE,ACL_CATEGORY_LIST,LREM_Keyspecs,1,NULL,3),.args=LREM_Args}, {MAKE_CMD("lset","Sets the value of an element in a list by its index.","O(N) where N is the length of the list. Setting either the first or the last element of the list is O(1).","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LSET_History,0,LSET_Tips,0,lsetCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_LIST,LSET_Keyspecs,1,NULL,3),.args=LSET_Args}, {MAKE_CMD("ltrim","Removes elements from both ends a list. Deletes the list if all elements were trimmed.","O(N) where N is the number of elements to be removed by the operation.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LTRIM_History,0,LTRIM_Tips,0,ltrimCommand,4,CMD_WRITE,ACL_CATEGORY_LIST,LTRIM_Keyspecs,1,NULL,3),.args=LTRIM_Args}, @@ -11348,36 +11348,36 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("bzpopmax","Removes and returns the member with the highest score from one or more sorted sets. Blocks until a member available otherwise. Deletes the sorted set if the last element was popped.","O(log(N)) with N being the number of elements in the sorted set.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,BZPOPMAX_History,1,BZPOPMAX_Tips,0,bzpopmaxCommand,-3,CMD_WRITE|CMD_FAST|CMD_BLOCKING,ACL_CATEGORY_SORTEDSET,BZPOPMAX_Keyspecs,1,NULL,2),.args=BZPOPMAX_Args}, {MAKE_CMD("bzpopmin","Removes and returns the member with the lowest score from one or more sorted sets. Blocks until a member is available otherwise. 
Deletes the sorted set if the last element was popped.","O(log(N)) with N being the number of elements in the sorted set.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,BZPOPMIN_History,1,BZPOPMIN_Tips,0,bzpopminCommand,-3,CMD_WRITE|CMD_FAST|CMD_BLOCKING,ACL_CATEGORY_SORTEDSET,BZPOPMIN_Keyspecs,1,NULL,2),.args=BZPOPMIN_Args}, {MAKE_CMD("zadd","Adds one or more members to a sorted set, or updates their scores. Creates the key if it doesn't exist.","O(log(N)) for each item added, where N is the number of elements in the sorted set.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZADD_History,3,ZADD_Tips,0,zaddCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZADD_Keyspecs,1,NULL,6),.args=ZADD_Args}, -{MAKE_CMD("zcard","Returns the number of members in a sorted set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCARD_History,0,ZCARD_Tips,0,zcardCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZCARD_Keyspecs,1,NULL,1),.args=ZCARD_Args}, -{MAKE_CMD("zcount","Returns the count of members in a sorted set that have scores within a range.","O(log(N)) with N being the number of elements in the sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCOUNT_History,0,ZCOUNT_Tips,0,zcountCommand,4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZCOUNT_Keyspecs,1,NULL,3),.args=ZCOUNT_Args}, -{MAKE_CMD("zdiff","Returns the difference between multiple sorted sets.","O(L + (N-K)log(N)) worst case where L is the total number of elements in all the sets, N is the size of the first set, and K is the size of the result set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZDIFF_History,0,ZDIFF_Tips,0,zdiffCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZDIFF_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZDIFF_Args}, +{MAKE_CMD("zcard","Returns the number of members in a sorted 
set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCARD_History,0,ZCARD_Tips,0,zcardCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZCARD_Keyspecs,1,NULL,1),.args=ZCARD_Args}, +{MAKE_CMD("zcount","Returns the count of members in a sorted set that have scores within a range.","O(log(N)) with N being the number of elements in the sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCOUNT_History,0,ZCOUNT_Tips,0,zcountCommand,4,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZCOUNT_Keyspecs,1,NULL,3),.args=ZCOUNT_Args}, +{MAKE_CMD("zdiff","Returns the difference between multiple sorted sets.","O(L + (N-K)log(N)) worst case where L is the total number of elements in all the sets, N is the size of the first set, and K is the size of the result set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZDIFF_History,0,ZDIFF_Tips,0,zdiffCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZDIFF_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZDIFF_Args}, {MAKE_CMD("zdiffstore","Stores the difference of multiple sorted sets in a key.","O(L + (N-K)log(N)) worst case where L is the total number of elements in all the sets, N is the size of the first set, and K is the size of the result set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZDIFFSTORE_History,0,ZDIFFSTORE_Tips,0,zdiffstoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZDIFFSTORE_Keyspecs,2,zunionInterDiffStoreGetKeys,3),.args=ZDIFFSTORE_Args}, {MAKE_CMD("zincrby","Increments the score of a member in a sorted set.","O(log(N)) where N is the number of elements in the sorted set.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINCRBY_History,0,ZINCRBY_Tips,0,zincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZINCRBY_Keyspecs,1,NULL,3),.args=ZINCRBY_Args}, -{MAKE_CMD("zinter","Returns the intersect of multiple 
sorted sets.","O(N*K)+O(M*log(M)) worst case with N being the smallest input sorted set, K being the number of input sorted sets and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTER_History,0,ZINTER_Tips,0,zinterCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZINTER_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZINTER_Args}, -{MAKE_CMD("zintercard","Returns the number of members of the intersect of multiple sorted sets.","O(N*K) worst case with N being the smallest input sorted set, K being the number of input sorted sets.","7.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTERCARD_History,0,ZINTERCARD_Tips,0,zinterCardCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZINTERCARD_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZINTERCARD_Args}, +{MAKE_CMD("zinter","Returns the intersect of multiple sorted sets.","O(N*K)+O(M*log(M)) worst case with N being the smallest input sorted set, K being the number of input sorted sets and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTER_History,0,ZINTER_Tips,0,zinterCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZINTER_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZINTER_Args}, +{MAKE_CMD("zintercard","Returns the number of members of the intersect of multiple sorted sets.","O(N*K) worst case with N being the smallest input sorted set, K being the number of input sorted sets.","7.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTERCARD_History,0,ZINTERCARD_Tips,0,zinterCardCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZINTERCARD_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZINTERCARD_Args}, {MAKE_CMD("zinterstore","Stores the intersect of multiple sorted sets in a key.","O(N*K)+O(M*log(M)) worst case with N being the smallest input sorted set, K being the number of input sorted sets and 
M being the number of elements in the resulting sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTERSTORE_History,0,ZINTERSTORE_Tips,0,zinterstoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZINTERSTORE_Keyspecs,2,zunionInterDiffStoreGetKeys,5),.args=ZINTERSTORE_Args}, -{MAKE_CMD("zlexcount","Returns the number of members in a sorted set within a lexicographical range.","O(log(N)) with N being the number of elements in the sorted set.","2.8.9",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZLEXCOUNT_History,0,ZLEXCOUNT_Tips,0,zlexcountCommand,4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZLEXCOUNT_Keyspecs,1,NULL,3),.args=ZLEXCOUNT_Args}, +{MAKE_CMD("zlexcount","Returns the number of members in a sorted set within a lexicographical range.","O(log(N)) with N being the number of elements in the sorted set.","2.8.9",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZLEXCOUNT_History,0,ZLEXCOUNT_Tips,0,zlexcountCommand,4,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZLEXCOUNT_Keyspecs,1,NULL,3),.args=ZLEXCOUNT_Args}, {MAKE_CMD("zmpop","Returns the highest- or lowest-scoring members from one or more sorted sets after removing them. 
Deletes the sorted set if the last member was popped.","O(K) + O(M*log(N)) where K is the number of provided keys, N being the number of elements in the sorted set, and M being the number of elements popped.","7.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZMPOP_History,0,ZMPOP_Tips,0,zmpopCommand,-4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZMPOP_Keyspecs,1,zmpopGetKeys,4),.args=ZMPOP_Args}, -{MAKE_CMD("zmscore","Returns the score of one or more members in a sorted set.","O(N) where N is the number of members being requested.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZMSCORE_History,0,ZMSCORE_Tips,0,zmscoreCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZMSCORE_Keyspecs,1,NULL,2),.args=ZMSCORE_Args}, +{MAKE_CMD("zmscore","Returns the score of one or more members in a sorted set.","O(N) where N is the number of members being requested.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZMSCORE_History,0,ZMSCORE_Tips,0,zmscoreCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZMSCORE_Keyspecs,1,NULL,2),.args=ZMSCORE_Args}, {MAKE_CMD("zpopmax","Returns the highest-scoring members from a sorted set after removing them. Deletes the sorted set if the last member was popped.","O(log(N)*M) with N being the number of elements in the sorted set, and M being the number of elements popped.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZPOPMAX_History,0,ZPOPMAX_Tips,0,zpopmaxCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZPOPMAX_Keyspecs,1,NULL,2),.args=ZPOPMAX_Args}, {MAKE_CMD("zpopmin","Returns the lowest-scoring members from a sorted set after removing them. 
Deletes the sorted set if the last member was popped.","O(log(N)*M) with N being the number of elements in the sorted set, and M being the number of elements popped.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZPOPMIN_History,0,ZPOPMIN_Tips,0,zpopminCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZPOPMIN_Keyspecs,1,NULL,2),.args=ZPOPMIN_Args}, -{MAKE_CMD("zrandmember","Returns one or more random members from a sorted set.","O(N) where N is the number of members returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANDMEMBER_History,0,ZRANDMEMBER_Tips,1,zrandmemberCommand,-2,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANDMEMBER_Keyspecs,1,NULL,2),.args=ZRANDMEMBER_Args}, -{MAKE_CMD("zrange","Returns members in a sorted set within a range of indexes.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGE_History,1,ZRANGE_Tips,0,zrangeCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANGE_Keyspecs,1,NULL,7),.args=ZRANGE_Args}, -{MAKE_CMD("zrangebylex","Returns members in a sorted set within a lexicographical range.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYLEX` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYLEX_History,0,ZRANGEBYLEX_Tips,0,zrangebylexCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZRANGEBYLEX_Args}, -{MAKE_CMD("zrangebyscore","Returns members in a sorted set within a range of scores.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","1.0.5",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYSCORE` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYSCORE_History,1,ZRANGEBYSCORE_Tips,0,zrangebyscoreCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZRANGEBYSCORE_Args}, +{MAKE_CMD("zrandmember","Returns one or more random members from a sorted set.","O(N) where N is the number of members returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANDMEMBER_History,0,ZRANDMEMBER_Tips,1,zrandmemberCommand,-2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANDMEMBER_Keyspecs,1,NULL,2),.args=ZRANDMEMBER_Args}, +{MAKE_CMD("zrange","Returns members in a sorted set within a range of indexes.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGE_History,1,ZRANGE_Tips,0,zrangeCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANGE_Keyspecs,1,NULL,7),.args=ZRANGE_Args}, +{MAKE_CMD("zrangebylex","Returns members in a sorted set within a lexicographical range.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYLEX` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYLEX_History,0,ZRANGEBYLEX_Tips,0,zrangebylexCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZRANGEBYLEX_Args}, +{MAKE_CMD("zrangebyscore","Returns members in a sorted set within a range of scores.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","1.0.5",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYSCORE` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYSCORE_History,1,ZRANGEBYSCORE_Tips,0,zrangebyscoreCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZRANGEBYSCORE_Args}, {MAKE_CMD("zrangestore","Stores a range of members from sorted set in a key.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements stored into the destination key.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGESTORE_History,0,ZRANGESTORE_Tips,0,zrangestoreCommand,-5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZRANGESTORE_Keyspecs,2,NULL,7),.args=ZRANGESTORE_Args}, -{MAKE_CMD("zrank","Returns the index of a member in a sorted set ordered by ascending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANK_History,1,ZRANK_Tips,0,zrankCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZRANK_Keyspecs,1,NULL,3),.args=ZRANK_Args}, +{MAKE_CMD("zrank","Returns the index of a member in a sorted set ordered by ascending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANK_History,1,ZRANK_Tips,0,zrankCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANK_Keyspecs,1,NULL,3),.args=ZRANK_Args}, {MAKE_CMD("zrem","Removes one or more members from a sorted set. Deletes the sorted set if all members were removed.","O(M*log(N)) with N being the number of elements in the sorted set and M the number of elements to be removed.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREM_History,1,ZREM_Tips,0,zremCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZREM_Keyspecs,1,NULL,2),.args=ZREM_Args}, {MAKE_CMD("zremrangebylex","Removes members in a sorted set within a lexicographical range. 
Deletes the sorted set if all members were removed.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements removed by the operation.","2.8.9",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREMRANGEBYLEX_History,0,ZREMRANGEBYLEX_Tips,0,zremrangebylexCommand,4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZREMRANGEBYLEX_Keyspecs,1,NULL,3),.args=ZREMRANGEBYLEX_Args}, {MAKE_CMD("zremrangebyrank","Removes members in a sorted set within a range of indexes. Deletes the sorted set if all members were removed.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements removed by the operation.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREMRANGEBYRANK_History,0,ZREMRANGEBYRANK_Tips,0,zremrangebyrankCommand,4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZREMRANGEBYRANK_Keyspecs,1,NULL,3),.args=ZREMRANGEBYRANK_Args}, {MAKE_CMD("zremrangebyscore","Removes members in a sorted set within a range of scores. 
Deletes the sorted set if all members were removed.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements removed by the operation.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREMRANGEBYSCORE_History,0,ZREMRANGEBYSCORE_Tips,0,zremrangebyscoreCommand,4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZREMRANGEBYSCORE_Keyspecs,1,NULL,3),.args=ZREMRANGEBYSCORE_Args}, -{MAKE_CMD("zrevrange","Returns members in a sorted set within a range of indexes in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGE_History,0,ZREVRANGE_Tips,0,zrevrangeCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZREVRANGE_Keyspecs,1,NULL,4),.args=ZREVRANGE_Args}, -{MAKE_CMD("zrevrangebylex","Returns members in a sorted set within a lexicographical range in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYLEX` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYLEX_History,0,ZREVRANGEBYLEX_Tips,0,zrevrangebylexCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZREVRANGEBYLEX_Args}, -{MAKE_CMD("zrevrangebyscore","Returns members in a sorted set within a range of scores in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYSCORE` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYSCORE_History,1,ZREVRANGEBYSCORE_Tips,0,zrevrangebyscoreCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZREVRANGEBYSCORE_Args}, -{MAKE_CMD("zrevrank","Returns the index of a member in a sorted set ordered by descending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANK_History,1,ZREVRANK_Tips,0,zrevrankCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZREVRANK_Keyspecs,1,NULL,3),.args=ZREVRANK_Args}, -{MAKE_CMD("zscan","Iterates over members and scores of a sorted set.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCAN_History,1,ZSCAN_Tips,1,zscanCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZSCAN_Keyspecs,1,NULL,5),.args=ZSCAN_Args}, -{MAKE_CMD("zscore","Returns the score of a member in a sorted set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCORE_History,0,ZSCORE_Tips,0,zscoreCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZSCORE_Keyspecs,1,NULL,2),.args=ZSCORE_Args}, -{MAKE_CMD("zunion","Returns the union of multiple sorted sets.","O(N)+O(M*log(M)) with N being the sum of the sizes of the input sorted sets, and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZUNION_History,0,ZUNION_Tips,0,zunionCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZUNION_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZUNION_Args}, +{MAKE_CMD("zrevrange","Returns members in a sorted set within a range of indexes in reverse order.","O(log(N)+M) with N being the 
number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGE_History,0,ZREVRANGE_Tips,0,zrevrangeCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANGE_Keyspecs,1,NULL,4),.args=ZREVRANGE_Args}, +{MAKE_CMD("zrevrangebylex","Returns members in a sorted set within a lexicographical range in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYLEX` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYLEX_History,0,ZREVRANGEBYLEX_Tips,0,zrevrangebylexCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZREVRANGEBYLEX_Args}, +{MAKE_CMD("zrevrangebyscore","Returns members in a sorted set within a range of scores in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYSCORE` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYSCORE_History,1,ZREVRANGEBYSCORE_Tips,0,zrevrangebyscoreCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZREVRANGEBYSCORE_Args}, +{MAKE_CMD("zrevrank","Returns the index of a member in a sorted set ordered by descending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANK_History,1,ZREVRANK_Tips,0,zrevrankCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANK_Keyspecs,1,NULL,3),.args=ZREVRANK_Args}, +{MAKE_CMD("zscan","Iterates over members and scores of a sorted set.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCAN_History,1,ZSCAN_Tips,1,zscanCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZSCAN_Keyspecs,1,NULL,5),.args=ZSCAN_Args}, +{MAKE_CMD("zscore","Returns the score of a member in a sorted set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCORE_History,0,ZSCORE_Tips,0,zscoreCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZSCORE_Keyspecs,1,NULL,2),.args=ZSCORE_Args}, +{MAKE_CMD("zunion","Returns the union of multiple sorted sets.","O(N)+O(M*log(M)) with N being the sum of the sizes of the input sorted sets, and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZUNION_History,0,ZUNION_Tips,0,zunionCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZUNION_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZUNION_Args}, {MAKE_CMD("zunionstore","Stores 
the union of multiple sorted sets in a key.","O(N)+O(M log(M)) with N being the sum of the sizes of the input sorted sets, and M being the number of elements in the resulting sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZUNIONSTORE_History,0,ZUNIONSTORE_Tips,0,zunionstoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZUNIONSTORE_Keyspecs,2,zunionInterDiffStoreGetKeys,5),.args=ZUNIONSTORE_Args}, /* stream */ {MAKE_CMD("xack","Returns the number of messages that were successfully acknowledged by the consumer group member of a stream.","O(1) for each message ID processed.","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XACK_History,0,XACK_Tips,0,xackCommand,-4,CMD_WRITE|CMD_FAST,ACL_CATEGORY_STREAM,XACK_Keyspecs,1,NULL,3),.args=XACK_Args}, @@ -11400,7 +11400,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("decr","Decrements the integer value of a key by one. Uses 0 as initial value if the key doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,DECR_History,0,DECR_Tips,0,decrCommand,2,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STRING,DECR_Keyspecs,1,NULL,1),.args=DECR_Args}, {MAKE_CMD("decrby","Decrements a number from the integer value of a key. 
Uses 0 as initial value if the key doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,DECRBY_History,0,DECRBY_Tips,0,decrbyCommand,3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STRING,DECRBY_Keyspecs,1,NULL,2),.args=DECRBY_Args}, {MAKE_CMD("delifeq","Delete key if value matches string.","O(1)","9.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,DELIFEQ_History,0,DELIFEQ_Tips,0,delifeqCommand,3,CMD_FAST|CMD_WRITE,ACL_CATEGORY_STRING,DELIFEQ_Keyspecs,1,NULL,2),.args=DELIFEQ_Args}, -{MAKE_CMD("get","Returns the string value of a key.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GET_History,0,GET_Tips,0,getCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_STRING,GET_Keyspecs,1,NULL,1),.args=GET_Args}, +{MAKE_CMD("get","Returns the string value of a key.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GET_History,0,GET_Tips,0,getCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_STRING,GET_Keyspecs,1,NULL,1),.args=GET_Args}, {MAKE_CMD("getdel","Returns the string value of a key after deleting the key.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GETDEL_History,0,GETDEL_Tips,0,getdelCommand,2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_STRING,GETDEL_Keyspecs,1,NULL,1),.args=GETDEL_Args}, {MAKE_CMD("getex","Returns the string value of a key after setting its expiration time.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GETEX_History,0,GETEX_Tips,0,getexCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_STRING,GETEX_Keyspecs,1,NULL,2),.args=GETEX_Args}, {MAKE_CMD("getrange","Returns a substring of the string stored at a key.","O(N) where N is the length of the returned string. 
The complexity is ultimately determined by the returned length, but because creating a substring from an existing string is very cheap, it can be considered O(1) for small strings.","2.4.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GETRANGE_History,0,GETRANGE_Tips,0,getrangeCommand,4,CMD_READONLY,ACL_CATEGORY_STRING,GETRANGE_Keyspecs,1,NULL,3),.args=GETRANGE_Args}, diff --git a/src/commands/get.json b/src/commands/get.json index 693c1ac823..f0153f9505 100644 --- a/src/commands/get.json +++ b/src/commands/get.json @@ -8,7 +8,8 @@ "function": "getCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "STRING" diff --git a/src/commands/hexists.json b/src/commands/hexists.json index f5ea405718..10d028bddd 100644 --- a/src/commands/hexists.json +++ b/src/commands/hexists.json @@ -8,7 +8,8 @@ "function": "hexistsCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hget.json b/src/commands/hget.json index a041143ec8..01b0bed85b 100644 --- a/src/commands/hget.json +++ b/src/commands/hget.json @@ -8,7 +8,8 @@ "function": "hgetCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hgetall.json b/src/commands/hgetall.json index 9bbf835a34..2e81ee24e6 100644 --- a/src/commands/hgetall.json +++ b/src/commands/hgetall.json @@ -7,7 +7,8 @@ "arity": 2, "function": "hgetallCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hkeys.json b/src/commands/hkeys.json index 917df1c9eb..d72e73459f 100644 --- a/src/commands/hkeys.json +++ b/src/commands/hkeys.json @@ -7,7 +7,8 @@ "arity": 2, "function": "hkeysCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hlen.json b/src/commands/hlen.json index 
d4c13ac116..db0f364687 100644 --- a/src/commands/hlen.json +++ b/src/commands/hlen.json @@ -8,7 +8,8 @@ "function": "hlenCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hmget.json b/src/commands/hmget.json index 73fa9c311f..fa96abcbd8 100644 --- a/src/commands/hmget.json +++ b/src/commands/hmget.json @@ -8,7 +8,8 @@ "function": "hmgetCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hrandfield.json b/src/commands/hrandfield.json index 83abc74a9d..f59279b4a4 100644 --- a/src/commands/hrandfield.json +++ b/src/commands/hrandfield.json @@ -7,7 +7,8 @@ "arity": -2, "function": "hrandfieldCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hscan.json b/src/commands/hscan.json index 9e6099c2f2..648c9b5a44 100644 --- a/src/commands/hscan.json +++ b/src/commands/hscan.json @@ -7,7 +7,8 @@ "arity": -3, "function": "hscanCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hstrlen.json b/src/commands/hstrlen.json index 82ac6dbe48..6aeb2f3301 100644 --- a/src/commands/hstrlen.json +++ b/src/commands/hstrlen.json @@ -8,7 +8,8 @@ "function": "hstrlenCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/hvals.json b/src/commands/hvals.json index 55aeaaff92..42061c18b8 100644 --- a/src/commands/hvals.json +++ b/src/commands/hvals.json @@ -7,7 +7,8 @@ "arity": 2, "function": "hvalsCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "HASH" diff --git a/src/commands/lindex.json b/src/commands/lindex.json index a589d52fc9..5e19ea8b36 100644 --- a/src/commands/lindex.json +++ b/src/commands/lindex.json @@ -7,7 +7,8 @@ 
"arity": 3, "function": "lindexCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "LIST" diff --git a/src/commands/llen.json b/src/commands/llen.json index 846aa40867..ac743d5cf7 100644 --- a/src/commands/llen.json +++ b/src/commands/llen.json @@ -8,7 +8,8 @@ "function": "llenCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "LIST" diff --git a/src/commands/lrange.json b/src/commands/lrange.json index 303d2f60b8..95ca2e5128 100644 --- a/src/commands/lrange.json +++ b/src/commands/lrange.json @@ -7,7 +7,8 @@ "arity": 4, "function": "lrangeCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "LIST" diff --git a/src/commands/zcard.json b/src/commands/zcard.json index 58683a4874..11bf3052f6 100644 --- a/src/commands/zcard.json +++ b/src/commands/zcard.json @@ -8,7 +8,8 @@ "function": "zcardCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zcount.json b/src/commands/zcount.json index 0fdebd7dff..721f7f7e79 100644 --- a/src/commands/zcount.json +++ b/src/commands/zcount.json @@ -8,7 +8,8 @@ "function": "zcountCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zdiff.json b/src/commands/zdiff.json index 912d5c6d05..c49b8d5c61 100644 --- a/src/commands/zdiff.json +++ b/src/commands/zdiff.json @@ -8,7 +8,8 @@ "function": "zdiffCommand", "get_keys_function": "zunionInterDiffGetKeys", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zinter.json b/src/commands/zinter.json index 4828e21d6c..abef66f8fc 100644 --- a/src/commands/zinter.json +++ b/src/commands/zinter.json @@ -8,7 +8,8 @@ "function": "zinterCommand", "get_keys_function": "zunionInterDiffGetKeys", 
"command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zintercard.json b/src/commands/zintercard.json index 7fdab3ed64..b02fb93db0 100644 --- a/src/commands/zintercard.json +++ b/src/commands/zintercard.json @@ -8,7 +8,8 @@ "function": "zinterCardCommand", "get_keys_function": "zunionInterDiffGetKeys", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zlexcount.json b/src/commands/zlexcount.json index 8bf2884c93..83d0585daa 100644 --- a/src/commands/zlexcount.json +++ b/src/commands/zlexcount.json @@ -8,7 +8,8 @@ "function": "zlexcountCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zmscore.json b/src/commands/zmscore.json index 6a036fe0be..a8013f7c3b 100644 --- a/src/commands/zmscore.json +++ b/src/commands/zmscore.json @@ -8,7 +8,8 @@ "function": "zmscoreCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrandmember.json b/src/commands/zrandmember.json index 13abc9aa3c..193daf8d1d 100644 --- a/src/commands/zrandmember.json +++ b/src/commands/zrandmember.json @@ -7,7 +7,8 @@ "arity": -2, "function": "zrandmemberCommand", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrange.json b/src/commands/zrange.json index dc7af8dc14..a2e37c92ec 100644 --- a/src/commands/zrange.json +++ b/src/commands/zrange.json @@ -13,7 +13,8 @@ ] ], "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrangebylex.json b/src/commands/zrangebylex.json index 5949b87166..1d05e93f7a 100644 --- a/src/commands/zrangebylex.json +++ b/src/commands/zrangebylex.json @@ -12,7 +12,8 @@ "DEPRECATED" ], "command_flags": [ 
- "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrangebyscore.json b/src/commands/zrangebyscore.json index c89607e104..9b1b41ff75 100644 --- a/src/commands/zrangebyscore.json +++ b/src/commands/zrangebyscore.json @@ -18,7 +18,8 @@ "DEPRECATED" ], "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrank.json b/src/commands/zrank.json index f5f427c66d..8f3048d3a4 100644 --- a/src/commands/zrank.json +++ b/src/commands/zrank.json @@ -14,7 +14,8 @@ ], "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrange.json b/src/commands/zrevrange.json index a143f72153..1bce02766c 100644 --- a/src/commands/zrevrange.json +++ b/src/commands/zrevrange.json @@ -12,7 +12,8 @@ "DEPRECATED" ], "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrangebylex.json b/src/commands/zrevrangebylex.json index d1d8100d1e..f28b9a71e4 100644 --- a/src/commands/zrevrangebylex.json +++ b/src/commands/zrevrangebylex.json @@ -12,7 +12,8 @@ "DEPRECATED" ], "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrangebyscore.json b/src/commands/zrevrangebyscore.json index 0eb9e86956..5e88f0c9ca 100644 --- a/src/commands/zrevrangebyscore.json +++ b/src/commands/zrevrangebyscore.json @@ -18,7 +18,8 @@ "DEPRECATED" ], "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrank.json b/src/commands/zrevrank.json index 39897cae33..c529391bfa 100644 --- a/src/commands/zrevrank.json +++ b/src/commands/zrevrank.json @@ -14,7 +14,8 @@ ], "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git 
a/src/commands/zscan.json b/src/commands/zscan.json index 7948e393a5..477468476f 100644 --- a/src/commands/zscan.json +++ b/src/commands/zscan.json @@ -13,7 +13,8 @@ ] ], "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zscore.json b/src/commands/zscore.json index 502247051e..6da7a38058 100644 --- a/src/commands/zscore.json +++ b/src/commands/zscore.json @@ -8,7 +8,8 @@ "function": "zscoreCommand", "command_flags": [ "READONLY", - "FAST" + "FAST", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zunion.json b/src/commands/zunion.json index 1ce3dc5ee1..26bf0b1605 100644 --- a/src/commands/zunion.json +++ b/src/commands/zunion.json @@ -8,7 +8,8 @@ "function": "zunionCommand", "get_keys_function": "zunionInterDiffGetKeys", "command_flags": [ - "READONLY" + "READONLY", + "CAN_BE_OFFLOADED" ], "acl_categories": [ "SORTEDSET" diff --git a/src/config.c b/src/config.c index c17bd72122..cb0055c795 100644 --- a/src/config.c +++ b/src/config.c @@ -3267,6 +3267,8 @@ standardConfig static_configs[] = { createIntConfig("min-string-size-avoid-copy-reply", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.min_string_size_copy_avoid, 16384, INTEGER_CONFIG, NULL, NULL), createIntConfig("min-string-size-avoid-copy-reply-threaded", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.min_string_size_copy_avoid_threaded, 65536, INTEGER_CONFIG, NULL, NULL), createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG, 0, 128, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL), + createBoolConfig("io-threads-do-commands-offloading", NULL, MODIFIABLE_CONFIG, server.io_threads_do_commands_offloading, 1, NULL, NULL), /* Command offloading enabled by default */ + createBoolConfig("io-threads-do-commands-offloading-with-modules", NULL, MODIFIABLE_CONFIG, server.io_threads_do_commands_offloading_with_modules, 0, NULL, NULL), /* Module command 
offloading disabled by default */ createIntConfig("auto-aof-rewrite-percentage", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.aof_rewrite_perc, 100, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-replica-validity-factor", "cluster-slave-validity-factor", MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_replica_validity_factor, 10, INTEGER_CONFIG, NULL, NULL), /* replica max data age factor. */ createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), diff --git a/src/db.c b/src/db.c index aac384b535..8a6b6d53a9 100644 --- a/src/db.c +++ b/src/db.c @@ -124,8 +124,10 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { /* Update the access time for the ageing algorithm. * Don't do it if we have a saving child, as this will trigger * a copy on write madness. */ - if (server.current_client && server.current_client->flag.no_touch && - server.executing_client->cmd->proc != touchCommand) + client *current_client = getCurrentClient(); + client *executing_client = getExecutingClient(); + if (current_client && current_client->flag.no_touch && + executing_client && executing_client->cmd->proc != touchCommand) flags |= LOOKUP_NOTOUCH; if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { /* Shared objects can't be stored in the database. */ @@ -243,7 +245,7 @@ static int getKVStoreIndexForKey(sds key) { } /* Returns the cluster hash slot for a given key, trying to use the cached slot that - * stored on the server.current_client first. If there is no cached value, it will compute the hash slot + * stored on the current_client first. 
If there is no cached value, it will compute the hash slot * and then cache the value.*/ int getKeySlot(sds key) { serverAssert(server.cluster_enabled); @@ -257,18 +259,19 @@ int getKeySlot(sds key) { * Modules and scripts executed on the primary may get replicated as multi-execs that operate on multiple slots, * so we must always recompute the slot for commands coming from the primary. */ - if (server.current_client && server.current_client->slot >= 0 && server.current_client->flag.executing_command && - !server.current_client->flag.primary) { - debugServerAssertWithInfo(server.current_client, NULL, - (int)keyHashSlot(key, (int)sdslen(key)) == server.current_client->slot); - return server.current_client->slot; + client *current_client = getCurrentClient(); + if (current_client && current_client->slot >= 0 && current_client->flag.executing_command && + !current_client->flag.primary) { + debugServerAssertWithInfo(current_client, NULL, + (int)keyHashSlot(key, (int)sdslen(key)) == current_client->slot); + return current_client->slot; } int slot = keyHashSlot(key, (int)sdslen(key)); /* For the case of replicated commands from primary, getNodeByQuery() never gets called, * and thus c->slot never gets populated. That said, if this command ends up accessing a key, * we are able to backfill c->slot here, where the key's hash calculation is made. 
*/ - if (server.current_client && server.current_client->flag.primary) { - server.current_client->slot = slot; + if (current_client && current_client->flag.primary) { + current_client->slot = slot; } return slot; } @@ -1815,7 +1818,7 @@ long long getExpire(serverDb *db, robj *key) { void deleteExpiredKeyAndPropagateWithDictIndex(serverDb *db, robj *keyobj, int dict_index) { mstime_t expire_latency; latencyStartMonitor(expire_latency); - dbGenericDeleteWithDictIndex(db, keyobj, server.lazyfree_lazy_expire, DB_FLAG_KEY_EXPIRED, dict_index); + if (!dbGenericDeleteWithDictIndex(db, keyobj, server.lazyfree_lazy_expire, DB_FLAG_KEY_EXPIRED, dict_index)) return; latencyEndMonitor(expire_latency); latencyAddSampleIfNeeded("expire-del", expire_latency); latencyTraceIfNeeded(db, expire_del, expire_latency); @@ -1894,7 +1897,8 @@ static int objectIsExpired(robj *val) { if (server.loading) return 0; if (!timestampIsExpired(objectGetExpire(val))) return 0; if (server.primary_host == NULL && server.import_mode) { - if (server.current_client && server.current_client->flag.import_source) return 0; + client *current_client = getCurrentClient(); + if (current_client && current_client->flag.import_source) return 0; } return 1; } @@ -1912,7 +1916,8 @@ static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { /* See expireIfNeededWithDictIndex for more details. 
*/ if (server.primary_host == NULL && server.import_mode) { - if (server.current_client && server.current_client->flag.import_source) return 0; + client *current_client = getCurrentClient(); + if (current_client && current_client->flag.import_source) return 0; } return 1; } @@ -1923,6 +1928,24 @@ int keyIsExpired(serverDb *db, robj *key) { return keyIsExpiredWithDictIndex(db, key, dict_index); } +typedef struct postpone_expired_key_ctx { + int dict_index; + serverDb *db; + robj *key; +} postpone_expired_key_ctx; + +static void handlePostponeExpiredKey(void *data) { + postpone_expired_key_ctx *ctx = (postpone_expired_key_ctx *)data; + + enterExecutionUnit(1, 0); + + deleteExpiredKeyAndPropagateWithDictIndex(ctx->db, ctx->key, ctx->dict_index); + decrRefCount(ctx->key); + + exitExecutionUnit(); + postExecutionUnitOperations(); +} + /* val is optional. Pass NULL if val is not yet fetched from the database. */ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, int flags, int dict_index) { if (server.lazy_expire_disabled) return KEY_VALID; @@ -1945,8 +1968,9 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, * * When replicating commands from the primary, keys are never considered * expired. */ + client *current_client = getCurrentClient(); if (server.primary_host != NULL) { - if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; + if (current_client && (current_client->flag.primary)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } else if (server.import_mode) { /* If we are running in the import mode on a primary, instead of @@ -1965,7 +1989,7 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, * * When receiving commands from the import source, keys are never considered * expired. 
*/ - if (server.current_client && (server.current_client->flag.import_source)) return KEY_VALID; + if (current_client && (current_client->flag.import_source)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } @@ -1983,6 +2007,15 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, if (static_key) { key = createStringObject(key->ptr, sdslen(key->ptr)); } + + /* If not in main-thread postpone key deletion */ + if (!inMainThread()) { + postpone_expired_key_ctx ctx = {.dict_index = dict_index, .db = db, .key = key}; + if (!static_key) incrRefCount(key); + threadAddDelayedJob(dict_index, handlePostponeExpiredKey, sizeof(ctx), &ctx); + return KEY_EXPIRED; + } + /* Delete the key */ deleteExpiredKeyAndPropagateWithDictIndex(db, key, dict_index); if (static_key) { diff --git a/src/debug.c b/src/debug.c index 6dfd6b10ec..e4098bc99c 100644 --- a/src/debug.c +++ b/src/debug.c @@ -2222,8 +2222,8 @@ void printCrashReport(void) { logServerInfo(); /* Log the current client */ - logCurrentClient(server.current_client, "CURRENT"); - logCurrentClient(server.executing_client, "EXECUTING"); + logCurrentClient(getCurrentClient(), "CURRENT"); + logCurrentClient(getExecutingClient(), "EXECUTING"); /* Log modules info. Something we wanna do last since we fear it may crash. 
*/ logModulesInfo(); diff --git a/src/io_threads.c b/src/io_threads.c index b1ee01066a..fdeee20c95 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -5,13 +5,156 @@ */ #include "io_threads.h" +#include "server.h" +#include "cluster.h" +#include "cluster_legacy.h" +#include "cluster_slot_stats.h" +#include "module.h" static __thread int thread_id = 0; /* Thread local var */ static pthread_t io_threads[IO_THREADS_MAX_NUM] = {0}; static pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; +static int cur_epoll_thread = 0; + +/****************************************************************************************************** + * Multi-Producer Single-Consumer Queue Implementation + * This queue allows multiple producer threads to safely enqueue items + * that will be consumed by a single consumer thread. It's designed for + * passing messages from IO-threads to the Main-thread. + *******************************************************************************************************/ + +/* Multi-Producer Single-Consumer Queue structure + * Cache line padding (64 bytes) is used to prevent false sharing between counter */ +typedef struct IoToMTQueue { + size_t capacity __attribute__((aligned(64))); /* Total queue capacity */ + _Atomic uint64_t producer_counter __attribute__((aligned(64))); /* Shared counter for producers */ + _Atomic uint64_t producer_limit __attribute__((aligned(64))); /* Upper bound for producers */ + uint64_t consumer_counter __attribute__((aligned(64))); /* Consumer position */ + volatile uint64_t entries[]; /* Flexible array of queue entries */ +} IoToMTQueue; + +typedef struct MPSCPendingResponse { + uint64_t value; + uint64_t counter; +} MPSCPendingResponse; + +/* Global queue for completed I/O jobs */ +static IoToMTQueue *io_to_mt_queue = NULL; + +/* Thread-local queue for values that couldn't be immediately written to the MPSC queue */ +static __thread list thread_pending_responses = {0}; + +/* Default queue size */ +#define 
DEFAULT_MPSC_QUEUE_SIZE_PER_THREAD 1024 + +/* Creates a new MPSC queue with the specified capacity */ +static IoToMTQueue *IoToMTQueueCreate(size_t capacity) { + IoToMTQueue *queue = (IoToMTQueue *)zcalloc(sizeof(IoToMTQueue) + (sizeof(uint64_t) * capacity)); + queue->capacity = capacity; + queue->producer_limit = capacity - 1; /* 0 based index */ + return queue; +} + +/* Adds an item to the queue (producer operation) + * + * This function attempts to add a value to the IO-to-MT thread queue. If the queue + * is full the value is stored in an unposted list to be tried again later. + * + * value - The main value to enqueue in the queue + * counter - Position in the queue where the value should be written. If 0, a new + * position is allocated. + * + * Returns: + * 1 - If the value was successfully added to the queue + * 0 - If the queue was full and the value couldn't be added + */ +static int IoToMTQueueProduce(uint64_t value, uint64_t counter) { + IoToMTQueue *q = io_to_mt_queue; + int first_try = counter == 0; + /* Get the next producer slot if no slot is given */ + if (first_try) { + counter = atomic_fetch_add_explicit(&q->producer_counter, 1, memory_order_relaxed); + } + + /* Calculate the actual index in the ring buffer */ + uint64_t index = counter % q->capacity; + + /* Try to write the value. 
*/ + if (atomic_load_explicit(&q->producer_limit, memory_order_acquire) >= counter) { + atomic_thread_fence(memory_order_release); + q->entries[index] = value; + return 1; + } + + if (!first_try) return 0; + /* If the queue is full, store the value to be written later, IO thread should never busy wait for the MT to avoid dead lock */ + MPSCPendingResponse *pending_response = zmalloc(sizeof(MPSCPendingResponse)); + pending_response->value = value; + pending_response->counter = counter; + listAddNodeTail(&thread_pending_responses, pending_response); + return 0; +} + +static void handleThreadPendingResponses(void) { + if (listLength(&thread_pending_responses) == 0) return; + listIter li; + listNode *ln; + + listRewind(&thread_pending_responses, &li); + while ((ln = listNext(&li))) { + MPSCPendingResponse *pending_response = listNodeValue(ln); + if (IoToMTQueueProduce(pending_response->value, pending_response->counter) == 0) break; + listDelNode(&thread_pending_responses, ln); + } +} + +/* Consumes multiple items from the queue (consumer operation) + * Returns Number of items actually consumed */ +static int IoToMTQueueConsumeBatch(int max_items, uint64_t *values) { + IoToMTQueue *q = io_to_mt_queue; + int consumed_count = 0; + /* Try to consume up to max_items */ + for (int i = 0; i < max_items; i++) { + /* Get the current consumer position */ + size_t index = q->consumer_counter % q->capacity; + + uint64_t val = q->entries[index]; + + if (!val) break; + + /* Store the consumed values in the output arrays */ + values[consumed_count] = val; + consumed_count++; + + /* Clear the entry to mark it as consumed */ + q->entries[index] = 0; + + /* Move to the next position */ + q->consumer_counter++; + } + + /* If we consumed any items, update the producer limit */ + if (consumed_count > 0) { + /* Release so that the threads see the NULL assignments */ + atomic_store_explicit(&q->producer_limit, q->producer_limit + consumed_count, memory_order_release); + /* Acquire to get the
latest thread changes */ + atomic_thread_fence(memory_order_acquire); + } + + return consumed_count; +} +/* End of IO to MT MSPC queue functions */ + + +/****************************************************************************************************** + * Single-Producer Single-Consumer Queue Implementation + * This queue allows a single producer thread (main thread) to safely enqueue items + * that will be consumed by a single consumer thread (IO thread). + * The queue uses a ring buffer with head and tail pointers to track the producer + * and consumer positions respectively, ensuring thread-safety through atomic operations. + *******************************************************************************************************/ /* IO jobs queue functions - Used to send jobs from the main-thread to the IO thread. */ -typedef void (*job_handler)(void *); typedef struct iojob { job_handler handler; void *data; @@ -20,6 +163,8 @@ typedef struct iojob { typedef struct IOJobQueue { iojob *ring_buffer; size_t size; + volatile bool pending_epoll_job; + size_t submitted_jobs_count; /* Number of jobs submitted, accessed by the main-thread only. */ _Atomic size_t head __attribute__((aligned(CACHE_LINE_SIZE))); /* Next write index for producer (main-thread) */ _Atomic size_t tail __attribute__((aligned(CACHE_LINE_SIZE))); /* Next read index for consumer (IO-thread) */ } IOJobQueue; @@ -32,6 +177,8 @@ static void IOJobQueue_init(IOJobQueue *jq, size_t item_count) { jq->size = item_count; /* Total number of items */ jq->head = 0; jq->tail = 0; + jq->submitted_jobs_count = 0; + jq->pending_epoll_job = false; } /* Clean up the job queue and free allocated memory. 
*/ @@ -41,14 +188,29 @@ static void IOJobQueue_cleanup(IOJobQueue *jq) { memset(jq, 0, sizeof(*jq)); } -static int IOJobQueue_isFull(const IOJobQueue *jq) { +static int IOJobQueue_isFull(IOJobQueue *jq) { debugServerAssertWithInfo(NULL, NULL, inMainThread()); + + /* Fast path: If submitted jobs are less than the queue size, the queue can't be full */ + if (jq->submitted_jobs_count < (jq->size - 1)) { + return 0; /* Fast path Submitted jobs are less than the queue size, the queue can't be full. */ + } + size_t current_head = atomic_load_explicit(&jq->head, memory_order_relaxed); /* We don't use memory_order_acquire for the tail due to performance reasons, * In the worst case we will just assume wrongly the buffer is full and the main thread will do the job by itself. */ size_t current_tail = atomic_load_explicit(&jq->tail, memory_order_relaxed); size_t next_head = (current_head + 1) % jq->size; - return next_head == current_tail; + if (next_head == current_tail) { + /* Queue is full */ + serverAssert(jq->submitted_jobs_count == jq->size - 1); + return 1; + } else { + /* Queue is not full, update the submitted_jobs_count */ + size_t free_slots = (current_tail >= next_head) ? (current_tail - next_head) : (jq->size - (next_head - current_tail)); + jq->submitted_jobs_count = jq->size - free_slots - 1; + return 0; + } } /* Attempt to push a new job to the queue from the main thread. @@ -70,6 +232,7 @@ static void IOJobQueue_push(IOJobQueue *jq, job_handler handler, void *data) { /* memory_order_release to make sure the data is visible to the consumer (the IO thread). */ atomic_store_explicit(&jq->head, next_head, memory_order_release); + jq->submitted_jobs_count++; } /* Returns the number of jobs currently available for consumption in the given job queue. 
@@ -126,12 +289,366 @@ static void IOJobQueue_peek(const IOJobQueue *jq, job_handler *handler, void **d /* End of IO job queue functions */ -int inMainThread(void) { - return thread_id == 0; + +/* ********************************************************************************************************************* + * Deferred queues are used to manage command execution ordering in multi-threaded environments. + * They ensure proper synchronization between commands that access the same slot or require exclusive access. + * Commands are queued when there are conflicting operations in progress and executed when it's safe to do so. + * + * A deferred queue is composed of two lists: + * 1. A clients list - containing clients blocked waiting for command execution + * 2. A jobs list - containing general jobs that need to be performed on a given slot or the whole DB without client context + * + * The system maintains a deferred queue per slot and a general deferred queue (deferredCmdExclusive) for exclusive + * access to the whole database. 
+ **********************************************************************************************************************/ +typedef struct deferredQueue { + int refcount; + int cur_tid; + list *deferred_jobs; + list *pending_clients; +} deferredQueue; + +/* Global queues for deferred commands */ +deferredQueue deferredCmdExclusive = {0}; +deferredQueue slot_use_info[16384] = {0}; + +typedef struct delayedJob { + job_handler handler; + int slot; + char data[]; +} delayedJob; + +/* Global thread-local storage for delayed jobs */ +static __thread list *thread_delayed_jobs = NULL; + +/* + * executionContext + * -1: CTX_NONE (no execution context) + * -2: CTX_EXCLUSIVE (exclusive access to the whole database) + * >= 0: slot number (indicating we're operating on a specific slot) + */ +#define CTX_NONE -1 +#define CTX_EXCLUSIVE -2 + +/* Global context tracking for deferred queue operations - mainthread only */ +static int dq_context = CTX_NONE; + +static deferredQueue *getDeferredQueue(int slot) { + if (slot == -1) { + return &deferredCmdExclusive; + } else { + return &slot_use_info[slot]; + } +} + +static int isClientListEmpty(deferredQueue *queue) { + return queue->pending_clients == NULL || listLength(queue->pending_clients) == 0; +} + +static int isJobListEmpty(deferredQueue *queue) { + return queue->deferred_jobs == NULL || listLength(queue->deferred_jobs) == 0; +} + +static int isDqEmpty(deferredQueue *queue) { + return isJobListEmpty(queue) && isClientListEmpty(queue); +} + +/* Check if a deferred queue is available for immediate processing */ +static int dqAvailable(deferredQueue *queue) { + return queue->refcount == 0; +} + +/* Returns the thread id handling the given slot, -1 if no thread is handling it */ +static int getSlotTid(int slot) { + if (dqAvailable(getDeferredQueue(slot))) return -1; + return getDeferredQueue(slot)->cur_tid; +} + +static void setSlotTid(int slot, int tid) { + getDeferredQueue(slot)->cur_tid = tid; +} + +/* Increment the reference count 
of a deferred queue */ +static void dqIncr(deferredQueue *queue) { + queue->refcount++; +} + +/* Create a new job with the given handler and data */ +static listNode *createJobNode(int slot, job_handler handler, size_t data_size, void *data) { + /* Allocate memory for job structure plus data using flexible array member */ + listNode *node = zmalloc(sizeof(listNode) + sizeof(delayedJob) + data_size); + delayedJob *job = (delayedJob *)(node + 1); + job->slot = slot; + job->handler = handler; + if (data_size) { + memcpy(job->data, data, data_size); /* Copy data to the flexible array member */ + } + node->value = job; + + return node; +} + +/* Process a job immediately or add it to queue based on refcount */ +static void processOrAddJob(deferredQueue *q, listNode *jobNode) { + if (q->refcount == 0) { + delayedJob *job = listNodeValue(jobNode); + job->handler(job->data); + zfree(jobNode); + } else { + if (q->deferred_jobs == NULL) { + q->deferred_jobs = listCreate(); + } + listLinkNodeTail(q->deferred_jobs, jobNode); + } } -int getIOThreadID(void) { - return thread_id; +/* Returns whether the given command requires exclusive access to the whole database. */ +static int isDBExclusiveCmd(struct serverCommand *cmd, int slot) { + /* If no slot is specified but the client command changes the keyspace, we assume it is an exclusive command */ + if (slot == -1 && (cmd->flags & CMD_WRITE)) return 1; + /* The exec command can contain commands that may affect the whole database */ + if (cmd->proc == execCommand) return 1; + /* If no mandatory keys are specified, we can't determine which slot will be accessed */ + if (cmd->flags & CMD_NO_MANDATORY_KEYS) return 1; + /* Any Admin level command needs full exclusivity as it impacts system-wide behaviour */ + if (cmd->flags & CMD_ADMIN) return 1; + return 0; +} + +/* Returns if the given command requires exclusive access to the given slot. 
*/ +static int isSlotExclusiveCmd(struct serverCommand *cmd, int slot) { + /* No exclusivity required */ + if (cmd->flags & CMD_CAN_BE_OFFLOADED) return 0; + + /* Not slot exclusive rather DB exclusive */ + if (isDBExclusiveCmd(cmd, slot)) return 0; + + return 1; +} + +/* Process all jobs in a deferred jobs list and reset the list to empty */ +static void processDeferredJobsList(deferredQueue *queue) { + if (!queue->deferred_jobs) return; + + listIter li; + listNode *ln; + listRewind(queue->deferred_jobs, &li); + + while ((ln = listNext(&li))) { + delayedJob *job = listNodeValue(ln); + listUnlinkNode(queue->deferred_jobs, ln); + job->handler(job->data); + zfree(ln); + } + + if (queue != &deferredCmdExclusive) { + listRelease(queue->deferred_jobs); + queue->deferred_jobs = NULL; + } +} + +/* This function handles clients that were previously deferred for command processing. */ +static void dqProcessPendingClients(int slot) { + deferredQueue *queue = getDeferredQueue(slot); + /* Set the queue context based on queue type */ + dq_context = (queue == &deferredCmdExclusive) ? CTX_EXCLUSIVE : slot; + + /* Process only a fixed number of clients to avoid infinite iteration. + * Clients may be added back to the list during processing. */ + size_t len = listLength(queue->pending_clients); + listNode *ln; + + /* Process clients from the pending list. + * We need to check if the list still exists during each iteration + * because commands like CLIENT KILL may remove clients mid-iteration. 
*/ + while (len-- && queue->pending_clients && (ln = listFirst(queue->pending_clients))) { + client *c = listNodeValue(ln); + + /* Check if we need to wait due to exclusive commands */ + if (queue->refcount) { + if (dq_context == CTX_EXCLUSIVE) { + if (isDBExclusiveCmd(c->cmd, c->slot)) break; + } else { + if (isSlotExclusiveCmd(c->cmd, c->slot)) break; + } + } + + /* Remove client from pending list and mark as unblocked */ + listUnlinkNode(queue->pending_clients, ln); + c->bstate->slot_pending_list = NULL; + c->flag.blocked = 0; + server.stat_io_threaded_clients_blocked_on_slot--; + + if (processPendingCommandAndInputBuffer(c) != C_ERR) { + beforeNextClient(c); + } + } + + /* Clean up the client list if it's empty */ + if (queue != &deferredCmdExclusive && queue->pending_clients && listLength(queue->pending_clients) == 0) { + listRelease(queue->pending_clients); + queue->pending_clients = NULL; + } + + /* Reset the queue context */ + dq_context = CTX_NONE; +} + +/* Decrement the reference count and process pending jobs if it reaches zero */ +static void dqDecr(int slot) { + deferredQueue *queue = getDeferredQueue(slot); + serverAssert(queue->refcount > 0); + queue->refcount--; + if (queue->refcount != 0) return; + + /* Process any pending jobs when refcount reaches zero */ + if (queue->deferred_jobs && listLength(queue->deferred_jobs)) { + processDeferredJobsList(queue); + } + + /* Process any pending clients */ + if (queue->pending_clients && listLength(queue->pending_clients)) { + dqProcessPendingClients(slot); + } +} + +/* Add a client to the pending clients list of a deferred queue */ +static void dqAddPendingClient(deferredQueue *queue, client *c) { + /* Create the pending clients list if it doesn't exist */ + if (isClientListEmpty(queue)) { + queue->pending_clients = listCreate(); + } + + /* Add client to the list and mark the client as blocked */ + initClientBlockingState(c); + listLinkNodeTail(queue->pending_clients, &c->bstate->pending_client_node); + 
c->flag.pending_command = 1; + c->bstate->slot_pending_list = queue; + c->bstate->btype = BLOCKED_SLOT; + c->flag.blocked = 1; + server.stat_io_threaded_clients_blocked_on_slot++; + server.stat_io_threaded_clients_blocked_total++; +} + +/* Remove a client from the pending clients list of a deferred queue */ +static void dqRemoveClient(deferredQueue *queue, client *c) { + /* Remove client from the list if it exists */ + serverAssert(!isClientListEmpty(queue)); + listUnlinkNode(queue->pending_clients, &c->bstate->pending_client_node); + + /* Clean up empty client list */ + if (queue != &deferredCmdExclusive && listLength(queue->pending_clients) == 0) { + listRelease(queue->pending_clients); + queue->pending_clients = NULL; + } + + /* Reset client state */ + c->flag.pending_command = 0; + c->flag.blocked = 0; + c->bstate->slot_pending_list = NULL; + server.stat_io_threaded_clients_blocked_on_slot--; +} + +static void delayedServerCron(void *data) { + UNUSED(data); + long long interval = serverCron(server.el, 0, NULL); + aeCreateTimeEvent(server.el, interval, serverCron, NULL, NULL); +} + +/* Add a delayed job to the thread-local job list */ +void threadAddDelayedJob(int slot, job_handler handler, size_t data_size, void *data) { + /* Allocate memory for job structure plus data using flexible array member */ + listNode *job_node = createJobNode(slot, handler, data_size, data); + listLinkNodeTail(thread_delayed_jobs, job_node); +} + +int isServerCronDelayed(void) { + if (!server.cluster_enabled || server.io_threads_num == 1) { + return 0; + } + + if (dqAvailable(&deferredCmdExclusive)) return 0; + + listNode *job_node = createJobNode(-1, delayedServerCron, 0, NULL); + listLinkNodeTail(deferredCmdExclusive.deferred_jobs, job_node); + return 1; +} + +/* Dispatch delayed jobs based on their type */ +static void dispatchThreadDeferredJobs(list *jobs_list) { + listIter li; + listNode *ln; + listRewind(jobs_list, &li); + + while ((ln = listNext(&li))) { + delayedJob *job = 
listNodeValue(ln); + if (job->slot == -1) { + job->handler(job->data); + listDelNode(jobs_list, ln); + } else { + listUnlinkNode(jobs_list, ln); + processOrAddJob(getDeferredQueue(job->slot), ln); + } + server.stat_delayed_jobs_processed++; + } + + listRelease(jobs_list); +} + +/* This function checks various conditions to ensure thread safety when processing commands + * returns 1 if the command can be safely processed, 0 if not, in which case the command is queued to be processed later */ +int postponeClientCommand(client *c) { + if (!server.cluster_enabled || server.io_threads_num == 1) { + return 1; + } + + /* An exclusive command can be processed either when processing the exclusive deferred queue + * or in immediate mode if there are no read commands executed in queues*/ + if (isDBExclusiveCmd(c->cmd, c->slot)) { + if (dq_context == CTX_EXCLUSIVE) return 1; + + if (dqAvailable(&deferredCmdExclusive)) return 1; + + /* We can't execute the command */ + dqAddPendingClient(&deferredCmdExclusive, c); + return 0; + } + + if (c->slot == -1) return 1; /* Can process non exclusive command without slot */ + + if (dq_context == c->slot) return 1; /* Already in slot context */ + + /* Immediate non exclusive commands are queued whenever there are exclusive commands waiting */ + if (dq_context == CTX_NONE && !isDqEmpty(&deferredCmdExclusive)) { + dqAddPendingClient(&deferredCmdExclusive, c); + return 0; + } + + /* Queue client if there are pending commands for this slot */ + deferredQueue *q = getDeferredQueue(c->slot); + if (!isDqEmpty(q)) { + dqAddPendingClient(q, c); + return 0; + } + + /* Queue write commands if there are active reads */ + if (isSlotExclusiveCmd(c->cmd, c->slot) && !dqAvailable(q)) { + dqAddPendingClient(q, c); + return 0; + } + + return 1; /* No pending commands for the slot, can process immediately */ +} + +static void prefetchSlotPendingInfo(int slot) { + __builtin_prefetch(slot_use_info + slot); +} + +/* End of IO deferred queue functions */ +
+int inMainThread(void) { + return thread_id == 0; } /* Drains the I/O threads queue by waiting for all jobs to be processed. @@ -139,17 +656,25 @@ int getIOThreadID(void) { void drainIOThreadsQueue(void) { serverAssert(inMainThread()); for (int i = 1; i < IO_THREADS_MAX_NUM; i++) { /* No need to drain thread 0, which is the main thread. */ - while (!IOJobQueue_isEmpty(&io_jobs[i])) { + IOJobQueue *jq = &io_jobs[i]; + while (!IOJobQueue_isEmpty(jq) || jq->pending_epoll_job) { /* memory barrier acquire to get the latest job queue state */ atomic_thread_fence(memory_order_acquire); } } } +/* Returns if there is an IO operation in progress for the given client. */ +int clientIOInProgress(client *c) { + return c->io_read_state != CLIENT_IDLE || c->io_write_state != CLIENT_IDLE || c->io_command_state != CLIENT_IDLE; +} + /* Wait until the IO-thread is done with the client */ void waitForClientIO(client *c) { /* No need to wait if the client was not offloaded to the IO thread. */ - if (c->io_read_state == CLIENT_IDLE && c->io_write_state == CLIENT_IDLE) return; + if (c->io_read_state == CLIENT_IDLE && c->io_write_state == CLIENT_IDLE && c->io_command_state == CLIENT_IDLE) { + return; + } /* Wait for read operation to complete if pending. */ while (c->io_read_state == CLIENT_PENDING_IO) { @@ -161,18 +686,37 @@ void waitForClientIO(client *c) { atomic_thread_fence(memory_order_acquire); } + /* Wait for command operation to complete if pending. */ + while (c->io_command_state == CLIENT_PENDING_IO) { + atomic_thread_fence(memory_order_acquire); + } + /* Final memory barrier to ensure all changes are visible */ atomic_thread_fence(memory_order_acquire); } +static int getPendingIOThreadsJobs(void) { + return server.stat_io_writes_pending + server.stat_io_reads_pending + server.stat_io_commands_pending; +} + /** Adjusts the number of active I/O threads based on the current event load. 
* If increase_only is non-zero, only allows increasing the number of threads.*/ void adjustIOThreadsByEventLoad(int numevents, int increase_only) { if (server.io_threads_num == 1) return; /* All I/O is being done by the main thread. */ debugServerAssertWithInfo(NULL, NULL, server.io_threads_num > 1); - /* When events_per_io_thread is set to 0, we offload all events to the IO threads. - * This is used mainly for testing purposes. */ - int target_threads = server.events_per_io_thread == 0 ? (numevents + 1) : numevents / server.events_per_io_thread; + + int target_threads = 0; + if (server.events_per_io_thread == 0) { + /* When events_per_io_thread is set to 0, we offload all events to the IO threads. + * This is used mainly for testing purposes. */ + if (getPendingIOThreadsJobs() > 0) { + target_threads = server.io_threads_num; + } else { + target_threads = numevents + 1; + } + } else { + target_threads = numevents / server.events_per_io_thread; + } target_threads = max(1, min(target_threads, server.io_threads_num)); @@ -186,7 +730,7 @@ void adjustIOThreadsByEventLoad(int numevents, int increase_only) { int tid = server.active_io_threads_num - 1; IOJobQueue *jq = &io_jobs[tid]; /* We can't lock the thread if it may have pending jobs */ - if (!IOJobQueue_isEmpty(jq)) return; + if (!IOJobQueue_isEmpty(jq) || jq->pending_epoll_job) return; pthread_mutex_lock(&io_threads_mutex[tid]); server.active_io_threads_num--; } @@ -201,13 +745,38 @@ void adjustIOThreadsByEventLoad(int numevents, int increase_only) { /* This function performs polling on the given event loop and updates the server's * IO fired events count and poll state. 
*/ -void IOThreadPoll(void *data) { - aeEventLoop *el = (aeEventLoop *)data; +static void IOThreadPoll(IOJobQueue *jq) { + atomic_thread_fence(memory_order_acquire); /* Acquire the updated epoll struct */ + serverAssert(server.io_poll_state == AE_IO_STATE_POLL); + struct timeval tvp = {0, 0}; - int num_events = aePoll(el, &tvp); + int num_events = aePoll(server.el, &tvp); server.io_ae_fired_events = num_events; - atomic_store_explicit(&server.io_poll_state, AE_IO_STATE_DONE, memory_order_release); + jq->pending_epoll_job = false; + atomic_store_explicit(&server.io_poll_state, AE_IO_STATE_DONE, memory_order_relaxed); + atomic_thread_fence(memory_order_release); +} + +/* Define a cleanup function that will clean all thread resources */ +void cleanupThreadResources(void *dummy) { + UNUSED(dummy); + + handleThreadPendingResponses(); + + /* Free the shared query buffer */ + freeSharedQueryBuf(); + + /* Free the delayed jobs list if it exists */ + if (thread_delayed_jobs) { + listRelease(thread_delayed_jobs); + thread_delayed_jobs = NULL; + } + + /* Clean any other thread-specific resources here */ + /* Reset thread state variables */ + setCurrentClient(NULL); + setExecutingClient(NULL); } static void *IOThreadMain(void *myid) { @@ -219,7 +788,10 @@ static void *IOThreadMain(void *myid) { valkey_set_thread_title(thdname); serverSetCpuAffinity(server.server_cpulist); initSharedQueryBuf(); - pthread_cleanup_push(freeSharedQueryBuf, NULL); + setCurrentClient(NULL); + setExecutingClient(NULL); + thread_delayed_jobs = listCreate(); + pthread_cleanup_push(cleanupThreadResources, NULL); thread_id = (int)id; size_t jobs_to_process = 0; @@ -228,14 +800,21 @@ static void *IOThreadMain(void *myid) { /* Cancellation point so that pthread_cancel() from main thread is honored. 
*/ pthread_testcancel(); + /* Handle unposted responses, if any exist */ + handleThreadPendingResponses(); + /* Wait for jobs */ for (int j = 0; j < 1000000; j++) { jobs_to_process = IOJobQueue_availableJobs(jq); - if (jobs_to_process) break; + if (jobs_to_process || jq->pending_epoll_job) break; + } + + if (jq->pending_epoll_job) { + IOThreadPoll(jq); } /* Give the main thread a chance to stop this thread. */ - if (jobs_to_process == 0) { + if (jobs_to_process == 0 && listLength(&thread_pending_responses) == 0) { pthread_mutex_lock(&io_threads_mutex[id]); pthread_mutex_unlock(&io_threads_mutex[id]); continue; @@ -250,7 +829,11 @@ static void *IOThreadMain(void *myid) { handler(data); /* Remove the job after it was processed */ IOJobQueue_removeJob(jq); + if (jq->pending_epoll_job) { + IOThreadPoll(jq); + } } + /* Memory barrier to make sure the main thread sees the updated tail index. * We do it once per loop and not per tail-update for optimization reasons. * As the main-thread main concern is to check if the queue is empty, it's enough to do it once at the end. */ @@ -353,6 +936,11 @@ void initIOThreads(void) { serverAssert(server.io_threads_num <= IO_THREADS_MAX_NUM); prefetchCommandsBatchInit(); + size_t io_to_mt_queue_size = (server.io_threads_num - 1) * DEFAULT_MPSC_QUEUE_SIZE_PER_THREAD; + io_to_mt_queue = IoToMTQueueCreate(io_to_mt_queue_size); + thread_delayed_jobs = listCreate(); + deferredCmdExclusive.pending_clients = listCreate(); + deferredCmdExclusive.deferred_jobs = listCreate(); /* Spawn and initialize the I/O threads. */ for (int i = 1; i < server.io_threads_num; i++) { @@ -360,10 +948,98 @@ void initIOThreads(void) { } } +/* + * This function is called when a client is closed but still has pending IO jobs. + * It removes the client from the slot pending list it is queued on, if any.
+ */ +void ioThreadsOnUnlinkClient(client *c) { + if (c->bstate && c->bstate->slot_pending_list) { + dqRemoveClient(c->bstate->slot_pending_list, c); + c->bstate->slot_pending_list = NULL; + } +} + +/* Returns C_OK if the command is postponed due to a busy slot, C_ERR otherwise. */ +static int isCommandPostpone(client *c) { + deferredQueue *q = getDeferredQueue(c->slot); + if (!dqAvailable(q)) { + dqAddPendingClient(q, c); + return C_OK; /* Postpone the command execution */ + } + return C_ERR; +} + +int trySendProcessCommandToIOThreads(client *c) { + if (server.active_io_threads_num == 1) { + return C_ERR; /* No IO threads to offload to. */ + } + + if (!server.io_threads_do_commands_offloading) { + return C_ERR; /* Command offloading is disabled. */ + } + + /* Check if modules are loaded and module offloading is disabled */ + if (moduleCount() > 0 && !server.io_threads_do_commands_offloading_with_modules) { + return C_ERR; /* Modules are loaded and module command offloading is disabled. */ + } + + if (!(c->cmd->flags & CMD_CAN_BE_OFFLOADED)) { + return C_ERR; + } + + if (!server.cluster_enabled) { + return C_ERR; /* Avoid offloading commands in non cluster mode. */ + } + + if (server.notify_keyspace_events & NOTIFY_KEY_MISS) { + return C_ERR; /* Avoid offloading commands when NOTIFY_KEY_MISS is enabled.
*/ + } + + if (c->io_read_state != CLIENT_IDLE || c->io_command_state != CLIENT_IDLE || c->io_write_state != CLIENT_IDLE) { + /* isCommandPostpone returns C_OK if the client should be postponed and will be offloaded later */ + return isCommandPostpone(c); + } + + /* Do not offload if the client uses pipeline commands */ + if (c->querybuf != NULL && sdslen(c->querybuf) > c->qb_pos) { + return isCommandPostpone(c); + } + + /* Do not offload if it is possible the main-thread will write at the same time to the client's COB */ + if (getClientType(c) != CLIENT_TYPE_NORMAL) { + return isCommandPostpone(c); + } + + serverAssert(c->slot != -1); + + /* Find the IO thread that is responsible for the slot. */ + int tid = getSlotTid(c->slot); + if (tid == -1 || tid >= server.active_io_threads_num) { + tid = (c->slot % (server.active_io_threads_num - 1)) + 1; + setSlotTid(c->slot, tid); + } + + IOJobQueue *jq = &io_jobs[tid]; + if (IOJobQueue_isFull(jq)) return isCommandPostpone(c); + + c->io_command_state = CLIENT_PENDING_IO; + c->io_write_state = CLIENT_PENDING_IO; /* The thread may write the command's result */ + dqIncr(getDeferredQueue(c->slot)); + dqIncr(&deferredCmdExclusive); + /* Setting current client to NULL to avoid accessing it after it was sent to IO */ + setCurrentClient(NULL); + setExecutingClient(NULL); + IOJobQueue_push(&io_jobs[tid], ioThreadProcessCommand, c); + + server.stat_io_commands_pending++; + return C_OK; +} + int trySendReadToIOThreads(client *c) { if (server.active_io_threads_num <= 1) return C_ERR; /* If IO thread is already reading, return C_OK to make sure the main thread will not handle it. 
*/ if (c->io_read_state != CLIENT_IDLE) return C_OK; + if (c->io_write_state != CLIENT_IDLE) return C_OK; /* For simplicity, don't offload replica clients reads as read traffic from replica is negligible */ if (getClientType(c) == CLIENT_TYPE_REPLICA) return C_ERR; /* With Lua debug client we may call connWrite directly in the main thread */ @@ -373,15 +1049,6 @@ int trySendReadToIOThreads(client *c) { if (c->flag.close_asap) return C_ERR; size_t tid = (c->id % (server.active_io_threads_num - 1)) + 1; - /* Handle case where client has a pending IO write job on a different thread: - * 1. A write job is still pending (io_write_state == CLIENT_PENDING_IO) - * 2. The pending job is on a different thread (c->cur_tid != tid) - * - * This situation can occur if active_io_threads_num increased since the - * original job assignment. In this case, we keep the job on its current - * thread to ensure the same thread handles the client's I/O operations. */ - if (c->io_write_state == CLIENT_PENDING_IO && c->cur_tid != (uint8_t)tid) tid = c->cur_tid; - IOJobQueue *jq = &io_jobs[tid]; if (IOJobQueue_isFull(jq)) return C_ERR; @@ -393,8 +1060,8 @@ int trySendReadToIOThreads(client *c) { c->io_read_state = CLIENT_PENDING_IO; connSetPostponeUpdateState(c->conn, 1); IOJobQueue_push(jq, ioThreadReadQueryFromClient, c); + server.stat_io_reads_pending++; c->flag.pending_read = 1; - listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); return C_OK; } @@ -427,13 +1094,9 @@ int trySendWriteToIOThreads(client *c) { c->cur_tid = tid; if (c->flag.pending_write) { - /* We move the client to the io pending write queue */ listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); - } else { - c->flag.pending_write = 1; + c->flag.pending_write = 0; } - serverAssert(c->clients_pending_write_node.prev == NULL && c->clients_pending_write_node.next == NULL); - listLinkNodeTail(server.clients_pending_io_write, &c->clients_pending_write_node); int is_replica = 
getClientType(c) == CLIENT_TYPE_REPLICA; if (is_replica) { @@ -466,11 +1129,12 @@ int trySendWriteToIOThreads(client *c) { c->io_write_state = CLIENT_PENDING_IO; IOJobQueue_push(jq, ioThreadWriteToClient, c); + server.stat_io_writes_pending++; return C_OK; } /* Internal function to free the client's argv in an IO thread. */ -void IOThreadFreeArgv(void *data) { +static void IOThreadFreeArgv(void *data) { robj **argv = (robj **)data; int last_arg = 0; for (int i = 0;; i++) { @@ -536,6 +1200,8 @@ int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv) { /* Must succeed as we checked the free space before. */ IOJobQueue_push(jq, IOThreadFreeArgv, argv); + c->argv = NULL; + c->argc = 0; return C_OK; } @@ -573,8 +1239,7 @@ int tryOffloadFreeObjToIOThreads(robj *obj) { /* This function retrieves the results of the IO Thread poll. * returns the number of fired events if the IO thread has finished processing poll events, 0 otherwise. */ static int getIOThreadPollResults(aeEventLoop *eventLoop) { - int io_state; - io_state = atomic_load_explicit(&server.io_poll_state, memory_order_acquire); + int io_state = atomic_load_explicit(&server.io_poll_state, memory_order_acquire); if (io_state == AE_IO_STATE_POLL) { /* IO thread is still processing poll events. */ return 0; @@ -597,7 +1262,7 @@ void trySendPollJobToIOThreads(void) { } /* If there are no pending jobs, let the main thread do the poll-wait by itself. */ - if (listLength(server.clients_pending_io_write) + listLength(server.clients_pending_io_read) == 0) { + if (getPendingIOThreadsJobs() == 0) { return; } @@ -606,17 +1271,13 @@ void trySendPollJobToIOThreads(void) { return; } - /* The poll is sent to the last thread. While a random thread could have been selected, - * the last thread has a slightly better chance of being less loaded compared to other threads, - * As we activate the lowest threads first. 
*/ - int tid = server.active_io_threads_num - 1; - IOJobQueue *jq = &io_jobs[tid]; - if (IOJobQueue_isFull(jq)) return; /* The main thread will handle the poll itself. */ - - server.io_poll_state = AE_IO_STATE_POLL; + cur_epoll_thread = ((cur_epoll_thread) % (server.active_io_threads_num - 1)) + 1; + IOJobQueue *jq = &io_jobs[cur_epoll_thread]; aeSetCustomPollProc(server.el, getIOThreadPollResults); aeSetPollProtect(server.el, 1); - IOJobQueue_push(jq, IOThreadPoll, server.el); + atomic_store_explicit(&server.io_poll_state, AE_IO_STATE_POLL, memory_order_relaxed); + atomic_thread_fence(memory_order_release); + jq->pending_epoll_job = true; } static void ioThreadAccept(void *data) { @@ -624,6 +1285,7 @@ static void ioThreadAccept(void *data) { connAccept(c->conn, NULL); atomic_thread_fence(memory_order_release); c->io_read_state = CLIENT_COMPLETED_IO; + threadRespond(c, R_READ); } /* @@ -664,10 +1326,237 @@ int trySendAcceptToIOThreads(connection *conn) { c->io_read_state = CLIENT_PENDING_IO; c->flag.pending_read = 1; - listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); connSetPostponeUpdateState(c->conn, 1); + server.stat_io_reads_pending++; server.stat_io_accept_offloaded++; IOJobQueue_push(job_queue, ioThreadAccept, c); return C_OK; } + +#define JOB_BATCH_SIZE (16) +#define JOB_TYPE_MASK (7) /* Lower 3 bits for job type */ +#define CLIENT_PTR_MASK (~0x7ULL) /* Upper bits for client pointer */ + +static inline jobResponseType getJobResponseType(uint64_t jobData) { + jobResponseType type = (jobResponseType)(jobData & JOB_TYPE_MASK); + if (type >= R_LAST) { + serverPanic("Invalid job type: %d", type); + } + return type; +} + +static inline void *getJobData(uint64_t jobData) { + return (void *)(jobData & CLIENT_PTR_MASK); +} + +/* Function to handle read jobs */ +static void handleReadJobs(client **read_jobs, int read_count) { + server.stat_io_reads_pending -= read_count; + serverAssert(server.stat_io_reads_pending >= 0); + + /* First pass: 
prefetch cluster slots for all clients */ + for (int i = 0; i < read_count; i++) { + client *c = read_jobs[i]; + __builtin_prefetch(&(server.cluster->slots[c->slot])); + __builtin_prefetch(&(server.cluster->migrating_slots_to[c->slot])); + __builtin_prefetch(&(server.cluster->importing_slots_from[c->slot])); + prefetchSlotPendingInfo(c->slot); + } + + /* Second pass: process each client */ + for (int i = 0; i < read_count; i++) { + client *c = read_jobs[i]; + processClientIOReadsDone(c); + server.stat_io_reads_processed++; + } + + /* Process commands in batch if we processed any reads */ + processClientsCommandsBatch(); +} + +/* Function to handle write jobs */ +static void handleWriteJobs(client **write_jobs, int write_count) { + server.stat_io_writes_pending -= write_count; + serverAssert(server.stat_io_writes_pending >= 0); + + for (int i = 0; i < write_count; i++) { + client *c = write_jobs[i]; + server.stat_io_writes_processed++; + processClientIOWriteDone(c, 1); + } +} + +static void threadRespondJobList(void) { + if (listLength(thread_delayed_jobs) == 0) return; + + IoToMTQueueProduce((uint64_t)thread_delayed_jobs | (uint64_t)R_JOBLIST, 0); + thread_delayed_jobs = listCreate(); +} + +void threadRespond(client *c, jobResponseType r) { + if (r == R_COMMAND) { + /* Make sure to send first the deferred jobs list */ + threadRespondJobList(); + } + + IoToMTQueueProduce((uint64_t)c | (uint64_t)r, 0); +} + +static void processClientIOCommandDone(client *c) { + serverAssert(c->io_command_state == CLIENT_COMPLETED_IO); + c->io_command_state = CLIENT_IDLE; + + if (c->flag.close_after_command) { + c->flag.close_after_command = 0; + c->flag.close_after_reply = 1; + } + + struct serverCommand *real_cmd = c->realcmd; + + /* Command stats */ + real_cmd->calls++; + real_cmd->microseconds += c->duration; + c->commands_processed++; + server.stat_numcommands++; + + /* Latency stats */ + char *latency_event = (real_cmd->flags & CMD_FAST) ? 
"fast-command" : "command"; + latencyAddSampleIfNeeded(latency_event, c->duration / 1000); + if (server.latency_tracking_enabled) + updateCommandLatencyHistogram(&(real_cmd->latency_histogram), c->duration * 1000); + + /* Command log */ + commandlogPushCurrentCommand(c, real_cmd); + + /* Monitor */ + if (!(c->cmd->flags & (CMD_SKIP_MONITOR | CMD_ADMIN))) { + robj **argv = c->original_argv ? c->original_argv : c->argv; + int argc = c->original_argv ? c->original_argc : c->argc; + replicationFeedMonitors(c, server.monitors, c->db->id, argv, argc); + } + + /* Cluster stats */ + clusterSlotStatsAddCpuDuration(c, c->duration); + clusterSlotStatsAddNetworkBytesOutForUserClient(c); + + /* Tracking */ + if (c->flag.tracking && !c->flag.tracking_bcast) { + trackingRememberKeys(c, c); + } + + c->duration = 0; + + processClientIOWriteDone(c, 1); /* The Worker thread does 2 things: 1. process the command , 2. Writes the results. */ + if (c->flag.close_asap) { + return; + } + + commandProcessed(c); + + /* Update the client's memory to include output buffer growth following the + * processed command. 
*/ + if (c->conn) updateClientMemUsageAndBucket(c); + + if (clientHasPendingReplies(c) && trySendWriteToIOThreads(c) == C_ERR) { + putClientInPendingWriteQueue(c); + } + + beforeNextClient(c); +} + +/* Function to handle command jobs */ +static void handleCommandJobs(client **command_jobs, int command_count) { + server.stat_io_commands_pending -= command_count; + + /* First pass: prefetch data for all command jobs */ + for (int i = 0; i < command_count; i++) { + client *c = command_jobs[i]; + + for (int j = 0; j < c->argc; j++) { + __builtin_prefetch(c->argv[j]); + } + prefetchSlotPendingInfo(c->slot); + } + + /* Second pass: process each command */ + for (int i = 0; i < command_count; i++) { + client *c = command_jobs[i]; + int slot = c->slot; + processClientIOCommandDone(c); + dqDecr(slot); + dqDecr(-1); + server.stat_io_commands_processed++; + } +} + +int processIOThreadsResponses(void) { + if (io_to_mt_queue == NULL) return 0; + + /* Quick check if any pending operations exist */ + if (getPendingIOThreadsJobs() == 0) return 0; + + int total_processed = 0; + uint64_t jobs[JOB_BATCH_SIZE]; + client *read_jobs[JOB_BATCH_SIZE]; + client *write_jobs[JOB_BATCH_SIZE]; + client *command_jobs[JOB_BATCH_SIZE]; + + /* Loop until we consume all pending jobs */ + while (1) { + int received_responses = 0; + int dequeued_count = 0; + int read_count = 0; + int write_count = 0; + int command_count = 0; + + /* Try to dequeue JOB_BATCH_SIZE */ + while (received_responses < JOB_BATCH_SIZE) { + dequeued_count = IoToMTQueueConsumeBatch(JOB_BATCH_SIZE - received_responses, jobs); + + /* Stop if we can't get more jobs from the queue. 
*/ + if (dequeued_count == 0) break; + + received_responses += dequeued_count; + total_processed += dequeued_count; + + /* Prefetch the jobs data */ + for (int i = 0; i < dequeued_count; i++) { + if (getJobResponseType(jobs[i]) == R_JOBLIST) continue; + client *c = getJobData(jobs[i]); + /* Always prefetch the client pointer */ + __builtin_prefetch(c); + __builtin_prefetch(&c->slot); + } + + for (int i = 0; i < dequeued_count; i++) { + jobResponseType job_type = getJobResponseType(jobs[i]); + if (job_type == R_JOBLIST) { + dispatchThreadDeferredJobs((list *)getJobData(jobs[i])); + continue; + } + client *c = getJobData(jobs[i]); + if (job_type == R_READ) { + serverAssert(c->io_read_state == CLIENT_COMPLETED_IO); + read_jobs[read_count++] = c; + } else if (job_type == R_WRITE) { + serverAssert(c->io_write_state == CLIENT_COMPLETED_IO); + write_jobs[write_count++] = c; + } else if (job_type == R_COMMAND) { + serverAssert(c->io_write_state == CLIENT_COMPLETED_IO); + serverAssert(c->io_command_state == CLIENT_COMPLETED_IO); + command_jobs[command_count++] = c; + } else { + serverPanic("Unknown job type %d", job_type); + } + } + } + + if (read_count) handleReadJobs(read_jobs, read_count); + if (write_count) handleWriteJobs(write_jobs, write_count); + if (command_count) handleCommandJobs(command_jobs, command_count); + + /* If the queue was empty at the last try - don't try again */ + if (dequeued_count == 0) return total_processed; + } +} diff --git a/src/io_threads.h b/src/io_threads.h index 992cb66c43..e6b7288466 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -1,19 +1,47 @@ #ifndef IO_THREADS_H #define IO_THREADS_H -#include "server.h" +#include <stddef.h> /* For size_t */ + +struct client; +struct connection; +struct serverObject; + +typedef enum { + R_READ = 0, + R_COMMAND = 1, + R_WRITE = 2, + R_JOBLIST = 3, + R_LAST = 4, +} jobResponseType; + +typedef void (*job_handler)(void *); void initIOThreads(void); void killIOThreads(void); int inMainThread(void); -int
trySendReadToIOThreads(client *c); -int trySendWriteToIOThreads(client *c); -int tryOffloadFreeObjToIOThreads(robj *o); -int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); +int trySendReadToIOThreads(struct client *c); +int trySendWriteToIOThreads(struct client *c); +int tryOffloadFreeObjToIOThreads(struct serverObject *o); +int tryOffloadFreeArgvToIOThreads(struct client *c, int argc, struct serverObject **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); -int trySendAcceptToIOThreads(connection *conn); +int trySendAcceptToIOThreads(struct connection *conn); +int trySendProcessCommandToIOThreads(struct client *c); +int processIOThreadsResponses(void); +void threadAddDelayedJob(int slot, job_handler handler, size_t len, void *data); +void threadRespond(struct client *c, jobResponseType r); +int clientIOInProgress(struct client *c); +int postponeClientCommand(struct client *c); +int isServerCronDelayed(void); +void ioThreadsOnUnlinkClient(struct client *c); +void pollIOThreadStats(void); +int isCommandOffloadingRunning(void); +int isCommandOffloadingPaused(void); +void updateLatencyStatsForIOThreads(struct client *c, unsigned long long duration); +void ioThreadUpdateCmdDuration(unsigned long long duration); +int trySendAcceptToIOThreads(struct connection *conn); int updateIOThreads(const char **err); #endif /* IO_THREADS_H */ diff --git a/src/kvstore.c b/src/kvstore.c index 486e434e1a..4f1e33cced 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -46,6 +46,7 @@ #include "kvstore.h" #include "serverassert.h" #include "monotonic.h" +#include "io_threads.h" #define UNUSED(V) ((void)V) @@ -65,7 +66,7 @@ struct _kvstore { unsigned long long bucket_count; /* Total number of buckets in this kvstore across hash tables. 
*/ unsigned long long *hashtable_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until * given hashtable-index. */ - size_t overhead_hashtable_lut; /* Overhead of all hashtables in bytes. */ + _Atomic size_t overhead_hashtable_lut; /* Overhead of all hashtables in bytes. Atomic as it may be updated by the IO threads. */ size_t overhead_hashtable_rehashing; /* Overhead of hash tables rehashing in bytes. */ }; @@ -221,22 +222,38 @@ void kvstoreHashtableRehashingStarted(hashtable *ht) { kvs->overhead_hashtable_rehashing += from * HASHTABLE_BUCKET_SIZE; } +typedef struct { + size_t from; + kvstore *kvs; + listNode *rehashing_node; +} rehashing_completion_ctx; + +void kvstoreHashtableUpdateRehashingInfo(void *data) { + rehashing_completion_ctx *ctx = (rehashing_completion_ctx *)data; + if (ctx->rehashing_node) { + listDelNode(ctx->kvs->rehashing, ctx->rehashing_node); + } + ctx->kvs->bucket_count -= ctx->from; /* Finished rehashing (Remove the old ht size) */ + ctx->kvs->overhead_hashtable_rehashing -= ctx->from * HASHTABLE_BUCKET_SIZE; +} + /* Remove hash table from the rehashing list. * * Updates the bucket count for the given hash table in a DB. It removes * the old ht size of the hash table from the total sum of buckets for a DB.
*/ void kvstoreHashtableRehashingCompleted(hashtable *ht) { kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); - kvstore *kvs = metadata->kvs; - if (metadata->rehashing_node) { - listDelNode(kvs->rehashing, metadata->rehashing_node); - metadata->rehashing_node = NULL; - } - size_t from, to; hashtableRehashingInfo(ht, &from, &to); - kvs->bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ - kvs->overhead_hashtable_rehashing -= from * HASHTABLE_BUCKET_SIZE; + rehashing_completion_ctx ctx = {.rehashing_node = metadata->rehashing_node, .kvs = metadata->kvs, .from = from}; + metadata->rehashing_node = NULL; + + /* If not in the main thread, postpone the update of the kvs rehashing info to be done later by the main thread. */ + if (!inMainThread()) { + threadAddDelayedJob(-1, kvstoreHashtableUpdateRehashingInfo, sizeof(ctx), &ctx); + } else { + kvstoreHashtableUpdateRehashingInfo(&ctx); + } } /* Hashtable callback to keep track of memory usage. */ diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index 2fd70ac429..32ea8805c0 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -240,6 +240,19 @@ void processClientsCommandsBatch(void) { } } +/* Check if the command is about to be offloaded to IO threads */ +static int isCommandBeingOffloaded(client *c) { + if (!server.io_threads_do_commands_offloading) { + return 0; + } + + if (!server.cluster_enabled) { + return 0; + } + + return (c->parsed_cmd->flags & CMD_CAN_BE_OFFLOADED) && (c->querybuf == NULL); +} + /* Adds the client's command to the current batch and processes the batch * if it becomes full.
* @@ -250,7 +263,7 @@ int addCommandToBatchAndProcessIfFull(client *c) { batch->clients[batch->client_count++] = c; /* Get command's keys positions */ - if (c->parsed_cmd && !(c->read_flags & READ_FLAGS_BAD_ARITY)) { + if (c->parsed_cmd && !(c->read_flags & READ_FLAGS_BAD_ARITY) && !isCommandBeingOffloaded(c)) { getKeysResult result; initGetKeysResult(&result); int num_keys = getKeysFromCommand(c->parsed_cmd, c->argv, c->argc, &result); @@ -284,3 +297,43 @@ void removeClientFromPendingCommandsBatch(client *c) { } } } + +/* Prefetch memory for upcoming file events to improve cache performance */ +void prefetchEvents(aeEventLoop *eventLoop, int cur_idx, int numevents) { + const int BATCH_SIZE = 16; + + /* Only prefetch at batch boundaries (cur_idx = 0, 16, 32, ...) each time BATCH_SIZE events at once */ + if (cur_idx % BATCH_SIZE) return; + + aeFileEvent *fes[BATCH_SIZE]; + int fes_idx = 0; + int batch_size = min(numevents - cur_idx, BATCH_SIZE); + int start = cur_idx; + int end = start + batch_size; + + /* Phase 1: Prefetch aeFileEvent structures for events that need prefetching */ + for (int i = start; i < end; i++) { + int mask = eventLoop->fired[i].mask; + if (mask & AE_PREFETCH) { + fes[fes_idx] = &eventLoop->events[eventLoop->fired[i].fd]; + valkey_prefetch(fes[fes_idx]); + } else { + /* Mark as NULL so we skip this event in subsequent phases */ + fes[fes_idx] = NULL; + } + fes_idx++; + } + + /* Phase 2: Prefetch connection objects (clientData from aeFileEvent) */ + for (int i = 0; i < batch_size; i++) { + if (fes[i] == NULL) continue; + valkey_prefetch(fes[i]->clientData); + } + + /* Phase 3: Prefetch private data (client* struct) within each connection */ + for (int i = 0; i < batch_size; i++) { + if (fes[i] == NULL) continue; + connection *conn = fes[i]->clientData; + valkey_prefetch(connGetPrivateData(conn)); + } +} diff --git a/src/memory_prefetch.h b/src/memory_prefetch.h index 5a181cc58d..a733136aff 100644 --- a/src/memory_prefetch.h +++ 
b/src/memory_prefetch.h @@ -2,10 +2,12 @@ #define MEMORY_PREFETCH_H struct client; +struct aeEventLoop; void prefetchCommandsBatchInit(void); void processClientsCommandsBatch(void); int addCommandToBatchAndProcessIfFull(struct client *c); void removeClientFromPendingCommandsBatch(struct client *c); +void prefetchEvents(struct aeEventLoop *eventLoop, int cur_idx, int numevents); #endif /* MEMORY_PREFETCH_H */ diff --git a/src/module.c b/src/module.c index 3569ae8214..d8c1d7df83 100644 --- a/src/module.c +++ b/src/module.c @@ -864,7 +864,7 @@ void modulePostExecutionUnitOperations(void) { if (server.busy_module_yield_flags) { blockingOperationEnds(); server.busy_module_yield_flags = BUSY_MODULE_YIELD_NONE; - if (server.current_client) unprotectClient(server.current_client); + if (getCurrentClient()) unprotectClient(getCurrentClient()); unblockPostponedClients(); } } @@ -2509,7 +2509,7 @@ void VM_Yield(ValkeyModuleCtx *ctx, int flags, const char *busy_reply) { if (!server.busy_module_yield_flags) { server.busy_module_yield_flags = BUSY_MODULE_YIELD_EVENTS; blockingOperationStarts(); - if (server.current_client) protectClient(server.current_client); + if (getCurrentClient()) protectClient(getCurrentClient()); } if (flags & VALKEYMODULE_YIELD_FLAG_CLIENTS) server.busy_module_yield_flags |= BUSY_MODULE_YIELD_CLIENTS; @@ -6485,7 +6485,7 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const } int deny_write_type = writeCommandsDeniedByDiskError(); - int obey_client = (server.current_client && mustObeyClient(server.current_client)); + int obey_client = (getCurrentClient() && mustObeyClient(getCurrentClient())); if (deny_write_type != DISK_ERROR_TYPE_NONE && !obey_client) { errno = ESPIPE; @@ -8949,11 +8949,11 @@ void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid) if ((sub->event_mask & type) && (sub->active == 0 || (sub->module->options & VALKEYMODULE_OPTIONS_ALLOW_NESTED_KEYSPACE_NOTIFICATIONS))) { ValkeyModuleCtx 
ctx; - if (server.executing_client == NULL) { + if (getExecutingClient() == NULL) { moduleCreateContext(&ctx, sub->module, VALKEYMODULE_CTX_TEMP_CLIENT); } else { moduleCreateContext(&ctx, sub->module, VALKEYMODULE_CTX_NONE); - ctx.client = server.executing_client; + ctx.client = getExecutingClient(); } selectDb(ctx.client, dbid); ctx.flags |= VALKEYMODULE_CTX_KEYSPACE_NOTIFICATION; @@ -9712,7 +9712,7 @@ void revokeClientAuthentication(client *c) { clientSetUser(c, DefaultUser, 0); /* We will write replies to this client later, so we can't close it * directly even if async. */ - if (c == server.current_client) { + if (isCurrentClient(c)) { c->flag.close_after_command = 1; } else { freeClientAsync(c); @@ -13214,12 +13214,12 @@ int VM_RdbLoad(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) { * VM_RdbLoad() is called inside a command callback, we don't want to * process the current client. Otherwise, we may free the client or try to * process next message while we are already in the command callback. */ - if (server.current_client) protectClient(server.current_client); + if (getCurrentClient()) protectClient(getCurrentClient()); serverAssert(stream->type == VALKEYMODULE_RDB_STREAM_FILE); int ret = rdbLoad(stream->data.filename, NULL, RDBFLAGS_NONE); - if (server.current_client) unprotectClient(server.current_client); + if (getCurrentClient()) unprotectClient(getCurrentClient()); /* Here we need to decide whether to enable the AOF based on the aof_enabled, * since the previous stopAppendOnly sets aof_state to AOF_OFF. 
*/ diff --git a/src/networking.c b/src/networking.c index 2cc90c8710..82930dcbc6 100644 --- a/src/networking.c +++ b/src/networking.c @@ -222,6 +222,8 @@ static inline int isReplicaReadyForReplData(client *replica) { static int isCopyAvoidPreferred(client *c, robj *obj) { if (c->flag.fake || isDeferredReplyEnabled(c)) return 0; + if (!inMainThread()) return 0; + int type = getClientType(c); if (type != CLIENT_TYPE_NORMAL && type != CLIENT_TYPE_PUBSUB) return 0; @@ -316,11 +318,11 @@ client *createClient(connection *conn) { c->client_list_node = NULL; c->io_read_state = CLIENT_IDLE; c->io_write_state = CLIENT_IDLE; + c->io_command_state = CLIENT_IDLE; c->nwritten = 0; c->last_memory_usage = 0; c->last_memory_type = CLIENT_TYPE_NORMAL; listInitNode(&c->clients_pending_write_node, c); - listInitNode(&c->pending_read_list_node, c); c->mem_usage_bucket = NULL; c->mem_usage_bucket_node = NULL; if (conn) linkClient(c); @@ -373,8 +375,10 @@ void putClientInPendingWriteQueue(client *c) { * loop, we can try to directly write to the client sockets avoiding * a system call. We'll only really install the write handler if * we'll not be able to write the whole reply at once. */ - c->flag.pending_write = 1; - listLinkNodeHead(server.clients_pending_write, &c->clients_pending_write_node); + if (inMainThread()) { + c->flag.pending_write = 1; + listLinkNodeHead(server.clients_pending_write, &c->clients_pending_write_node); + } } } /* This function is called every time we are going to transmit new data @@ -663,8 +667,8 @@ void _addReplyToBufferOrList(client *c, const char *s, size_t len) { * the SUBSCRIBE command family, which (currently) have a push message instead of a proper reply. * The check for executing_client also avoids affecting push messages that are part of eviction. * Check CLIENT_PUSHING first to avoid race conditions, as it's absent in module's fake client. 
*/ - int defer_push_message = c->flag.pushing && c == server.current_client && server.executing_client && - !cmdHasPushAsReply(server.executing_client->cmd); + int defer_push_message = c->flag.pushing && c == getCurrentClient() && getExecutingClient() && + !cmdHasPushAsReply(getExecutingClient()->cmd); if (defer_push_message == 0 && isDeferredReplyEnabled(c)) { _addReplyProtoToList(c, c->deferred_reply, s, len); return; @@ -757,24 +761,9 @@ void addReplyErrorLength(client *c, const char *s, size_t len) { addReplyProto(c, "\r\n", 2); } -/* Do some actions after an error reply was sent (Log if needed, updates stats, etc.) - * Possible flags: - * * ERR_REPLY_FLAG_NO_STATS_UPDATE - indicate not to update any error stats. */ -void afterErrorReply(client *c, const char *s, size_t len, int flags) { - /* Module clients fall into two categories: - * Calls to RM_Call, in which case the error isn't being returned to a client, so should not be counted. - * Module thread safe context calls to RM_ReplyWithError, which will be added to a real client by the main thread - * later. */ - if (c->flag.module) { - if (!c->deferred_reply_errors) { - c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); - } - listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); - return; - } - - commitDeferredReplyBuffer(c, 1); +/* Updates some global error stats. This function is called + * from afterErrorReply and afterErrorReplyDelayed */ +void afterErrorReplyStatsUpdate(client *c, const char *s, size_t len, int flags) { if (!(flags & ERR_REPLY_FLAG_NO_STATS_UPDATE)) { /* Increment the global error counter */ server.stat_total_error_replies++; @@ -788,7 +777,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { /* If we cannot retrieve the error prefix, use the default: "ERR". 
*/ if (spaceloc) { const size_t errEndPos = (size_t)(spaceloc - s); - err_prefix = (char *)s + 1; + err_prefix = (char *)(s) + 1; prefix_len = errEndPos - 1; } } @@ -807,6 +796,51 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { * case c->cmd was changed (like in GEOADD). */ c->realcmd->failed_calls++; } +} + +/* Context needed for IO thread to push error stats update job to Main thread */ +typedef struct { + client *c; + const sds s; + size_t len; + int flags; +} delayedErrorStatsUpdateCtx; + +/* Delayed version of 'afterErrorReply' that is pushed by IO threads + * to the queue for main thread to update the error stats of an + * offloaded command */ +void afterErrorReplyDelayed(void *data) { + delayedErrorStatsUpdateCtx *ctx = (delayedErrorStatsUpdateCtx *)data; + afterErrorReplyStatsUpdate(ctx->c, ctx->s, ctx->len, ctx->flags); + incrCommandStatsOnError(ctx->c->cmd, ERROR_COMMAND_FAILED); + sdsfree(ctx->s); +} + +/* Do some actions after an error reply was sent (Log if needed, updates stats, etc.) + * Possible flags: + * * ERR_REPLY_FLAG_NO_STATS_UPDATE - indicate not to update any error stats. */ +void afterErrorReply(client *c, const char *s, size_t len, int flags) { + /* Module clients fall into two categories: + * Calls to RM_Call, in which case the error isn't being returned to a client, so should not be counted. + * Module thread safe context calls to RM_ReplyWithError, which will be added to a real client by the main thread + * later. 
*/ + if (c->flag.module) { + if (!c->deferred_reply_errors) { + c->deferred_reply_errors = listCreate(); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); + } + listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); + return; + } + + /* Postpone error stats updates when running on an IO thread */ + if (!inMainThread()) { + delayedErrorStatsUpdateCtx ctx = {.c = c, .s = sdsnewlen(s, len), .len = len, .flags = flags}; + threadAddDelayedJob(-1, afterErrorReplyDelayed, sizeof(ctx), &ctx); + return; + } + + afterErrorReplyStatsUpdate(c, s, len, flags); /* Sometimes it could be normal that a replica replies to a primary with * an error and this function gets called. Actually the error will never @@ -1800,11 +1834,8 @@ void disconnectReplicas(void) { void unlinkClient(client *c) { listNode *ln; - /* Wait for IO operations to be done before unlinking the client. */ - waitForClientIO(c); - /* If this is marked as current client unset it. */ - if (c->conn && server.current_client == c) server.current_client = NULL; + if (c->conn && isCurrentClient(c)) setCurrentClient(NULL); /* Certain operations must be done only if the client has an active connection. * If the client was already unlinked or if it's a "fake client" the @@ -1848,21 +1879,12 @@ void unlinkClient(client *c) { /* Remove from the list of pending writes if needed. */ if (c->flag.pending_write) { - if (c->io_write_state == CLIENT_IDLE) { - listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); - } else { - listUnlinkNode(server.clients_pending_io_write, &c->clients_pending_write_node); - } + serverAssert(server.clients_pending_write->len > 0); + listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node); c->flag.pending_write = 0; } - /* Remove from the list of pending reads if needed.
*/ serverAssert(c->io_read_state != CLIENT_PENDING_IO && c->io_write_state != CLIENT_PENDING_IO); - if (c->flag.pending_read) { - listUnlinkNode(server.clients_pending_io_read, &c->pending_read_list_node); - c->flag.pending_read = 0; - } - /* When client was just unblocked because of a blocking operation, * remove it from the list of unblocked clients. */ @@ -1875,6 +1897,8 @@ void unlinkClient(client *c) { /* Clear the tracking status. */ if (c->flag.tracking) disableTracking(c); + + ioThreadsOnUnlinkClient(c); } /* Clear the client state to resemble a newly connected client. */ @@ -1925,19 +1949,19 @@ void clearClientConnectionState(client *c) { c->flag.no_evict = 0; } -void freeClient(client *c) { +/* Free the client structure and all the data associated with it. + * Returns 0 if the client was not freed immediately, but scheduled for + * asynchronous freeing, and 1 if the client was freed immediately. */ +int freeClient(client *c) { listNode *ln; /* If a client is protected, yet we need to free it right now, make sure * to at least use asynchronous freeing. */ - if (c->flag.protected || c->flag.protected_rdb_channel) { + if (c->flag.protected || c->flag.protected_rdb_channel || clientIOInProgress(c)) { freeClientAsync(c); - return; + return 0; } - /* Wait for IO operations to be done before proceeding */ - waitForClientIO(c); - /* For connected clients, call the disconnection event of modules hooks. */ if (c->conn) { moduleFireServerEvent(VALKEYMODULE_EVENT_CLIENT_CHANGE, VALKEYMODULE_SUBEVENT_CLIENT_CHANGE_DISCONNECTED, c); @@ -1968,7 +1992,7 @@ void freeClient(client *c) { c->flag.close_asap = 0; c->flag.close_after_reply = 0; replicationCachePrimary(c); - return; + return 0; } } @@ -1993,7 +2017,6 @@ void freeClient(client *c) { c->duration = 0; if (c->flag.blocked) unblockClient(c, 1); - freeClientBlockingState(c); freeClientPubSubData(c); /* Free data structures. 
*/ @@ -2021,6 +2044,7 @@ void freeClient(client *c) { * places where active clients may be referenced. */ unlinkClient(c); + freeClientBlockingState(c); freeClientReplicationData(c); /* Remove client from memory usage buckets */ @@ -2038,6 +2062,7 @@ void freeClient(client *c) { sdsfree(c->peerid); sdsfree(c->sockname); zfree(c); + return 1; } /* Schedule a client to free it at a safe time in the beforeSleep() function. @@ -2108,6 +2133,11 @@ void trimClientQueryBuffer(client *c) { * wait until we're done with all clients. In other words, it can't wait until beforeSleep(). * With IO threads enabled, this function offloads the write to the IO threads if possible. */ void beforeNextClient(client *c) { + if (c->io_command_state != CLIENT_IDLE) { + /* If the client command was offloaded to IO threads, we need to wait for it to finish */ + return; + } + /* Notice, this code is also called from 'processUnblockedClients'. * But in case of a module blocked client (see RM_Call 'K' flag) we do not reach this code path. * So whenever we change the code here we need to consider if we need this change on module @@ -2184,6 +2214,8 @@ int freeClientsInAsyncFreeQueue(void) { c->flag.protected_rdb_channel = 0; } + if (clientIOInProgress(c)) continue; + if (c->flag.protected) continue; c->flag.close_asap = 0; @@ -2954,18 +2986,13 @@ parseResult handleParseResults(client *c) { * This function handles various post-write tasks, including updating client state, * allow_async_writes - A flag indicating whether I/O threads can handle pending writes for this client. * returns 1 if processing completed successfully, 0 if processing is skipped. */ -int processClientIOWriteDone(client *c, int allow_async_writes) { - /* memory barrier acquire to get the latest client state */ - atomic_thread_fence(memory_order_acquire); - /* If a client is protected, don't proceed to check the write results as it may trigger conn close. 
*/ - if (c->flag.protected) return 0; - - listUnlinkNode(server.clients_pending_io_write, &c->clients_pending_write_node); - c->flag.pending_write = 0; +void processClientIOWriteDone(client *c, int allow_async_writes) { + if (c->io_write_state == CLIENT_IDLE) return; /* Already handled */ + serverAssert(c->io_write_state == CLIENT_COMPLETED_IO); c->io_write_state = CLIENT_IDLE; /* Don't post-process-writes to clients that are going to be closed anyway. */ - if (c->flag.close_asap) return 0; + if (c->flag.close_asap) return; /* Update processed count on server */ server.stat_io_writes_processed += 1; @@ -2973,48 +3000,21 @@ int processClientIOWriteDone(client *c, int allow_async_writes) { connSetPostponeUpdateState(c->conn, 0); connUpdateState(c->conn); if (postWriteToClient(c) == C_ERR) { - return 1; - } - - if (clientHasPendingReplies(c)) { - if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { - /* Install the write handler if there are pending writes in some of the clients as a result of not being - * able to write everything in one go. */ - installClientWriteHandler(c); - } else { - /* If we can send the client to the I/O thread, let it handle the write. */ - if (allow_async_writes && trySendWriteToIOThreads(c) == C_OK) return 1; - /* Try again in the next eventloop */ - putClientInPendingWriteQueue(c); - } + return; } - return 1; -} - -/* This function handles the post-processing of I/O write operations that have been - * completed for clients. It iterates through the list of clients with pending I/O - * writes and performs necessary actions based on their current state. - * - * Returns The number of clients processed during this function call. 
*/ -int processIOThreadsWriteDone(void) { - if (listLength(server.clients_pending_io_write) == 0) return 0; - int processed = 0; - listNode *ln; + if (!clientHasPendingReplies(c)) return; - listNode *next = listFirst(server.clients_pending_io_write); - while (next) { - ln = next; - next = listNextNode(ln); - client *c = listNodeValue(ln); - - /* Client is still waiting for a pending I/O - skip it */ - if (c->io_write_state == CLIENT_PENDING_IO || c->io_read_state == CLIENT_PENDING_IO) continue; - - processed += processClientIOWriteDone(c, 1); + if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) { + /* Install the write handler if there are pending writes in some of the clients as a result of not being + * able to write everything in one go. */ + installClientWriteHandler(c); + } else { + /* If we can send the client to the I/O thread, let it handle the write. */ + if (allow_async_writes && trySendWriteToIOThreads(c) == C_OK) return; + /* Try again in the next eventloop */ + putClientInPendingWriteQueue(c); } - - return processed; } /* This function is called just before entering the event loop, in the hope @@ -3035,8 +3035,7 @@ int handleClientsWithPendingWrites(void) { listRewind(server.clients_pending_write, &li); while ((ln = listNext(&li))) { client *c = listNodeValue(ln); - c->flag.pending_write = 0; - listUnlinkNode(server.clients_pending_write, ln); + serverAssert(c->flag.pending_write); /* If a client is protected, don't do anything, * that may trigger write error or recreate handler. */ @@ -3045,6 +3044,14 @@ int handleClientsWithPendingWrites(void) { /* Don't write to clients that are going to be closed anyway. */ if (c->flag.close_asap) continue; + if (c->io_command_state != CLIENT_IDLE) { + /* If the client is in the middle of an I/O command, we can't write to it yet. 
*/ + continue; + } + + c->flag.pending_write = 0; + listUnlinkNode(server.clients_pending_write, ln); + if (!clientHasPendingReplies(c)) continue; /* If we can send the client to the I/O thread, let it handle the write. */ @@ -3132,8 +3139,7 @@ void initSharedQueryBuf(void) { sdsclear(thread_shared_qb); } -void freeSharedQueryBuf(void *dummy) { - UNUSED(dummy); +void freeSharedQueryBuf(void) { sdsfree(thread_shared_qb); thread_shared_qb = NULL; } @@ -3550,24 +3556,27 @@ void commandProcessed(client *c) { * of processing the command, otherwise C_OK is returned. */ int processCommandAndResetClient(client *c) { int deadclient = 0; - client *old_client = server.current_client; - server.current_client = c; + client *old_client = getCurrentClient(); + setCurrentClient(c); if (processCommand(c) == C_OK) { + if (c->io_command_state != CLIENT_IDLE) { + return C_OK; + } commandProcessed(c); /* Update the client's memory to include output buffer growth following the * processed command. */ if (c->conn) updateClientMemUsageAndBucket(c); } - if (server.current_client == NULL) deadclient = 1; + if (getCurrentClient() == NULL) deadclient = 1; /* * Restore the old client, this is needed because when a script * times out, we will get into this code from processEventsWhileBlocked. - * Which will cause to set the server.current_client. If not restored + * Which will cause to set the current_client. If not restored * we will return 1 to our caller which will falsely indicate the client * is dead and will stop reading from its buffer. */ - server.current_client = old_client; + setCurrentClient(old_client); /* performEvictions may flush replica output buffers. This may * result in a replica, that may be the active client, to be * freed. */ @@ -3855,11 +3864,11 @@ char *getClientSockname(client *c) { int isClientConnIpV6(client *c) { /* The cached client peer id is on the form "[IPv6]:port" for IPv6 * addresses, so we just check for '[' here. 
*/ - if (c->flag.fake && server.current_client) { + if (c->flag.fake && getCurrentClient()) { /* Fake client? Use current client instead. - * Noted that in here we are assuming server.current_client is set + * Noted that in here we are assuming current_client is set * and real (aof has already violated this in loadSingleAppendOnlyFil). */ - c = server.current_client; + c = getCurrentClient(); } return getClientPeerId(c)[0] == '['; } @@ -4315,7 +4324,7 @@ static int clientMatchesFilter(client *client, clientFilter *client_filter) { if (client_filter->type != -1 && getClientType(client) != client_filter->type) return 0; if (client_filter->ids && !intsetFind(client_filter->ids, client->id)) return 0; if (client_filter->user && client->user != client_filter->user) return 0; - if (client_filter->skipme && client == server.current_client) return 0; + if (client_filter->skipme && client == getCurrentClient()) return 0; if (client_filter->max_age != 0 && (long long)(commandTimeSnapshot() / 1000 - client->ctime) < client_filter->max_age) return 0; if (client_filter->idle != 0 && (long long)(commandTimeSnapshot() / 1000 - client->last_interaction) < client_filter->idle) return 0; if (client_filter->flags && clientMatchesFlagFilter(client, client_filter->flags) == 0) return 0; @@ -4754,7 +4763,7 @@ void clientUnblockCommand(client *c) { * doesn't have a timeout callback (even in the case of UNBLOCK ERROR). 
* The reason is that we assume that if a command doesn't expect to be timedout, * it also doesn't expect to be unblocked by CLIENT UNBLOCK */ - if (target && target->flag.blocked && blockedClientMayTimeout(target)) { + if (target && target->flag.blocked && blockedClientMayTimeout(target) && target->bstate->btype != BLOCKED_SLOT) { if (unblock_error) unblockClientOnError(target, "-UNBLOCKED client unblocked via CLIENT UNBLOCK"); else @@ -5757,91 +5766,59 @@ int postponeClientRead(client *c) { return (trySendReadToIOThreads(c) == C_OK); } -int processIOThreadsReadDone(void) { +void processClientIOReadsDone(client *c) { + serverAssert(c->io_read_state == CLIENT_COMPLETED_IO); + if (ProcessingEventsWhileBlocked) { /* When ProcessingEventsWhileBlocked we may call processIOThreadsReadDone recursively. * In this case, there may be some clients left in the batch waiting to be processed. */ processClientsCommandsBatch(); } - if (listLength(server.clients_pending_io_read) == 0) return 0; - int processed = 0; - listNode *ln; - - listNode *next = listFirst(server.clients_pending_io_read); - while (next) { - ln = next; - next = listNextNode(ln); - client *c = listNodeValue(ln); - - /* Client is still waiting for a pending I/O - skip it */ - if (c->io_write_state == CLIENT_PENDING_IO || c->io_read_state == CLIENT_PENDING_IO) continue; - /* If the write job is done, process it ASAP to free the buffer and handle connection errors */ - if (c->io_write_state == CLIENT_COMPLETED_IO) { - int allow_async_writes = 0; /* Don't send writes for the client to IO threads before processing the reads */ - processClientIOWriteDone(c, allow_async_writes); - } - /* memory barrier acquire to get the updated client state */ - atomic_thread_fence(memory_order_acquire); - - listUnlinkNode(server.clients_pending_io_read, ln); - c->flag.pending_read = 0; - c->io_read_state = CLIENT_IDLE; - - /* Don't post-process-reads from clients that are going to be closed anyway. 
*/ - if (c->flag.close_asap) continue; - - /* If a client is protected, don't do anything, - * that may trigger read/write error or recreate handler. */ - if (c->flag.protected) continue; + c->flag.pending_read = 0; + c->io_read_state = CLIENT_IDLE; - processed++; - server.stat_io_reads_processed++; + /* Don't post-process-reads from clients that are going to be closed anyway. */ + if (c->flag.close_asap) return; - /* Save the current conn state, as connUpdateState may modify it */ - int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING); - connSetPostponeUpdateState(c->conn, 0); - connUpdateState(c->conn); + /* If a client is protected, don't do anything, + * that may trigger read/write error or recreate handler. */ + if (c->flag.protected) return; - /* In accept state, no client's data was read - stop here. */ - if (in_accept_state) continue; + /* Save the current conn state, as connUpdateState may modify it */ + int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING); + connSetPostponeUpdateState(c->conn, 0); + connUpdateState(c->conn); - /* On read error - stop here. */ - if (handleReadResult(c) == C_ERR) { - continue; - } + /* In accept state, no client's data was read - stop here. */ + if (in_accept_state) return; - if (!(c->read_flags & READ_FLAGS_DONT_PARSE)) { - parseResult res = handleParseResults(c); - /* On parse error - stop here. */ - if (res == PARSE_ERR) { - continue; - } else if (res == PARSE_NEEDMORE) { - beforeNextClient(c); - continue; - } - } - - if (c->argc > 0) { - c->flag.pending_command = 1; - } + /* On read error - stop here. 
*/ + if (handleReadResult(c) == C_ERR) { + return; + } - size_t list_length_before_command_execute = listLength(server.clients_pending_io_read); - /* try to add the command to the batch */ - int ret = addCommandToBatchAndProcessIfFull(c); - /* If the command was not added to the commands batch, process it immediately */ - if (ret == C_ERR) { - if (processPendingCommandAndInputBuffer(c) == C_OK) beforeNextClient(c); - } - if (list_length_before_command_execute != listLength(server.clients_pending_io_read)) { - /* A client was unlink from the list possibly making the next node invalid */ - next = listFirst(server.clients_pending_io_read); + if (!(c->read_flags & READ_FLAGS_DONT_PARSE)) { + parseResult res = handleParseResults(c); + /* On parse error - stop here. */ + if (res == PARSE_ERR) { + return; + } else if (res == PARSE_NEEDMORE) { + beforeNextClient(c); + return; } } - processClientsCommandsBatch(); + if (c->argc > 0) { + c->flag.pending_command = 1; + } - return processed; + /* try to add the command to the batch */ + int ret = addCommandToBatchAndProcessIfFull(c); + /* If the command was not added to the commands batch, process it immediately */ + if (ret == C_ERR) { + if (processPendingCommandAndInputBuffer(c) == C_OK) beforeNextClient(c); + } } /* Returns the actual client eviction limit based on current configuration or @@ -5879,11 +5856,21 @@ void evictClients(void) { listNode *ln = listNext(&bucket_iter); if (ln) { client *c = ln->value; + if (c->flag.close_asap) { + /* We don't want to continue evicting clients in this case + * since it can cause multiple clients to be evicted unnecessarily */ + break; + } sds ci = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "Evicting client: %s", ci); - freeClient(c); sdsfree(ci); server.stat_evictedclients++; + if (freeClient(c) == 0) { + /* The client is protected and will be closed later.
+ * We don't want to continue evicting clients in this case + * since it can cause multiple clients to be evicted unnecessarily */ + break; + } } else { curr_bucket--; if (curr_bucket < 0) { @@ -5895,8 +5882,6 @@ void evictClients(void) { } } -/* IO threads functions */ - void ioThreadReadQueryFromClient(void *data) { client *c = data; serverAssert(c->io_read_state == CLIENT_PENDING_IO); @@ -5904,6 +5889,10 @@ void ioThreadReadQueryFromClient(void *data) { /* Read */ readToQueryBuf(c); + if (c->flag.close_asap) { + goto done; + } + /* Check for read errors. */ if (c->nread <= 0) { goto done; @@ -5940,8 +5929,8 @@ void ioThreadReadQueryFromClient(void *data) { if (!(c->read_flags & READ_FLAGS_PRIMARY)) { trimClientQueryBuffer(c); } - atomic_thread_fence(memory_order_release); c->io_read_state = CLIENT_COMPLETED_IO; + threadRespond(c, R_READ); } void ioThreadWriteToClient(void *data) { @@ -5953,7 +5942,50 @@ void ioThreadWriteToClient(void *data) { } else { _writeToClient(c); } + c->io_write_state = CLIENT_COMPLETED_IO; + threadRespond(c, R_WRITE); +} + +void ioThreadProcessCommand(void *data) { + client *c = (client *)data; + serverAssert(c->cmd->flags & CMD_CAN_BE_OFFLOADED); + const long long call_timer = ustime(); + c->flag.executing_command = 1; + setCurrentClient(c); + setExecutingClient(c); + + monotime monotonic_start = 0; + if (monotonicGetType() == MONOTONIC_CLOCK_HW) { + monotonic_start = getMonotonicUs(); + } + + /* Execute the command */ + c->cmd->proc(c); + + c->flag.executing_command = 0; + + ustime_t duration; + if (monotonicGetType() == MONOTONIC_CLOCK_HW) + duration = getMonotonicUs() - monotonic_start; + else + duration = ustime() - call_timer; + + c->duration += duration; + + /* Send write response to the client */ + c->nwritten = 0; + c->write_flags = 0; + /* Set the reply block and bufpos */ + c->io_last_reply_block = listLast(c->reply); + if (c->io_last_reply_block) { + c->io_last_bufpos = ((clientReplyBlock
*)listNodeValue(c->io_last_reply_block))->used; + } else { + c->io_last_bufpos = (size_t)c->bufpos; + } + + _writeToClient(c); - atomic_thread_fence(memory_order_release); + c->io_command_state = CLIENT_COMPLETED_IO; c->io_write_state = CLIENT_COMPLETED_IO; + threadRespond(c, R_COMMAND); } diff --git a/src/notify.c b/src/notify.c index bec0af2844..e997fcb6e0 100644 --- a/src/notify.c +++ b/src/notify.c @@ -107,7 +107,7 @@ void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid) { robj *chanobj, *eventobj; int len = -1; char buf[24]; - client *c = server.executing_client; + client *c = getExecutingClient(); debugServerAssert(moduleNotifyKeyspaceSubscribersCnt() == 0 || (type & (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_STREAM)) == 0 || c == NULL || diff --git a/src/object.c b/src/object.c index 36715429b5..fbe1460526 100644 --- a/src/object.c +++ b/src/object.c @@ -781,8 +781,9 @@ void trimStringObjectIfNeeded(robj *o, int trim_small_values) { * 2. When utilizing the argument caching mechanism in Lua. * 3. When calling from RM_TrimStringAllocation (trim_small_values is true). */ size_t len = sdslen(o->ptr); + client *executing_client = getExecutingClient(); if (len >= PROTO_MBULK_BIG_ARG || trim_small_values || - (server.executing_client && server.executing_client->flag.script && len < LUA_CMD_OBJCACHE_MAX_LEN)) { + (executing_client && executing_client->flag.script && len < LUA_CMD_OBJCACHE_MAX_LEN)) { if (sdsavail(o->ptr) > len / 10) { o->ptr = sdsRemoveFreeSpace(o->ptr, 0); } diff --git a/src/rdb.c b/src/rdb.c index e8bf2e25bd..51cd4d9c21 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -65,7 +65,7 @@ #define rdbReportReadError(...) rdbReportError(0, __LINE__, __VA_ARGS__) /* This macro tells if we are in the context of a RESTORE command, and not loading an RDB or AOF. */ -#define isRestoreContext() ((server.current_client == NULL || server.current_client->id == CLIENT_ID_AOF) ? 
0 : 1) +#define isRestoreContext() ((getCurrentClient() == NULL || getCurrentClient()->id == CLIENT_ID_AOF) ? 0 : 1) char *rdbFileBeingLoaded = NULL; /* used for rdb checking on read error */ extern int rdbCheckMode; @@ -1881,9 +1881,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { /* Skip sanitization when loading (an RDB), or getting a RESTORE command * from either the primary or a client using an ACL user with the skip-sanitize-payload flag. */ - int skip = server.loading || (server.current_client && (server.current_client->flag.primary)); - if (!skip && server.current_client && server.current_client->user) - skip = !!(server.current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); + int skip = server.loading || (getCurrentClient() && (getCurrentClient()->flag.primary)); + if (!skip && getCurrentClient() && getCurrentClient()->user) + skip = !!(getCurrentClient()->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); deep_integrity_validation = !skip; } diff --git a/src/server.c b/src/server.c index e0ccaac5ec..171c57e63a 100644 --- a/src/server.c +++ b/src/server.c @@ -93,6 +93,9 @@ struct sharedObjectsStruct shared; +__thread client *_current_client; /* The client that triggered the command execution (External or AOF). */ +__thread client *_executing_client; /* The client executing the current command (possibly script or module). */ + /* Global vars that are actually used as constants. The following double * values are used for double on-disk serialization, and are initialized * at runtime to avoid strange compiler optimizations. */ @@ -114,7 +117,7 @@ const char *replstateToString(int replstate); /*============================ Utility functions ============================ */ /* This macro tells if we are in the context of loading an AOF. */ -#define isAOFLoadingContext() ((server.current_client && server.current_client->id == CLIENT_ID_AOF) ? 
1 : 0) +#define isAOFLoadingContext() ((getCurrentClient() && getCurrentClient()->id == CLIENT_ID_AOF) ? 1 : 0) /* We use a private localtime implementation which is fork-safe. The logging * function of the server may be called from other threads. */ @@ -1482,6 +1485,10 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa UNUSED(id); UNUSED(clientData); + if (isServerCronDelayed()) { + return AE_NOMORE; + } + /* Software watchdog: deliver the SIGALRM that will reach the signal * handler if we don't return here fast enough. */ if (server.watchdog_period) watchdogScheduleSignal(server.watchdog_period); @@ -1803,14 +1810,14 @@ void beforeSleep(struct aeEventLoop *eventLoop) { * events to handle. */ if (ProcessingEventsWhileBlocked) { uint64_t processed = 0; - processed += processIOThreadsReadDone(); + processed += processIOThreadsResponses(); processed += connTypeProcessPendingData(); if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) flushAppendOnlyFile(0); processed += handleClientsWithPendingWrites(); int last_processed = 0; do { /* Try to process all the pending IO events. */ - last_processed = processIOThreadsReadDone() + processIOThreadsWriteDone(); + last_processed = processIOThreadsResponses(); processed += last_processed; } while (last_processed != 0); processed += freeClientsInAsyncFreeQueue(); @@ -1819,7 +1826,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { } /* We should handle pending reads clients ASAP after event loop. */ - processIOThreadsReadDone(); + processIOThreadsResponses(); /* Handle pending data(typical TLS). (must be done before flushAppendOnlyFile) */ connTypeProcessPendingData(); @@ -1914,11 +1921,9 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Try to process more IO reads that are ready to be processed. 
*/ if (server.aof_fsync != AOF_FSYNC_ALWAYS) { - processIOThreadsReadDone(); + processIOThreadsResponses(); } - processIOThreadsWriteDone(); - /* Record cron time in beforeSleep. This does not include the time consumed by AOF writing and IO writing above. */ monotime cron_start_time_after_write = getMonotonicUs(); @@ -2721,6 +2726,7 @@ void resetServerStats(void) { server.stat_sync_partial_ok = 0; server.stat_sync_partial_err = 0; server.stat_io_reads_processed = 0; + server.stat_io_commands_processed = 0; server.stat_total_reads_processed = 0; server.stat_io_writes_processed = 0; server.stat_io_freed_objects = 0; @@ -2729,6 +2735,7 @@ void resetServerStats(void) { server.stat_total_writes_processed = 0; server.stat_client_qbuf_limit_disconnections = 0; server.stat_client_outbuf_limit_disconnections = 0; + server.stat_delayed_jobs_processed = 0; for (j = 0; j < STATS_METRIC_COUNT; j++) { server.inst_metric[j].idx = 0; server.inst_metric[j].last_sample_base = 0; @@ -2821,7 +2828,8 @@ void initServer(void) { server.rdb_pipe_read = -1; server.rdb_child_exit_pipe = -1; server.main_thread_id = pthread_self(); - server.current_client = NULL; + setCurrentClient(NULL); + setExecutingClient(NULL); server.errors = raxNew(); server.execution_nesting = 0; server.clients = listCreate(); @@ -2832,8 +2840,6 @@ void initServer(void) { server.replicas_waiting_psync = raxNew(); server.wait_before_rdb_client_free = DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE; server.clients_pending_write = listCreate(); - server.clients_pending_io_write = listCreate(); - server.clients_pending_io_read = listCreate(); server.clients_timeout_table = raxNew(); server.replication_allowed = 1; server.replicas_eldb = -1; /* Force to emit the first SELECT command. */ @@ -2874,6 +2880,10 @@ void initServer(void) { serverLog(LL_WARNING, "Failed creating the event loop. 
Error message: '%s'", strerror(errno)); exit(1); } + /* Set the epoll batch size for the server event loop */ + server.el->epoll_batch_size = AE_EPOLL_EVENTS_BATCH_SIZE; + + aeSetPrefetchProc(server.el, prefetchEvents); server.dbnum = server.cluster_enabled ? server.config_databases_cluster : server.config_databases; server.db = zcalloc(sizeof(serverDb *) * server.dbnum); @@ -3604,8 +3614,8 @@ static void propagatePendingCommands(void) { /* In case a command that may modify random keys was run *directly* * (i.e. not from within a script, MULTI/EXEC, RM_Call, etc.) we want * to avoid using a transaction (much like active-expire) */ - if (server.current_client && server.current_client->cmd && - server.current_client->cmd->flags & CMD_TOUCHES_ARBITRARY_KEYS) { + if (getCurrentClient() && getCurrentClient()->cmd && + getCurrentClient()->cmd->flags & CMD_TOUCHES_ARBITRARY_KEYS) { transaction = 0; } @@ -3724,8 +3734,8 @@ void call(client *c, int flags) { struct ClientFlags client_old_flags = c->flag; struct serverCommand *real_cmd = c->realcmd; - client *prev_client = server.executing_client; - server.executing_client = c; + client *prev_client = getExecutingClient(); + setExecutingClient(c); /* When call() is issued during loading the AOF we don't want commands called * from module, exec or LUA to go into the commandlog or to populate statistics. */ @@ -3900,17 +3910,17 @@ void call(client *c, int flags) { /* We use the tracking flag of the original external client that * triggered the command, but we take the keys from the actual command * being executed. 
*/ - if (server.current_client && (server.current_client->flag.tracking) && - !(server.current_client->flag.tracking_bcast)) { - trackingRememberKeys(server.current_client, c); + if (getCurrentClient() && (getCurrentClient()->flag.tracking) && + !(getCurrentClient()->flag.tracking_bcast)) { + trackingRememberKeys(getCurrentClient(), c); } } if (!c->flag.blocked) { - /* Modules may call commands in cron, in which case server.current_client + /* Modules may call commands in cron, in which case current_client * is not set. */ - if (server.current_client) { - server.current_client->commands_processed++; + if (getCurrentClient()) { + getCurrentClient()->commands_processed++; } server.stat_numcommands++; } @@ -3933,7 +3943,7 @@ void call(client *c, int flags) { server.client_pause_in_transaction = 0; } - server.executing_client = prev_client; + setExecutingClient(prev_client); } /* Used when a command that is ready for execution needs to be rejected, due to @@ -4231,6 +4241,8 @@ int processCommand(client *c) { } } + if (!postponeClientCommand(c)) return C_OK; + if (!server.cluster_enabled && c->capa & CLIENT_CAPA_REDIRECT && server.primary_host && !obey_client && (is_write_command || (is_read_command && !c->flag.readonly))) { if (server.failover_state == FAILOVER_IN_PROGRESS) { @@ -4271,7 +4283,7 @@ int processCommand(client *c) { * before key eviction, after the last command was executed and consumed * some client output buffer memory. */ evictClients(); - if (server.current_client == NULL) { + if (getCurrentClient() == NULL) { /* If we evicted ourself then abort processing the command */ return C_ERR; } @@ -4293,7 +4305,7 @@ int processCommand(client *c) { /* performEvictions may flush replica output buffers. This may result * in a replica, that may be the active client, to be freed. 
*/ - if (server.current_client == NULL) return C_ERR; + if (getCurrentClient() == NULL) return C_ERR; if (out_of_memory && is_denyoom_command) { rejectCommand(c, shared.oomerr); @@ -4431,6 +4443,9 @@ int processCommand(client *c) { addReply(c, shared.queued); } else { int flags = CMD_CALL_FULL; + if (trySendProcessCommandToIOThreads(c) == C_OK) { + return C_OK; + } call(c, flags); if (listLength(server.ready_keys) && !isInsideYieldingLongCommand()) handleClientsBlockedOnKeys(); } @@ -6077,10 +6092,17 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed, "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed, "io_threaded_freed_objects:%lld\r\n", server.stat_io_freed_objects, + "io_threaded_reads_pending:%lld\r\n", server.stat_io_reads_pending, + "io_threaded_writes_pending:%lld\r\n", server.stat_io_writes_pending, + "io_threaded_commands_pending:%lld\r\n", server.stat_io_commands_pending, + "io_threaded_commands_processed:%lld\r\n", server.stat_io_commands_processed, "io_threaded_accept_processed:%lld\r\n", server.stat_io_accept_offloaded, "io_threaded_poll_processed:%lld\r\n", server.stat_poll_processed_by_io_threads, "io_threaded_total_prefetch_batches:%lld\r\n", server.stat_total_prefetch_batches, "io_threaded_total_prefetch_entries:%lld\r\n", server.stat_total_prefetch_entries, + "io_threaded_clients_blocked_on_slot:%lld\r\n", server.stat_io_threaded_clients_blocked_on_slot, + "io_threaded_clients_blocked_total:%lld\r\n", server.stat_io_threaded_clients_blocked_total, + "io_threaded_postponed_jobs_to_mainthread:%lld\r\n", server.stat_delayed_jobs_processed, "client_query_buffer_limit_disconnections:%lld\r\n", server.stat_client_qbuf_limit_disconnections, "client_output_buffer_limit_disconnections:%lld\r\n", server.stat_client_outbuf_limit_disconnections, "reply_buffer_shrinks:%lld\r\n", server.stat_reply_buffer_shrinks, diff --git 
a/src/server.h b/src/server.h index 93417930d2..f5d56b6cc6 100644 --- a/src/server.h +++ b/src/server.h @@ -82,6 +82,7 @@ typedef long long ustime_t; /* microsecond time type. */ #include "rax.h" /* Radix tree */ #include "connection.h" /* Connection abstraction */ #include "memory_prefetch.h" +#include "io_threads.h" #include "trace/trace.h" #ifdef USE_LTTNG @@ -251,6 +252,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CMD_ALLOW_BUSY ((1ULL << 26)) #define CMD_MODULE_GETCHANNELS (1ULL << 27) /* Use the modules getchannels interface. */ #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28) +#define CMD_CAN_BE_OFFLOADED (1ULL << 29) /* Command can be offloaded to worker IO threads. */ /* Command flags. Please don't forget to add command flag documentation in struct * serverCommand in this file. */ @@ -366,6 +368,7 @@ typedef enum blocking_type { BLOCKED_ZSET, /* BZPOP et al. */ BLOCKED_POSTPONE, /* Blocked by processCommand, re-try processing later. */ BLOCKED_SHUTDOWN, /* SHUTDOWN. */ + BLOCKED_SLOT, /* Blocked on slot due to command offloading */ BLOCKED_NUM, /* Number of blocked states. */ BLOCKED_END /* End of enumeration */ } blocking_type; @@ -698,6 +701,8 @@ typedef enum { AE_IO_STATE_DONE } AeIoState; +#define AE_EPOLL_EVENTS_BATCH_SIZE 200 /* Default batch size for epoll_wait */ + /*----------------------------------------------------------------------------- * Data types *----------------------------------------------------------------------------*/ @@ -918,9 +923,11 @@ typedef struct blockingState { which is opaque for the Redis core, only handled in module.c. */ - void *async_rm_call_handle; /* ValkeyModuleAsyncRMCallPromise structure. - which is opaque for the Redis core, only - handled in module.c. */ + void *async_rm_call_handle; /* ValkeyModuleAsyncRMCallPromise structure. + which is opaque for the Redis core, only + handled in module.c. 
*/ + void *slot_pending_list; /* Pending clients queue on which the client is blocked while waiting for busy slot */ + listNode pending_client_node; /* Node in clients pending queue */ } blockingState; /* The following structure represents a node in the server.ready_keys list, @@ -1203,12 +1210,31 @@ typedef struct LastWrittenBuf { } LastWrittenBuf; typedef struct client { + volatile uint8_t io_read_state; /* Indicate the IO read state of the client */ + volatile uint8_t io_write_state; /* Indicate the IO write state of the client */ + volatile uint8_t io_command_state; /* Indicate the IO command state of the client */ + ustime_t duration; /* Current command duration. Used for measuring latency of blocking/non-blocking cmds */ + robj **original_argv; /* Arguments of original command if arguments were rewritten. */ + unsigned long long net_input_bytes_curr_cmd; /* Total network input bytes read for the* execution of this client's current command. */ + unsigned long long net_output_bytes_curr_cmd; /* Total network output bytes sent to this client, by the current command. */ + unsigned long long commands_processed; /* Total count of commands this client executed. */ + + + int original_argc; /* Num of arguments of original command if arguments were rewritten. */ + unsigned long long net_output_bytes; /* Total network output bytes sent to this client. */ /* Basic client information and connection. */ uint64_t id; /* Client incremental unique ID. */ connection *conn; /* Input buffer and command parsing fields */ - sds querybuf; /* Buffer we use to accumulate client queries. */ - size_t qb_pos; /* The position we have read in querybuf. */ + sds querybuf; /* Buffer we use to accumulate client queries. */ + size_t qb_pos; /* The position we have read in querybuf. */ + /* Client flags and state indicators */ + union { + uint64_t raw_flag; + struct ClientFlags flag; + }; + list *reply; /* List of reply objects to send to the client. 
*/ + int slot; /* The slot the client is executing against. Set to -1 if no slot is being used */ robj **argv; /* Arguments of current command. */ int argc; /* Num of arguments of current command. */ int argv_len; /* Size of argv array (may be more than argc) */ @@ -1231,10 +1257,9 @@ typedef struct client { multiState *mstate; /* MULTI/EXEC state, lazily initialized when first needed */ blockingState *bstate; /* Blocking state, lazily initialized when first needed */ /* Output buffer and reply handling */ - long duration; /* Current command duration. Used for measuring latency of blocking/non-blocking cmds */ + char *buf; /* Output buffer */ size_t buf_usable_size; /* Usable size of buffer. */ - list *reply; /* List of reply objects to send to the client. */ listNode *io_last_reply_block; /* Last client reply block when sent to IO thread */ size_t io_last_bufpos; /* The client's bufpos at the time it was sent to the IO thread */ LastWrittenBuf io_last_written; /* Track state for last written buffer */ @@ -1242,37 +1267,28 @@ typedef struct client { listNode clients_pending_write_node; /* list node in clients_pending_write or in clients_pending_io_write list */ size_t bufpos; payloadHeader *last_header; /* Pointer to the last header in a buffer when using copy avoidance */ - int original_argc; /* Num of arguments of original command if arguments were rewritten. */ - robj **original_argv; /* Arguments of original command if arguments were rewritten. */ - /* Client flags and state indicators */ - union { - uint64_t raw_flag; - struct ClientFlags flag; - }; - uint16_t write_flags; /* Client Write flags - used to communicate the client write state. */ - volatile uint8_t io_read_state; /* Indicate the IO read state of the client */ - volatile uint8_t io_write_state; /* Indicate the IO write state of the client */ - uint8_t resp; /* RESP protocol version. Can be 2 or 3. 
*/ - uint8_t cur_tid; /* ID of IO thread currently performing IO for this client */ + + + uint16_t write_flags; /* Client Write flags - used to communicate the client write state. */ + + + uint8_t resp; /* RESP protocol version. Can be 2 or 3. */ + uint8_t cur_tid; /* ID of IO thread currently performing IO for this client */ /* In updateClientMemoryUsage() we track the memory usage of * each client and add it to the sum of all the clients of a given type, * however we need to remember what was the old contribution of each * client, and in which category the client was, in order to remove it * before adding it the new value. */ uint8_t last_memory_type; - uint8_t capa; /* Client capabilities: CLIENT_CAPA* macros. */ - listNode pending_read_list_node; /* IO thread only ?*/ + uint8_t capa; /* Client capabilities: CLIENT_CAPA* macros. */ /* Statistics and metrics */ - unsigned long long net_input_bytes; /* Total network input bytes read from this client. */ - unsigned long long net_input_bytes_curr_cmd; /* Total network input bytes read for the* execution of this client's current command. */ - unsigned long long net_output_bytes; /* Total network output bytes sent to this client. */ - unsigned long long commands_processed; /* Total count of commands this client executed. */ - unsigned long long net_output_bytes_curr_cmd; /* Total network output bytes sent to this client, by the current command. */ - size_t buf_peak; /* Peak used size of buffer in last 5 sec interval. */ - int nwritten; /* Number of bytes of the last write. */ - int nread; /* Number of bytes of the last read. */ - int read_flags; /* Client Read flags - used to communicate the client read state. */ - int slot; /* The slot the client is executing against. Set to -1 if no slot is being used */ + unsigned long long net_input_bytes; /* Total network input bytes read from this client. */ + + + size_t buf_peak; /* Peak used size of buffer in last 5 sec interval. 
*/ + int nwritten; /* Number of bytes of the last write. */ + int nread; /* Number of bytes of the last read. */ + int read_flags; /* Client Read flags - used to communicate the client read state. */ listNode *mem_usage_bucket_node; clientMemUsageBucket *mem_usage_bucket; /* In updateClientMemoryUsage() we track the memory usage of @@ -1303,6 +1319,15 @@ typedef struct client { #endif } client; +extern __thread client *_current_client; /* The client that triggered the command execution (External or AOF). */ +extern __thread client *_executing_client; /* The client executing the current command (possibly script or module). */ + +#define getCurrentClient() (_current_client) +#define setCurrentClient(c) (_current_client = (c)) +#define getExecutingClient() (_executing_client) +#define setExecutingClient(c) (_executing_client = (c)) +#define isCurrentClient(c) ((c) == getCurrentClient()) + /* When a command generates a lot of discrete elements to the client output buffer, it is much faster to * skip certain types of initialization. This type is used to indicate a client that has been initialized * and can be used with addWritePreparedReply* functions. A client can be cast into this type with @@ -1657,16 +1682,12 @@ struct valkeyServer { list *clients; /* List of active clients */ list *clients_to_close; /* Clients to close asynchronously */ list *clients_pending_write; /* There is to write or install handler. */ - list *clients_pending_io_read; /* List of clients with pending read to be process by I/O threads. */ - list *clients_pending_io_write; /* List of clients with pending write to be process by I/O threads. */ list *replicas, *monitors; /* List of replicas and MONITORs */ rax *replicas_waiting_psync; /* Radix tree for tracking replicas awaiting partial synchronization. 
* Key: RDB client ID * Value: RDB client object * This structure holds dual-channel sync replicas from the start of their * RDB transfer until their main channel establishes partial synchronization. */ - client *current_client; /* The client that triggered the command execution (External or AOF). */ - client *executing_client; /* The client executing the current command (possibly script or module). */ #ifdef LOG_REQ_RES char *req_res_logfile; /* Path of log file for logging all requests and their replies. If NULL, no logging will be @@ -1685,20 +1706,22 @@ struct valkeyServer { uint32_t paused_actions; /* Bitmask of actions that are currently paused */ list *postponed_clients; /* List of postponed clients */ pause_event client_pause_per_purpose[NUM_PAUSE_PURPOSES]; - char neterr[ANET_ERR_LEN]; /* Error buffer for anet.c */ - dict *migrate_cached_sockets; /* MIGRATE cached sockets */ - _Atomic uint64_t next_client_id; /* Next client unique ID. Incremental. */ - int protected_mode; /* Don't accept external connections. */ - int io_threads_num; /* Number of IO threads to use. */ - int active_io_threads_num; /* Current number of active IO threads, includes main thread. */ - int events_per_io_thread; /* Number of events on the event loop to trigger IO threads activation. 
*/ - int prefetch_batch_max_size; /* Maximum number of keys to prefetch in a single batch */ - long long events_processed_while_blocked; /* processEventsWhileBlocked() */ - int enable_protected_configs; /* Enable the modification of protected configs, see PROTECTED_ACTION_ALLOWED_* */ - int enable_debug_cmd; /* Enable DEBUG commands, see PROTECTED_ACTION_ALLOWED_* */ - int enable_module_cmd; /* Enable MODULE commands, see PROTECTED_ACTION_ALLOWED_* */ - int enable_debug_assert; /* Enable debug asserts */ - int debug_client_enforce_reply_list; /* Force client to always use the reply list */ + char neterr[ANET_ERR_LEN]; /* Error buffer for anet.c */ + dict *migrate_cached_sockets; /* MIGRATE cached sockets */ + _Atomic uint64_t next_client_id; /* Next client unique ID. Incremental. */ + int protected_mode; /* Don't accept external connections. */ + int io_threads_num; /* Number of IO threads to use. */ + int active_io_threads_num; /* Current number of active IO threads, includes main thread. */ + int events_per_io_thread; /* Number of events on the event loop to trigger IO threads activation. */ + int io_threads_do_commands_offloading; /* If enabled, commands can be offloaded to IO threads. */ + int io_threads_do_commands_offloading_with_modules; /* If enabled, commands can be offloaded to IO threads even when modules are loaded. 
*/ + int prefetch_batch_max_size; /* Maximum number of keys to prefetch in a single batch */ + long long events_processed_while_blocked; /* processEventsWhileBlocked() */ + int enable_protected_configs; /* Enable the modification of protected configs, see PROTECTED_ACTION_ALLOWED_* */ + int enable_debug_cmd; /* Enable DEBUG commands, see PROTECTED_ACTION_ALLOWED_* */ + int enable_module_cmd; /* Enable MODULE commands, see PROTECTED_ACTION_ALLOWED_* */ + int enable_debug_assert; /* Enable debug asserts */ + int debug_client_enforce_reply_list; /* Force client to always use the reply list */ /* Reply construction copy avoidance */ int min_io_threads_copy_avoid; /* Minimum number of IO threads for copy avoidance in reply construction */ int min_string_size_copy_avoid_threaded; /* Minimum bulk string size for copy avoidance in reply construction when IO threads enabled */ @@ -1725,8 +1748,8 @@ struct valkeyServer { long long stat_evictedscripts; /* Number of evicted lua scripts. */ long long stat_total_eviction_exceeded_time; /* Total time over the memory limit, unit us */ monotime stat_last_eviction_exceeded_time; /* Timestamp of current eviction start, unit us */ - long long stat_keyspace_hits; /* Number of successful lookups of keys */ - long long stat_keyspace_misses; /* Number of failed lookups of keys */ + _Atomic long long stat_keyspace_hits; /* Number of successful lookups of keys */ + _Atomic long long stat_keyspace_misses; /* Number of failed lookups of keys */ long long stat_active_defrag_hits; /* number of allocations moved */ long long stat_active_defrag_misses; /* number of allocations scanned but not moved */ long long stat_active_defrag_key_hits; /* number of keys with moved allocations */ @@ -1764,20 +1787,27 @@ struct valkeyServer { size_t stat_clients_type_memory[CLIENT_TYPE_COUNT]; /* Mem usage by type */ size_t stat_cluster_links_memory; /* Mem usage by cluster links */ long long - stat_unexpected_error_replies; /* Number of unexpected 
(aof-loading, replica to primary, etc.) error replies */ - long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ - long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. */ - long long stat_io_reads_processed; /* Number of read events processed by IO threads */ - long long stat_io_writes_processed; /* Number of write events processed by IO threads */ - long long stat_io_freed_objects; /* Number of objects freed by IO threads */ - long long stat_io_accept_offloaded; /* Number of offloaded accepts */ - long long stat_poll_processed_by_io_threads; /* Total number of poll jobs processed by IO */ - long long stat_total_reads_processed; /* Total number of read events processed */ - long long stat_total_writes_processed; /* Total number of write events processed */ - long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */ - long long stat_client_outbuf_limit_disconnections; /* Total number of clients reached output buf length limit */ - long long stat_total_prefetch_entries; /* Total number of prefetched dict entries */ - long long stat_total_prefetch_batches; /* Total number of prefetched batches */ + stat_unexpected_error_replies; /* Number of unexpected (aof-loading, replica to primary, etc.) error replies */ + long long stat_total_error_replies; /* Total number of issued error replies ( command + rejected errors ) */ + long long stat_dump_payload_sanitizations; /* Number deep dump payloads integrity validations. 
*/ + long long stat_io_reads_processed; /* Number of read events processed by IO threads */ + long long stat_io_reads_pending; /* Number of read events pending in IO threads */ + long long stat_io_writes_processed; /* Number of write events processed by IO threads */ + long long stat_io_writes_pending; /* Number of write events pending in IO threads */ + long long stat_io_commands_processed; /* Number of commands processed by IO threads */ + long long stat_io_commands_pending; /* Number of commands pending in IO threads */ + long long stat_io_freed_objects; /* Number of objects freed by IO threads */ + long long stat_io_accept_offloaded; /* Number of offloaded accepts */ + long long stat_poll_processed_by_io_threads; /* Total number of poll jobs processed by IO */ + long long stat_io_threaded_clients_blocked_on_slot; /* Number of clients currently blocked on slots */ + long long stat_io_threaded_clients_blocked_total; /* Total number of clients that were blocked on slots */ + long long stat_total_reads_processed; /* Total number of read events processed */ + long long stat_total_writes_processed; /* Total number of write events processed */ + long long stat_client_qbuf_limit_disconnections; /* Total number of clients reached query buf length limit */ + long long stat_client_outbuf_limit_disconnections; /* Total number of clients reached output buf length limit */ + long long stat_total_prefetch_entries; /* Total number of prefetched dict entries */ + long long stat_total_prefetch_batches; /* Total number of prefetched batches */ + long long stat_delayed_jobs_processed; /* Total number of delayed jobs sent to main thread from worker threads */ /* The following two are used to track instantaneous metrics, like * number of operations per second, network traffic.
*/ struct { @@ -2446,6 +2476,11 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * CMD_TOUCHES_ARBITRARY_KEYS: The command may touch (and cause lazy-expire) * arbitrary key (i.e not provided in argv) * + * CMD_CAN_BE_OFFLOADED: The command can be safely offloaded to worker IO threads. + * Currently only simple read commands that don't have side effects + * are eligible for offloading. Commands with this flag should be + * idempotent and not modify any server state. + * * The following additional flags are only used in order to put commands * in a specific ACL category. Commands can have multiple ACL categories. * See valkey.conf for the exact meaning of each. @@ -2674,13 +2709,14 @@ void dictVanillaFree(void *val); #define READ_FLAGS_BAD_ARITY (1 << 18) #define READ_FLAGS_NO_KEYS (1 << 19) #define READ_FLAGS_CROSSSLOT (1 << 20) +#define READ_FLAGS_COMMAND_PROCESSED (1 << 17) /* Write flags for various write errors and states */ #define WRITE_FLAGS_WRITE_ERROR (1 << 0) #define WRITE_FLAGS_IS_REPLICA (1 << 1) client *createClient(connection *conn); -void freeClient(client *c); +int freeClient(client *c); void freeClientAsync(client *c); void logInvalidUseAndFreeClientAsync(client *c, const char *fmt, ...); void beforeNextClient(client *c); @@ -2800,7 +2836,7 @@ void linkClient(client *c); void protectClient(client *c); void unprotectClient(client *c); void initSharedQueryBuf(void); -void freeSharedQueryBuf(void *dummy); +void freeSharedQueryBuf(void); client *lookupClientByID(uint64_t id); int authRequired(client *c); void clientSetUser(client *c, user *u, int authenticated); @@ -2810,13 +2846,15 @@ void deleteCachedResponseClient(client *recording_client); void waitForClientIO(client *c); void ioThreadReadQueryFromClient(void *data); void ioThreadWriteToClient(void *data); +void ioThreadProcessCommand(void *data); int canParseCommand(client *c); -int processIOThreadsReadDone(void); -int processIOThreadsWriteDone(void); +void 
processClientIOReadsDone(client *c); +void processClientIOWriteDone(client *c, int allow_async_writes); +void commandProcessed(client *c); +long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData); void releaseReplyReferences(client *c); void resetLastWrittenBuf(client *c); - /* logreqres.c - logging of requests and responses */ void reqresReset(client *c, int free_buf); void reqresSaveClientReplyOffset(client *c); diff --git a/src/socket.c b/src/socket.c index d8dbde4e2b..2cf2db90e9 100644 --- a/src/socket.c +++ b/src/socket.c @@ -251,7 +251,7 @@ static int connSocketSetReadHandler(connection *conn, ConnectionCallbackFunc fun conn->read_handler = func; if (!conn->read_handler) aeDeleteFileEvent(server.el, conn->fd, AE_READABLE); - else if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE, conn->type->ae_handler, conn) == AE_ERR) + else if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE | AE_PREFETCH, conn->type->ae_handler, conn) == AE_ERR) return C_ERR; return C_OK; } diff --git a/src/tls.c b/src/tls.c index e5af01b2fa..edaa4d9d74 100644 --- a/src/tls.c +++ b/src/tls.c @@ -591,7 +591,7 @@ static void registerSSLEvent(tls_connection *conn) { if (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ) { if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); - if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); + if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE | AE_PREFETCH, tlsEventHandler, conn); } else if (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE) { if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); @@ -641,7 +641,7 @@ static void updateSSLEvent(tls_connection *conn) { int need_write = conn->c.write_handler || (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE); if (need_read && !(mask & AE_READABLE)) - 
aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); + aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE | AE_PREFETCH, tlsEventHandler, conn); if (!need_read && (mask & AE_READABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (need_write && !(mask & AE_WRITABLE)) diff --git a/src/tracking.c b/src/tracking.c index b07432d99e..e8cecb223b 100644 --- a/src/tracking.c +++ b/src/tracking.c @@ -398,16 +398,17 @@ void trackingInvalidateKey(client *c, robj *keyobj, int bcast) { continue; } + client *current_client = getCurrentClient(); /* If the client enabled the NOLOOP mode, don't send notifications * about keys changed by the client itself. */ - if (target->flag.tracking_noloop && target == server.current_client) { + if (target->flag.tracking_noloop && target == current_client) { continue; } /* If target is current client and it's executing a command, we need schedule key invalidation. * As the invalidation messages may be interleaved with command * response and should after command response. 
*/ - if (target == server.current_client && (server.current_client->flag.executing_command)) { + if (target == current_client && (current_client->flag.executing_command)) { incrRefCount(keyobj); listAddNodeTail(server.tracking_pending_keys, keyobj); } else { @@ -438,12 +439,13 @@ void trackingHandlePendingKeyInvalidations(void) { robj *key = listNodeValue(ln); /* current_client maybe freed, so we need to send invalidation * message only when current_client is still alive */ - if (server.current_client != NULL) { + client *current_client = getCurrentClient(); + if (current_client != NULL) { if (key != NULL) { - sendTrackingMessage(server.current_client, (char *)key->ptr, sdslen(key->ptr), 0); + sendTrackingMessage(current_client, (char *)key->ptr, sdslen(key->ptr), 0); } else { - sendTrackingMessage(server.current_client, shared.null[server.current_client->resp]->ptr, - sdslen(shared.null[server.current_client->resp]->ptr), 1); + sendTrackingMessage(current_client, shared.null[current_client->resp]->ptr, + sdslen(shared.null[current_client->resp]->ptr), 1); } } if (key != NULL) decrRefCount(key); @@ -475,7 +477,7 @@ void trackingInvalidateKeysOnFlush(int async) { while ((ln = listNext(&li)) != NULL) { client *c = listNodeValue(ln); if (c->flag.tracking) { - if (c == server.current_client) { + if (isCurrentClient(c)) { /* We use a special NULL to indicate that we should send null */ listAddNodeTail(server.tracking_pending_keys, NULL); } else { diff --git a/tests/unit/io-threads.tcl b/tests/unit/io-threads.tcl new file mode 100644 index 0000000000..3660ecc65b --- /dev/null +++ b/tests/unit/io-threads.tcl @@ -0,0 +1,360 @@ +source tests/support/cli.tcl + +if {$::io_threads} { + + start_server {config "minimal.conf" tags {"external:skip"} overrides {enable-debug-command {yes}}} { + + set server_pid [s process_id] + + test {prefetch works as expected when killing a client from the middle of prefetch commands batch} { + # Create 16 (prefetch batch size) +1 clients + for 
{set i 0} {$i < 16} {incr i} { + set rd$i [valkey_deferring_client] + } + + # Set a key that will later be prefetched + r set a 0 + + # Get the client ID of rd4 + $rd4 client id + set rd4_id [$rd4 read] + + # Create a batch of commands by suspending the server for a while + # before responding to the first command + pause_process $server_pid + + # The first client will kill the fourth client + $rd0 client kill id $rd4_id + + # Send set commands for all clients except the first + for {set i 1} {$i < 16} {incr i} { + [set rd$i] set a $i + [set rd$i] flush + } + + # Resume the server + resume_process $server_pid + + # Read the results + assert_equal {1} [$rd0 read] + catch {$rd4 read} err + assert_match {I/O error reading reply} $err + + # verify the prefetch stats are as expected + set info [r info stats] + set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries] + assert_range $prefetch_entries 2 15; # With slower machines, the number of prefetch entries can be lower + set prefetch_batches [getInfoProperty $info io_threaded_total_prefetch_batches] + assert_range $prefetch_batches 1 7; # With slower machines, the number of batches can be higher + + # Verify the final state + $rd15 get a + assert_equal {OK} [$rd15 read] + assert_equal {15} [$rd15 read] + } + + test {prefetch works as expected when changing the batch size while executing the commands batch} { + # Create 16 (default prefetch batch size) clients + for {set i 0} {$i < 16} {incr i} { + set rd$i [valkey_deferring_client] + } + + # Create a batch of commands by suspending the server for a while + # before responding to the first command + pause_process $server_pid + + # Send set commands for all clients; the 5th client first changes the prefetch batch size + for {set i 0} {$i < 16} {incr i} { + if {$i == 4} { + [set rd$i] config set prefetch-batch-max-size 1 + } + [set rd$i] set a $i + [set rd$i] flush + } + # Resume the server + resume_process $server_pid + # Read the results + for {set
i 0} {$i < 16} {incr i} { + assert_equal {OK} [[set rd$i] read] + } + + # assert the configured prefetch batch size was changed + assert {[r config get prefetch-batch-max-size] eq "prefetch-batch-max-size 1"} + } + + test {no prefetch when the batch size is set to 0} { + # set the batch size to 0 + r config set prefetch-batch-max-size 0 + # save the current value of prefetch entries + set info [r info stats] + set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries] + + # Create 16 (default prefetch batch size) clients + for {set i 0} {$i < 16} {incr i} { + set rd$i [valkey_deferring_client] + } + + # Create a batch of commands by suspending the server for a while + # before responding to the first command + pause_process $server_pid + + # Send set commands for all clients + for {set i 0} {$i < 16} {incr i} { + [set rd$i] set a $i + [set rd$i] flush + } + + # Resume the server + resume_process $server_pid + + # Read the results + for {set i 0} {$i < 16} {incr i} { + assert_equal {OK} [[set rd$i] read] + } + + # assert the prefetch entries did not change + set info [r info stats] + set new_prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries] + assert_equal $prefetch_entries $new_prefetch_entries + } + } + + start_server {} { + start_server {} { + test {replicas writes are offloaded to IO threads} { + set primary [srv -1 client] + set primary_host [srv -1 host] + set primary_port [srv -1 port] + + set replica [srv 0 client] + $replica replicaof $primary_host $primary_port + + wait_for_condition 500 100 { + [s 0 master_link_status] eq {up} + } else { + fail "Replication not started." 
+ } + + # get the current io_threaded_writes_processed + set info [$primary info stats] + set io_threaded_writes_processed [getInfoProperty $info io_threaded_writes_processed] + + # Send a write command to the primary + $primary set a 1 + + # Wait for the write to be propagated to the replica + wait_for_condition 50 100 { + [$replica get a] eq {1} + } else { + fail "Replication not propagated." + } + + # Get the new io_threaded_writes_processed + set info [$primary info stats] + set new_io_threaded_writes_processed [getInfoProperty $info io_threaded_writes_processed] + # Assert new is old + 3, 3 for the write to the info-client, set-client and to the replica. + assert {$new_io_threaded_writes_processed >= $io_threaded_writes_processed + 3} ; + + # Verify the write was propagated to the replica + assert_equal {1} [$replica get a] + } + } +} + + + ### Commands Offloading tests ### + + proc get_offloaded_commands {r} { + # Get the current IO thread stats + set info [$r info stats] + return [getInfoProperty $info io_threaded_commands_processed] + } + + start_cluster 1 0 {config "minimal.conf" tags {"external:skip"} overrides {enable-debug-command {yes}}} { + wait_for_cluster_state ok + set server_pid [s process_id] + + # Skip if non io-threads mode - as it is relevant only for io-threads mode + test {Pipeline commands are not offloaded with IO threads} { + # This test verifies that pipeline commands are not offloaded to IO threads + + set initial_offloaded_commands [get_offloaded_commands r] + # Create a client and send pipeline commands + set rd [valkey_deferring_client] + + # Send a batch of commands in pipeline mode + $rd write [format_command GET nonexistent_key] + $rd write [format_command SET test value1] + $rd write [format_command GET test] + $rd write [format_command INCR counter] + $rd write [format_command GET counter] + $rd flush + + # Read all responses + assert_equal {} [$rd read] "GET nonexistent_key should return empty" + assert_equal {OK} [$rd read] + 
assert_equal {value1} [$rd read] + assert_equal {1} [$rd read] + assert_equal {1} [$rd read] + + # Get the updated IO thread stats + set updated_offloaded_commands [get_offloaded_commands r] + # The pipeline commands should not have been offloaded to IO threads only the last GET command + # So the number of commands processed by IO threads should be increment by 1 + assert_equal [expr {$initial_offloaded_commands + 1}] $updated_offloaded_commands; + + # Verify that non-pipelined commands are offloaded + assert_equal {OK} [r SET regular_command value2] + assert_equal {value2} [r GET regular_command] + + # The non-pipelined command should have been offloaded + # So the number of commands processed by IO threads should increase + set final_offloaded_commands [get_offloaded_commands r] + assert {$final_offloaded_commands > $updated_offloaded_commands} + } + + test {Offloaded command returns wrong type error for incorrect key type} { + set errs_cnt 0 + set updated_errs_cnt 0 + + # Get initial stats + set info_stats [r info] + regexp {count=(\d*)} [getInfoProperty $info_stats errorstat_WRONGTYPE] _ errs_cnt + set initial_posted_jobs [getInfoProperty $info_stats io_threaded_postponed_jobs_to_mainthread] + set initial_offloaded_commands [get_offloaded_commands r] + + # set a HASH with a key and value + r HSET key_1 field_a val_a + + # Verify we get error response for a GET command on a HASH type key + assert_error {*WRONGTYPE*} {r GET key_1} + + # Get updated stats + set updated_info_stats [r INFO] + set updated_posted_jobs [getInfoProperty $updated_info_stats io_threaded_postponed_jobs_to_mainthread] + regexp {count=(\d*)} [getInfoProperty $updated_info_stats errorstat_WRONGTYPE] _ updated_errs_cnt + set updated_offloaded_commands [get_offloaded_commands r] + + # Check that the get command was offloaded to io thread and error stat was updated + assert_equal 1 [expr {$updated_posted_jobs - $initial_posted_jobs}] + assert_equal 1 [expr {$updated_errs_cnt - $errs_cnt}] + 
assert_equal 1 [expr {$updated_offloaded_commands - $initial_offloaded_commands}] + } + + test {Read commands are offloaded to IO threads} { + # Set up some test data + r SET key1 value1 + r SET key2 value2 + r HSET hash_key field1 value1 field2 value2 + + # Get the initial IO thread stats + set initial_offloaded_commands [get_offloaded_commands r] + # Execute a series of read commands that should be offloaded + r GET key1 + r GET key2 + r HGET hash_key field1 + + # Verify that the commands were offloaded (processed count should increase by 3) + set updated_offloaded_commands [get_offloaded_commands r] + assert_equal [expr {$initial_offloaded_commands + 3}] $updated_offloaded_commands + } + + test {Write commands are not offloaded to IO threads} { + # Get the initial IO thread stats + set initial_offloaded_commands [get_offloaded_commands r] + + # Execute a series of write commands + r SET write_key1 value1 + r HSET write_hash field1 value1 + r INCR write_counter + + # Verify that the write commands were not offloaded + set updated_offloaded_commands [get_offloaded_commands r] + assert_equal $initial_offloaded_commands $updated_offloaded_commands + } + + test {Commands not marked to be offloaded are not offloaded} { + # Get the initial IO thread stats + set initial_offloaded_commands [get_offloaded_commands r] + + # Execute commands with side effects + r PUBLISH channel message + r CLIENT LIST + r INFO + + # Verify that these commands were not offloaded + set updated_offloaded_commands [get_offloaded_commands r] + assert_equal $initial_offloaded_commands $updated_offloaded_commands + } + + test {Command offloading can be disabled via configuration} { + # This test verifies that command offloading can be disabled + # Set up test data + r set config_test_key value + + # Get the initial IO thread stats + set initial_offloaded_commands [get_offloaded_commands r] + + # Execute a read command that would normally be offloaded + r get config_test_key + + # Get the stats 
after first command + set mid_offloaded_commands [get_offloaded_commands r] + + # Verify the command was offloaded + assert_equal [expr {$initial_offloaded_commands + 1}] $mid_offloaded_commands + + # Disable command offloading + r config set io-threads-do-commands-offloading no + + # Execute another read command + r get config_test_key + + # Get the final stats + set final_offloaded_commands [get_offloaded_commands r] + + # Verify the command was not offloaded after disabling + assert_equal $mid_offloaded_commands $final_offloaded_commands + + # Re-enable command offloading for other tests + r config set io-threads-do-commands-offloading yes + } + + test {Key expiry is postponed when read from main thread} { + # This test verifies that when a key with expiry is read from the IO thread, + # its expiry deletion is postponed to the main-thread to prevent race conditions with the main-thread + set updated_info_stats [r INFO] + set initial_postponed [getInfoProperty $updated_info_stats io_threaded_postponed_jobs_to_mainthread] + + # Set a key with a short expiry time (2 seconds) + r set key "val" + r expire key 2 + + # Verify the key exists + assert_equal "val" [r get key] + + # Wait for the key to be expired + after 2000 + + # Read the key (this should trigger expiry postponement) + # The answer should be empty + assert_equal {} [r get key] + + # Check if postponed jobs counter increased + set updated_info_stats [r INFO] + set updated_postponed [getInfoProperty $updated_info_stats io_threaded_postponed_jobs_to_mainthread] + # The postponed jobs counter should have increased if expiry was postponed + assert {$updated_postponed > $initial_postponed} + } + + test "test io-threads are runtime modifiable" { + # Randomly set the number of threads between 1 and 5 + for {set i 0} {$i < 100} {incr i} { + set random_num [expr {int(rand() * 5) + 1}] + r config set io-threads $random_num + set thread_num [lindex [r config get io-threads] 1] + assert_equal $random_num $thread_num 
+ } + } + } +} diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index a03fbaf639..edd2d2325c 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -55,6 +55,11 @@ start_server {tags {"maxmemory external:skip"}} { } foreach {client_eviction} {false true} { + # Skip client eviction test when IO threads are enabled because client freeing + # may be deferred with IO threads, making the eviction behavior unpredictable + if {$client_eviction && $::io_threads} { + continue + } set clients {} test "eviction due to output buffers of many MGET clients, client eviction: $client_eviction" { init_test $client_eviction diff --git a/tests/unit/moduleapi/misc.tcl b/tests/unit/moduleapi/misc.tcl index 9e0ea72456..9e261a39e8 100644 --- a/tests/unit/moduleapi/misc.tcl +++ b/tests/unit/moduleapi/misc.tcl @@ -560,3 +560,40 @@ if {[string match {*jemalloc*} [s mem_allocator]]} { assert_equal {OK} [r module unload misc] } } + +start_cluster 1 0 {tags {"modules external:skip"}} { + set testmodule [file normalize tests/modules/misc.so] + test {Commands are not offloaded by default when modules are loaded} { + # Skip if non io-threads mode - as it is relevant only for io-threads mode + if {[r config get io-threads] ne "io-threads 1"} { + r module load $testmodule + + # Get the initial IO thread stats + set initial_info [r info stats] + set initial_processed [getInfoProperty $initial_info io_threaded_commands_processed] + + # Send a GET command that normally would be offloaded + r GET key + + # Get the updated IO thread stats + set updated_info [r info stats] + set updated_processed [getInfoProperty $updated_info io_threaded_commands_processed] + + # Verify the GET command was not offloaded (processed count should be the same) + assert_equal $initial_processed $updated_processed + + # Now override the default behavior to allow offloading even with modules + r config set io-threads-do-command-offloading-with-modules yes + + # Send another GET command + r GET 
key + + # Get the final IO thread stats + set final_info [r info stats] + set final_processed [getInfoProperty $final_info io_threaded_commands_processed] + + # Verify the GET command was offloaded (processed count should increase) + assert {$final_processed > $updated_processed} + } + } +} diff --git a/tests/unit/networking.tcl b/tests/unit/networking.tcl index d0038e9a03..24f8caae9c 100644 --- a/tests/unit/networking.tcl +++ b/tests/unit/networking.tcl @@ -170,161 +170,3 @@ start_server {config "minimal.conf" tags {"external:skip"}} { } } } - -start_server {config "minimal.conf" tags {"external:skip"} overrides {enable-debug-command {yes}}} { - set server_pid [s process_id] - # Skip if non io-threads mode - as it is relevant only for io-threads mode - if {[r config get io-threads] ne "io-threads 1"} { - test {prefetch works as expected when killing a client from the middle of prefetch commands batch} { - # Create 16 (prefetch batch size) +1 clients - for {set i 0} {$i < 16} {incr i} { - set rd$i [valkey_deferring_client] - } - - # set a key that will be later be prefetch - r set a 0 - - # Get the client ID of rd4 - $rd4 client id - set rd4_id [$rd4 read] - - # Create a batch of commands by suspending the server for a while - # before responding to the first command - pause_process $server_pid - - # The first client will kill the fourth client - $rd0 client kill id $rd4_id - - # Send set commands for all clients except the first - for {set i 1} {$i < 16} {incr i} { - [set rd$i] set a $i - [set rd$i] flush - } - - # Resume the server - resume_process $server_pid - - # Read the results - assert_equal {1} [$rd0 read] - catch {$rd4 read} err - assert_match {I/O error reading reply} $err - - # verify the prefetch stats are as expected - set info [r info stats] - set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries] - assert_range $prefetch_entries 2 15; # With slower machines, the number of prefetch entries can be lower - set prefetch_batches 
[getInfoProperty $info io_threaded_total_prefetch_batches] - assert_range $prefetch_batches 1 7; # With slower machines, the number of batches can be higher - - # Verify the final state - $rd15 get a - assert_equal {OK} [$rd15 read] - assert_equal {15} [$rd15 read] - } - - test {prefetch works as expected when changing the batch size while executing the commands batch} { - # Create 16 (default prefetch batch size) clients - for {set i 0} {$i < 16} {incr i} { - set rd$i [valkey_deferring_client] - } - - # Create a batch of commands by suspending the server for a while - # before responding to the first command - pause_process $server_pid - - # Send set commands for all clients the 5th client will change the prefetch batch size - for {set i 0} {$i < 16} {incr i} { - if {$i == 4} { - [set rd$i] config set prefetch-batch-max-size 1 - } - [set rd$i] set a $i - [set rd$i] flush - } - # Resume the server - resume_process $server_pid - # Read the results - for {set i 0} {$i < 16} {incr i} { - assert_equal {OK} [[set rd$i] read] - } - - # assert the configured prefetch batch size was changed - assert {[r config get prefetch-batch-max-size] eq "prefetch-batch-max-size 1"} - } - - test {no prefetch when the batch size is set to 0} { - # set the batch size to 0 - r config set prefetch-batch-max-size 0 - # save the current value of prefetch entries - set info [r info stats] - set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries] - - # Create 16 (default prefetch batch size) clients - for {set i 0} {$i < 16} {incr i} { - set rd$i [valkey_deferring_client] - } - - # Create a batch of commands by suspending the server for a while - # before responding to the first command - pause_process $server_pid - - # Send set commands for all clients - for {set i 0} {$i < 16} {incr i} { - [set rd$i] set a $i - [set rd$i] flush - } - - # Resume the server - resume_process $server_pid - - # Read the results - for {set i 0} {$i < 16} {incr i} { - assert_equal {OK} [[set 
rd$i] read] - } - - # assert the prefetch entries did not change - set info [r info stats] - set new_prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries] - assert_equal $prefetch_entries $new_prefetch_entries - } - - start_server {} { - test {replicas writes are offloaded to IO threads} { - set primary [srv -1 client] - set primary_host [srv -1 host] - set primary_port [srv -1 port] - - set replica [srv 0 client] - $replica replicaof $primary_host $primary_port - - wait_for_condition 500 100 { - [s 0 master_link_status] eq {up} - } else { - fail "Replication not started." - } - - # get the current io_threaded_writes_processed - set info [$primary info stats] - set io_threaded_writes_processed [getInfoProperty $info io_threaded_writes_processed] - - # Send a write command to the primary - $primary set a 1 - - # Wait for the write to be propagated to the replica - wait_for_condition 50 100 { - [$replica get a] eq {1} - } else { - fail "Replication not propagated." - } - - # Get the new io_threaded_writes_processed - set info [$primary info stats] - set new_io_threaded_writes_processed [getInfoProperty $info io_threaded_writes_processed] - # Assert new is old + 3, 3 for the write to the info-client, set-client and to the replica. 
- assert {$new_io_threaded_writes_processed >= $io_threaded_writes_processed + 3} ; - - # Verify the write was propagated to the replica - assert_equal {1} [$replica get a] - } - } - } -} diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index b86f3690a8..66382d9801 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -579,17 +579,7 @@ start_server {tags {"other external:skip"}} { } } -start_server {tags {"other external:skip"}} { - test "test io-threads are runtime modifiable" { - # Randomly set the number of threads between 1 and 5 - for {set i 0} {$i < 100} {incr i} { - set random_num [expr {int(rand() * 5) + 1}] - r config set io-threads $random_num - set thread_num [lindex [r config get io-threads] 1] - assert_equal $random_num $thread_num - } - } -} + set tempFileName [file join [pwd] [pid]] if {$::verbose} { From 800dbc451f360f46ce4e61524ff84c9605b9eccb Mon Sep 17 00:00:00 2001 From: Uri Yagelnik Date: Tue, 17 Jun 2025 15:58:06 +0000 Subject: [PATCH 2/5] Fix failed tests, addressed PR comments Signed-off-by: Uri Yagelnik --- .github/workflows/ci.yml | 14 ++++++ src/ae.c | 9 +++- src/ae.h | 21 +++++---- src/commands/get.json | 2 +- src/db.c | 2 +- src/io_threads.c | 93 +++++++++++++++++++++----------------- src/io_threads.h | 6 ++- src/kvstore.c | 2 +- src/memory_prefetch.c | 12 ++--- src/networking.c | 68 +++++----------------------- src/server.c | 55 ++++++++++++++++++++-- src/server.h | 5 +- src/socket.c | 2 +- src/tls.c | 4 +- src/unit/test_networking.c | 6 +-- 15 files changed, 167 insertions(+), 134 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2a2275866f..276aa6445e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,6 +80,20 @@ jobs: - name: unit tests run: ./src/valkey-unit-tests + test-ubuntu-io-threads-sanitizer: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: make + # build with TLS module just 
for compilation coverage + run: make -j4 all-with-unit-tests SANITIZER=address SERVER_CFLAGS='-Werror' BUILD_TLS=module + - name: testprep + run: sudo apt-get install tcl8.6 tclx -y + - name: test + run: ./runtest --io-threads --verbose --tags -slow --dump-logs + - name: module api test + run: CFLAGS='-Werror' ./runtest-moduleapi --io-threads --verbose --dump-logs + test-rdma: runs-on: ubuntu-latest steps: diff --git a/src/ae.c b/src/ae.c index 0e56a0defb..eda5cc876f 100644 --- a/src/ae.c +++ b/src/ae.c @@ -91,6 +91,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { eventLoop->beforesleep = NULL; eventLoop->aftersleep = NULL; eventLoop->custompoll = NULL; + eventLoop->prefetch = NULL; eventLoop->flags = 0; eventLoop->epoll_batch_size = 0; /* Default to 0, meaning use setsize */ /* Initialize the eventloop mutex with PTHREAD_MUTEX_ERRORCHECK type */ @@ -218,8 +219,8 @@ void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask) { * is removed. */ if (mask & AE_WRITABLE) mask |= AE_BARRIER; - /* We want to always remove AE_PREFETCH if set when AE_READABLE is removed. */ - if (mask & AE_READABLE) mask |= AE_PREFETCH; + /* We want to always remove AE_PRE_READABLE_HOOK if set when AE_READABLE is removed. */ + if (mask & AE_READABLE) mask |= AE_PRE_READABLE_HOOK; /* Only remove attached events */ mask = mask & fe->mask; @@ -578,3 +579,7 @@ void aeSetPollProtect(aeEventLoop *eventLoop, int protect) { eventLoop->flags &= ~AE_PROTECT_POLL; } } + +void aeSetEpollBatchSize(aeEventLoop *eventLoop, int batchSize) { + eventLoop->epoll_batch_size = batchSize; +} diff --git a/src/ae.h b/src/ae.h index fc4aeb33c1..e212576e8d 100644 --- a/src/ae.h +++ b/src/ae.h @@ -39,15 +39,15 @@ #define AE_OK 0 #define AE_ERR -1 -#define AE_NONE 0 /* No events registered. */ -#define AE_READABLE 1 /* Fire when descriptor is readable. */ -#define AE_WRITABLE 2 /* Fire when descriptor is writable. 
*/ -#define AE_BARRIER 4 /* With WRITABLE, never fire the event if the \ - READABLE event already fired in the same event \ - loop iteration. Useful when you want to persist \ - things to disk before sending replies, and want \ - to do that in a group fashion. */ -#define AE_PREFETCH 8 /* With PREFETCH, call prefetch callback for the events */ +#define AE_NONE 0 /* No events registered. */ +#define AE_READABLE 1 /* Fire when descriptor is readable. */ +#define AE_WRITABLE 2 /* Fire when descriptor is writable. */ +#define AE_BARRIER 4 /* With WRITABLE, never fire the event if the \ + READABLE event already fired in the same event \ + loop iteration. Useful when you want to persist \ + things to disk before sending replies, and want \ + to do that in a group fashion. */ +#define AE_PRE_READABLE_HOOK 8 /* Call pre-process-read callback for the events */ #define AE_FILE_EVENTS (1 << 0) #define AE_TIME_EVENTS (1 << 1) @@ -118,7 +118,7 @@ typedef struct aeEventLoop { aePrefetchProc *prefetch; pthread_mutex_t poll_mutex; int flags; - int epoll_batch_size; /* Optional batch size for epoll_wait */ + int epoll_batch_size; /* Maximum events to process per epoll_wait call (0 = use system default batch size) */ } aeEventLoop; /* Prototypes */ @@ -148,5 +148,6 @@ int aePoll(aeEventLoop *eventLoop, struct timeval *tvp); int aeGetSetSize(aeEventLoop *eventLoop); int aeResizeSetSize(aeEventLoop *eventLoop, int setsize); void aeSetDontWait(aeEventLoop *eventLoop, int noWait); +void aeSetEpollBatchSize(aeEventLoop *eventLoop, int batchSize); #endif diff --git a/src/commands/get.json b/src/commands/get.json index f0153f9505..c6861f7914 100644 --- a/src/commands/get.json +++ b/src/commands/get.json @@ -9,7 +9,7 @@ "command_flags": [ "READONLY", "FAST", - "CAN_BE_OFFLOADED" + "CAN_BE_OFFLOADED" ], "acl_categories": [ "STRING" diff --git a/src/db.c b/src/db.c index 8a6b6d53a9..d60a1f4347 100644 --- a/src/db.c +++ b/src/db.c @@ -2012,7 +2012,7 @@ static keyStatus 
expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, if (!inMainThread()) { postpone_expired_key_ctx ctx = {.dict_index = dict_index, .db = db, .key = key}; if (!static_key) incrRefCount(key); - threadAddDelayedJob(dict_index, handlePostponeExpiredKey, sizeof(ctx), &ctx); + threadAdddeferredJob(dict_index, handlePostponeExpiredKey, sizeof(ctx), &ctx); return KEY_EXPIRED; } diff --git a/src/io_threads.c b/src/io_threads.c index fdeee20c95..9fd3a9c0ff 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -68,7 +68,7 @@ static IoToMTQueue *IoToMTQueueCreate(size_t capacity) { * 1 - If the value was successfully added to the queue * 0 - If the queue was full and the value couldn't be added */ -static int IoToMTQueueProduce(uint64_t value, uint64_t counter) { +static int IoToMTQueueProduce(uintptr_t value, uintptr_t counter) { IoToMTQueue *q = io_to_mt_queue; int first_try = counter == 0; /* Get the next producer slot if no slot is given */ @@ -136,7 +136,7 @@ static int IoToMTQueueConsumeBatch(int max_items, uint64_t *values) { /* If we consumed any items, update the producer limit */ if (consumed_count > 0) { - /* Release so that the threads see the NULL assingments */ + /* Release so that the threads see the NULL assignments */ atomic_store_explicit(&q->producer_limit, q->producer_limit + consumed_count, memory_order_release); /* Acqiure to get the latest thread changes */ atomic_thread_fence(memory_order_acquire); @@ -313,14 +313,14 @@ typedef struct deferredQueue { deferredQueue deferredCmdExclusive = {0}; deferredQueue slot_use_info[16384] = {0}; -typedef struct delayedJob { +typedef struct deferredJob { job_handler handler; int slot; char data[]; -} delayedJob; +} deferredJob; -/* Global thread-local storage for delayed jobs */ -static __thread list *thread_delayed_jobs = NULL; +/* Global thread-local storage for deferred jobs */ +static __thread list *thread_deferred_jobs = NULL; /* * executionContext @@ -377,8 +377,8 @@ static void 
dqIncr(deferredQueue *queue) { /* Create a new job with the given handler and data */ static listNode *createJobNode(int slot, job_handler handler, size_t data_size, void *data) { /* Allocate memory for job structure plus data using flexible array member */ - listNode *node = zmalloc(sizeof(listNode) + sizeof(delayedJob) + data_size); - delayedJob *job = (delayedJob *)(node + 1); + listNode *node = zmalloc(sizeof(listNode) + sizeof(deferredJob) + data_size); + deferredJob *job = (deferredJob *)(node + 1); job->slot = slot; job->handler = handler; if (data_size) { @@ -392,7 +392,7 @@ static listNode *createJobNode(int slot, job_handler handler, size_t data_size, /* Process a job immediately or add it to queue based on refcount */ static void processOrAddJob(deferredQueue *q, listNode *jobNode) { if (q->refcount == 0) { - delayedJob *job = listNodeValue(jobNode); + deferredJob *job = listNodeValue(jobNode); job->handler(job->data); zfree(jobNode); } else { @@ -436,7 +436,7 @@ static void processDeferredJobsList(deferredQueue *queue) { listRewind(queue->deferred_jobs, &li); while ((ln = listNext(&li))) { - delayedJob *job = listNodeValue(ln); + deferredJob *job = listNodeValue(ln); listUnlinkNode(queue->deferred_jobs, ln); job->handler(job->data); zfree(ln); @@ -516,7 +516,7 @@ static void dqDecr(int slot) { /* Add a client to the pending clients list of a deferred queue */ static void dqAddPendingClient(deferredQueue *queue, client *c) { /* Create the pending clients list if it doesn't exist */ - if (isClientListEmpty(queue)) { + if (queue->pending_clients == NULL) { queue->pending_clients = listCreate(); } @@ -550,39 +550,39 @@ static void dqRemoveClient(deferredQueue *queue, client *c) { server.stat_io_threaded_clients_blocked_on_slot--; } -static void delayedServerCron(void *data) { +static void deferServerCron(void *data) { UNUSED(data); long long interval = serverCron(server.el, 0, NULL); aeCreateTimeEvent(server.el, interval, serverCron, NULL, NULL); } -/* Add 
a delayed job to the thread-local job list */ -void threadAddDelayedJob(int slot, job_handler handler, size_t data_size, void *data) { +/* Add a deferred job to the thread-local job list */ +void threadAdddeferredJob(int slot, job_handler handler, size_t data_size, void *data) { /* Allocate memory for job structure plus data using flexible array member */ listNode *job_node = createJobNode(slot, handler, data_size, data); - listLinkNodeTail(thread_delayed_jobs, job_node); + listLinkNodeTail(thread_deferred_jobs, job_node); } -int isServerCronDelayed(void) { +int isServerCronDeferred(void) { if (!server.cluster_enabled || server.io_threads_num == 1) { return 0; } if (dqAvailable(&deferredCmdExclusive)) return 0; - listNode *job_node = createJobNode(-1, delayedServerCron, 0, NULL); + listNode *job_node = createJobNode(-1, deferServerCron, 0, NULL); listLinkNodeTail(deferredCmdExclusive.deferred_jobs, job_node); return 1; } -/* Dispatch delayed jobs based on their type */ +/* Dispatch deferred jobs based on their type */ static void dispatchThreadDeferredJobs(list *jobs_list) { listIter li; listNode *ln; listRewind(jobs_list, &li); while ((ln = listNext(&li))) { - delayedJob *job = listNodeValue(ln); + deferredJob *job = listNodeValue(ln); if (job->slot == -1) { job->handler(job->data); listDelNode(jobs_list, ln); @@ -590,7 +590,7 @@ static void dispatchThreadDeferredJobs(list *jobs_list) { listUnlinkNode(jobs_list, ln); processOrAddJob(getDeferredQueue(job->slot), ln); } - server.stat_delayed_jobs_processed++; + server.stat_deferred_jobs_processed++; } listRelease(jobs_list); @@ -768,9 +768,9 @@ void cleanupThreadResources(void *dummy) { freeSharedQueryBuf(); /* Free the delayed jobs list if it exists */ - if (thread_delayed_jobs) { - listRelease(thread_delayed_jobs); - thread_delayed_jobs = NULL; + if (thread_deferred_jobs) { + listRelease(thread_deferred_jobs); + thread_deferred_jobs = NULL; } /* Clean any other thread-specific resources here */ @@ -790,7 +790,7 @@ 
static void *IOThreadMain(void *myid) { initSharedQueryBuf(); setCurrentClient(NULL); setExecutingClient(NULL); - thread_delayed_jobs = listCreate(); + thread_deferred_jobs = listCreate(); pthread_cleanup_push(cleanupThreadResources, NULL); thread_id = (int)id; @@ -938,7 +938,6 @@ void initIOThreads(void) { prefetchCommandsBatchInit(); size_t io_to_mt_queue_size = (server.io_threads_num - 1) * DEFAULT_MPSC_QUEUE_SIZE_PER_THREAD; io_to_mt_queue = IoToMTQueueCreate(io_to_mt_queue_size); - thread_delayed_jobs = listCreate(); deferredCmdExclusive.pending_clients = listCreate(); deferredCmdExclusive.deferred_jobs = listCreate(); @@ -969,30 +968,40 @@ static int isCommandPostpone(client *c) { return C_ERR; } -int trySendProcessCommandToIOThreads(client *c) { +/* Check if a command can be offloaded to IO threads. + * Returns 1 if the command can be offloaded, 0 otherwise. */ +int canCommandBeOffloaded(struct serverCommand *cmd) { + if (!server.cluster_enabled) { + return 0; /* Avoid offloading commands in non cluster mode. */ + } + if (server.active_io_threads_num == 1) { - return C_ERR; /* No IO threads to offload to. */ + return 0; /* No IO threads to offload to. */ } if (!server.io_threads_do_commands_offloading) { - return C_ERR; /* Command offloading is disabled. */ + return 0; /* Command offloading is disabled. */ } /* Check if modules are loaded and module offloading is disabled */ if (moduleCount() > 0 && !server.io_threads_do_commands_offloading_with_modules) { - return C_ERR; /* Modules are loaded and module command offloading is disabled. */ + return 0; /* Modules are loaded and module command offloading is disabled. */ } - if (!(c->cmd->flags & CMD_CAN_BE_OFFLOADED)) { - return C_ERR; + if (!(cmd->flags & CMD_CAN_BE_OFFLOADED)) { + return 0; } - if (!server.cluster_enabled) { - return C_ERR; /* Avoid offloading commands in non cluster mode. 
*/ + if (server.notify_keyspace_events & NOTIFY_KEY_MISS) { + return 0; /* Avoid offloading commands when NOTIFY_KEY_MISS is enabled. */ } - if (server.notify_keyspace_events & NOTIFY_KEY_MISS) { - return C_ERR; /* Avoid offloading commands when NOTIFY_KEY_MISS is enabled. */ + return 1; +} + +int trySendProcessCommandToIOThreads(client *c) { + if (!canCommandBeOffloaded(c->cmd)) { + return C_ERR; } if (c->io_read_state != CLIENT_IDLE || c->io_command_state != CLIENT_IDLE || c->io_write_state != CLIENT_IDLE) { @@ -1029,7 +1038,7 @@ int trySendProcessCommandToIOThreads(client *c) { /* Setting current client to NULL to avoid accessing it after it was sent to IO */ setCurrentClient(NULL); setExecutingClient(NULL); - IOJobQueue_push(&io_jobs[tid], ioThreadProcessCommand, c); + IOJobQueue_push(&io_jobs[tid], ioThreadCallCommand, c); server.stat_io_commands_pending++; return C_OK; @@ -1346,8 +1355,8 @@ static inline jobResponseType getJobResponseType(uint64_t jobData) { return type; } -static inline void *getJobData(uint64_t jobData) { - return (void *)(jobData & CLIENT_PTR_MASK); +static inline void *getJobData(uintptr_t jobData) { + return (void *)(uintptr_t)(jobData & CLIENT_PTR_MASK); } /* Function to handle read jobs */ @@ -1388,10 +1397,10 @@ static void handleWriteJobs(client **write_jobs, int write_count) { } static void threadRespondJobList(void) { - if (listLength(thread_delayed_jobs) == 0) return; + if (listLength(thread_deferred_jobs) == 0) return; - IoToMTQueueProduce((uint64_t)thread_delayed_jobs | (uint64_t)R_JOBLIST, 0); - thread_delayed_jobs = listCreate(); + IoToMTQueueProduce((uintptr_t)thread_deferred_jobs | (uintptr_t)R_JOBLIST, 0); + thread_deferred_jobs = listCreate(); } void threadRespond(client *c, jobResponseType r) { @@ -1400,7 +1409,7 @@ void threadRespond(client *c, jobResponseType r) { threadRespondJobList(); } - IoToMTQueueProduce((uint64_t)c | (uint64_t)r, 0); + IoToMTQueueProduce((uintptr_t)c | (uintptr_t)r, 0); } static void 
processClientIOCommandDone(client *c) { diff --git a/src/io_threads.h b/src/io_threads.h index e6b7288466..197ee8732d 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -6,6 +6,7 @@ struct client; struct connection; struct serverObject; +struct serverCommand; typedef enum { R_READ = 0, @@ -29,12 +30,13 @@ void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); int trySendAcceptToIOThreads(struct connection *conn); int trySendProcessCommandToIOThreads(struct client *c); +int canCommandBeOffloaded(struct serverCommand *cmd); int processIOThreadsResponses(void); -void threadAddDelayedJob(int slot, job_handler handler, size_t len, void *data); +void threadAdddeferredJob(int slot, job_handler handler, size_t len, void *data); void threadRespond(struct client *c, jobResponseType r); int clientIOInProgress(struct client *c); int postponeClientCommand(struct client *c); -int isServerCronDelayed(void); +int isServerCronDeferred(void); void ioThreadsOnUnlinkClient(struct client *c); void pollIOThreadStats(void); int isCommandOffloadingRunning(void); diff --git a/src/kvstore.c b/src/kvstore.c index 4f1e33cced..b88f7c88ee 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -250,7 +250,7 @@ void kvstoreHashtableRehashingCompleted(hashtable *ht) { /* If not in main-thread postpone the update of kvs rehashing info to be done later by the main-thread -*/ if (!inMainThread()) { - threadAddDelayedJob(-1, kvstoreHashtableUpdateRehashingInfo, sizeof(ctx), &ctx); + threadAdddeferredJob(-1, kvstoreHashtableUpdateRehashingInfo, sizeof(ctx), &ctx); } else { kvstoreHashtableUpdateRehashingInfo(&ctx); } diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index 32ea8805c0..e78c00a16b 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -242,15 +242,13 @@ void processClientsCommandsBatch(void) { /* Check if the command is about to be offloaded to IO threads */ static int isCommandBeingOffloaded(client *c) { - if (!server.io_threads_do_commands_offloading) 
{ + /* Check basic offloading conditions */ + if (canCommandBeOffloaded(c->parsed_cmd) != C_OK) { return 0; } - if (!server.cluster_enabled) { - return 0; - } - - return (c->parsed_cmd->flags & CMD_CAN_BE_OFFLOADED) && (c->querybuf == NULL); + /* We avoid offloading commands when there is data pending in the query buffer */ + return (c->querybuf == NULL); } /* Adds the client's command to the current batch and processes the batch @@ -314,7 +312,7 @@ void prefetchEvents(aeEventLoop *eventLoop, int cur_idx, int numevents) { /* Phase 1: Prefetch aeFileEvent structures for events that need prefetching */ for (int i = start; i < end; i++) { int mask = eventLoop->fired[i].mask; - if (mask & AE_PREFETCH) { + if (mask & AE_PRE_READABLE_HOOK) { fes[fes_idx] = &eventLoop->events[eventLoop->fired[i].fd]; valkey_prefetch(fes[fes_idx]); } else { diff --git a/src/networking.c b/src/networking.c index 82930dcbc6..63a99a225e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -218,7 +218,7 @@ static inline int isReplicaReadyForReplData(client *replica) { /* Decides if copy avoidance is preferred according to client type, number of I/O threads, object size * Maybe called with NULL obj for evaluation with no regard to object size * Copy avoidance can be allowed only for regular Valkey clients - * that use _writeToClient handler to write replies to client connection */ + * that use writeClientData handler to write replies to client connection */ static int isCopyAvoidPreferred(client *c, robj *obj) { if (c->flag.fake || isDeferredReplyEnabled(c)) return 0; @@ -836,7 +836,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { /* Postpone error updates if its io-thread */ if (!inMainThread()) { delayedErrorStatsUpdateCtx ctx = {.c = c, .s = sdsnewlen(s, len), .len = len, .flags = flags}; - threadAddDelayedJob(-1, afterErrorReplyDelayed, sizeof(ctx), &ctx); + threadAdddeferredJob(-1, afterErrorReplyDelayed, sizeof(ctx), &ctx); return; } @@ -2269,7 +2269,7 @@ 
static void postWriteToReplica(client *c) { incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL); } -static void writeToReplica(client *c) { +static void writeReplicaData(client *c) { listNode *last_node; size_t bufpos; @@ -2531,7 +2531,7 @@ static void proceedToUnwritten(replyIOV *reply, int nwritten) { } } -/* This function should be called from _writeToClient when the reply list is not empty, +/* This function should be called from writeClientData when the reply list is not empty, * it gathers the scattered buffers from reply list and sends them away with connWritev. * If we write successfully, it returns C_OK, otherwise, C_ERR is returned. * Sets the c->nwritten to the number of bytes the server wrote to the client. @@ -2640,7 +2640,7 @@ static int writevToClient(client *c) { /* This function does actual writing output buffers to non-replica client, it is called by writeToClient. * If we write successfully, it returns C_OK, otherwise, C_ERR is returned, * and 'c->nwritten' is set to the number of bytes the server wrote to the client. */ -int _writeToClient(client *c) { +int writeClientData(client *c) { listNode *lastblock; size_t bufpos; @@ -2775,7 +2775,7 @@ static void _postWriteToClient(client *c) { /* Updates the client's memory usage and bucket and server stats after writing. * If a write handler is installed , it will attempt to clear the write event. * If the client is no longer valid, it will return C_ERR, otherwise C_OK. */ -int postWriteToClient(client *c) { +int postWriteClientData(client *c) { c->io_last_reply_block = NULL; c->io_last_bufpos = 0; /* Update total number of writes on server */ @@ -2830,12 +2830,12 @@ int writeToClient(client *c) { c->write_flags = 0; if (getClientType(c) == CLIENT_TYPE_REPLICA) { - writeToReplica(c); + writeReplicaData(c); } else { - _writeToClient(c); + writeClientData(c); } - return postWriteToClient(c); + return postWriteClientData(c); } /* Write event handler. Just send data to the client. 
*/ @@ -2999,7 +2999,7 @@ void processClientIOWriteDone(client *c, int allow_async_writes) { connSetPostponeUpdateState(c->conn, 0); connUpdateState(c->conn); - if (postWriteToClient(c) == C_ERR) { + if (postWriteClientData(c) == C_ERR) { return; } @@ -5938,54 +5938,10 @@ void ioThreadWriteToClient(void *data) { serverAssert(c->io_write_state == CLIENT_PENDING_IO); c->nwritten = 0; if (c->write_flags & WRITE_FLAGS_IS_REPLICA) { - writeToReplica(c); + writeReplicaData(c); } else { - _writeToClient(c); + writeClientData(c); } c->io_write_state = CLIENT_COMPLETED_IO; threadRespond(c, R_WRITE); } - -void ioThreadProcessCommand(void *data) { - client *c = (client *)data; - serverAssert(c->cmd->flags & CMD_CAN_BE_OFFLOADED); - const long long call_timer = ustime(); - c->flag.executing_command = 1; - setCurrentClient(c); - setExecutingClient(c); - - monotime monotonic_start = 0; - if (monotonicGetType() == MONOTONIC_CLOCK_HW) { - monotonic_start = getMonotonicUs(); - } - - /* Execute the command */ - c->cmd->proc(c); - - c->flag.executing_command = 0; - - ustime_t duration; - if (monotonicGetType() == MONOTONIC_CLOCK_HW) - duration = getMonotonicUs() - monotonic_start; - else - duration = ustime() - call_timer; - - c->duration += duration; - - /* Send write response to the client */ - c->nwritten = 0; - c->write_flags = 0; - /* Set the rebly block and bufpos */ - c->io_last_reply_block = listLast(c->reply); - if (c->io_last_reply_block) { - c->io_last_bufpos = ((clientReplyBlock *)listNodeValue(c->io_last_reply_block))->used; - } else { - c->io_last_bufpos = (size_t)c->bufpos; - } - - _writeToClient(c); - - c->io_command_state = CLIENT_COMPLETED_IO; - c->io_write_state = CLIENT_COMPLETED_IO; - threadRespond(c, R_COMMAND); -} diff --git a/src/server.c b/src/server.c index 171c57e63a..05095886f0 100644 --- a/src/server.c +++ b/src/server.c @@ -1485,7 +1485,7 @@ long long serverCron(struct aeEventLoop *eventLoop, long long id, void *clientDa UNUSED(id); UNUSED(clientData); - 
if (isServerCronDelayed()) { + if (isServerCronDeferred()) { return AE_NOMORE; } @@ -2735,7 +2735,7 @@ void resetServerStats(void) { server.stat_total_writes_processed = 0; server.stat_client_qbuf_limit_disconnections = 0; server.stat_client_outbuf_limit_disconnections = 0; - server.stat_delayed_jobs_processed = 0; + server.stat_deferred_jobs_processed = 0; for (j = 0; j < STATS_METRIC_COUNT; j++) { server.inst_metric[j].idx = 0; server.inst_metric[j].last_sample_base = 0; @@ -2881,7 +2881,7 @@ void initServer(void) { exit(1); } /* Set the epoll batch size for the server event loop */ - server.el->epoll_batch_size = AE_EPOLL_EVENTS_BATCH_SIZE; + aeSetEpollBatchSize(server.el, AE_EPOLL_EVENTS_BATCH_SIZE); aeSetPrefetchProc(server.el, prefetchEvents); @@ -3946,6 +3946,53 @@ void call(client *c, int flags) { setExecutingClient(prev_client); } +/* Execute a command that has been offloaded to an IO thread. + * This function is called by IO threads. + * It executes the command and writes the response back to the client. 
*/ +void ioThreadCallCommand(void *data) { + client *c = (client *)data; + serverAssert(c->cmd->flags & CMD_CAN_BE_OFFLOADED); + const long long call_timer = ustime(); + c->flag.executing_command = 1; + setCurrentClient(c); + setExecutingClient(c); + + monotime monotonic_start = 0; + if (monotonicGetType() == MONOTONIC_CLOCK_HW) { + monotonic_start = getMonotonicUs(); + } + + /* Execute the command */ + c->cmd->proc(c); + + c->flag.executing_command = 0; + + ustime_t duration; + if (monotonicGetType() == MONOTONIC_CLOCK_HW) + duration = getMonotonicUs() - monotonic_start; + else + duration = ustime() - call_timer; + + c->duration += duration; + + /* Send write response to the client */ + c->nwritten = 0; + c->write_flags = 0; + /* Set the reply block and bufpos */ + c->io_last_reply_block = listLast(c->reply); + if (c->io_last_reply_block) { + c->io_last_bufpos = ((clientReplyBlock *)listNodeValue(c->io_last_reply_block))->used; + } else { + c->io_last_bufpos = (size_t)c->bufpos; + } + + writeClientData(c); + + c->io_command_state = CLIENT_COMPLETED_IO; + c->io_write_state = CLIENT_COMPLETED_IO; + threadRespond(c, R_COMMAND); +} + /* Used when a command that is ready for execution needs to be rejected, due to * various pre-execution checks. it returns the appropriate error to the client. 
* If there's a transaction is flags it as dirty, and if the command is EXEC, @@ -6102,7 +6149,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "io_threaded_total_prefetch_entries:%lld\r\n", server.stat_total_prefetch_entries, "io_threaded_clients_blocked_on_slot:%lld\r\n", server.stat_io_threaded_clients_blocked_on_slot, "io_threaded_clients_blocked_total:%lld\r\n", server.stat_io_threaded_clients_blocked_total, - "io_threaded_postponed_jobs_to_mainthread:%lld\r\n", server.stat_delayed_jobs_processed, + "io_threaded_postponed_jobs_to_mainthread:%lld\r\n", server.stat_deferred_jobs_processed, "client_query_buffer_limit_disconnections:%lld\r\n", server.stat_client_qbuf_limit_disconnections, "client_output_buffer_limit_disconnections:%lld\r\n", server.stat_client_outbuf_limit_disconnections, "reply_buffer_shrinks:%lld\r\n", server.stat_reply_buffer_shrinks, diff --git a/src/server.h b/src/server.h index f5d56b6cc6..8703b85e62 100644 --- a/src/server.h +++ b/src/server.h @@ -1807,7 +1807,7 @@ struct valkeyServer { long long stat_client_outbuf_limit_disconnections; /* Total number of clients reached output buf length limit */ long long stat_total_prefetch_entries; /* Total number of prefetched dict entries */ long long stat_total_prefetch_batches; /* Total number of prefetched batches */ - long long stat_delayed_jobs_processed; /* Total number of delayed jobs sent to main thread from worker threads */ + long long stat_deferred_jobs_processed; /* Total number of delayed jobs sent to main thread from worker threads */ /* The following two are used to track instantaneous metrics, like * number of operations per second, network traffic. 
*/ struct { @@ -2832,6 +2832,7 @@ void removeClientFromMemUsageBucket(client *c, int allow_eviction); void unlinkClient(client *c); void removeFromServerClientList(client *c); int writeToClient(client *c); +int writeClientData(client *c); void linkClient(client *c); void protectClient(client *c); void unprotectClient(client *c); @@ -2846,7 +2847,7 @@ void deleteCachedResponseClient(client *recording_client); void waitForClientIO(client *c); void ioThreadReadQueryFromClient(void *data); void ioThreadWriteToClient(void *data); -void ioThreadProcessCommand(void *data); +void ioThreadCallCommand(void *data); int canParseCommand(client *c); void processClientIOReadsDone(client *c); void processClientIOWriteDone(client *c, int allow_async_writes); diff --git a/src/socket.c b/src/socket.c index 2cf2db90e9..6f1c76f849 100644 --- a/src/socket.c +++ b/src/socket.c @@ -251,7 +251,7 @@ static int connSocketSetReadHandler(connection *conn, ConnectionCallbackFunc fun conn->read_handler = func; if (!conn->read_handler) aeDeleteFileEvent(server.el, conn->fd, AE_READABLE); - else if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE | AE_PREFETCH, conn->type->ae_handler, conn) == AE_ERR) + else if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE | AE_PRE_READABLE_HOOK, conn->type->ae_handler, conn) == AE_ERR) return C_ERR; return C_OK; } diff --git a/src/tls.c b/src/tls.c index edaa4d9d74..0432fb42cd 100644 --- a/src/tls.c +++ b/src/tls.c @@ -591,7 +591,7 @@ static void registerSSLEvent(tls_connection *conn) { if (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ) { if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); - if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE | AE_PREFETCH, tlsEventHandler, conn); + if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE | AE_PRE_READABLE_HOOK, tlsEventHandler, conn); } else if (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE) { if (mask & AE_READABLE) 
aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); @@ -641,7 +641,7 @@ static void updateSSLEvent(tls_connection *conn) { int need_write = conn->c.write_handler || (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE); if (need_read && !(mask & AE_READABLE)) - aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE | AE_PREFETCH, tlsEventHandler, conn); + aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE | AE_PRE_READABLE_HOOK, tlsEventHandler, conn); if (!need_read && (mask & AE_READABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (need_write && !(mask & AE_WRITABLE)) diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c index fe23992ad4..154e3426a5 100644 --- a/src/unit/test_networking.c +++ b/src/unit/test_networking.c @@ -92,7 +92,7 @@ int test_writeToReplica(int argc, char **argv, int flags) { c->repl_data->ref_block_pos = 0; c->bufpos = 0; - writeToReplica(c); + writeReplicaData(c); TEST_ASSERT(c->nwritten == 64); TEST_ASSERT(fake_conn->written == 64); @@ -132,7 +132,7 @@ int test_writeToReplica(int argc, char **argv, int flags) { c->repl_data->ref_block_pos = 0; c->bufpos = 0; - writeToReplica(c); + writeReplicaData(c); TEST_ASSERT(c->nwritten == 96); /* 64 + 32 */ TEST_ASSERT(fake_conn->written == 96); @@ -170,7 +170,7 @@ int test_writeToReplica(int argc, char **argv, int flags) { c->repl_data->ref_block_pos = 0; c->bufpos = 0; - writeToReplica(c); + writeReplicaData(c); TEST_ASSERT(c->nwritten <= 0); TEST_ASSERT((c->write_flags & WRITE_FLAGS_WRITE_ERROR) != 0); From 01b7b2c1bef1bc293495ff8520b2480871241216 Mon Sep 17 00:00:00 2001 From: Uri Yagelnik Date: Tue, 17 Jun 2025 17:33:40 +0000 Subject: [PATCH 3/5] Adress PR comments --- src/io_threads.c | 10 +++++----- src/io_threads.h | 2 +- src/kvstore.c | 2 +- src/memory_prefetch.c | 3 +-- src/module.c | 3 ++- src/networking.c | 8 ++++---- src/rdb.c | 7 ++++--- src/server.c | 16 
+++++++++------- tests/unit/io-threads.tcl | 16 ++++++++++------ 9 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/io_threads.c b/src/io_threads.c index 9fd3a9c0ff..f671398588 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -404,7 +404,7 @@ static void processOrAddJob(deferredQueue *q, listNode *jobNode) { } /* Returns whether the given command requires exclusive access to the whole database. */ -static int isDBExclusiveCmd(struct serverCommand *cmd, int slot) { +static int isServerExclusiveCmd(struct serverCommand *cmd, int slot) { /* If no slot is specified but the client command changes the keyspace, we assume it is an exclusive command */ if (slot == -1 && (cmd->flags & CMD_WRITE)) return 1; /* The exec command can contain commands that may affect the whole database */ @@ -422,7 +422,7 @@ static int isSlotExclusiveCmd(struct serverCommand *cmd, int slot) { if (cmd->flags & CMD_CAN_BE_OFFLOADED) return 0; /* Not slot exclusive rather DB exclusive */ - if (isDBExclusiveCmd(cmd, slot)) return 0; + if (isServerExclusiveCmd(cmd, slot)) return 0; return 1; } @@ -468,7 +468,7 @@ static void dqProcessPendingClients(int slot) { /* Check if we need to wait due to exclusive commands */ if (queue->refcount) { if (dq_context == CTX_EXCLUSIVE) { - if (isDBExclusiveCmd(c->cmd, c->slot)) break; + if (isServerExclusiveCmd(c->cmd, c->slot)) break; } else { if (isSlotExclusiveCmd(c->cmd, c->slot)) break; } @@ -605,7 +605,7 @@ int postponeClientCommand(client *c) { /* An exclusive command can be processed either when processing the exclusive deferered queue * or in immediate mode if there are no read commands executed in queues*/ - if (isDBExclusiveCmd(c->cmd, c->slot)) { + if (isServerExclusiveCmd(c->cmd, c->slot)) { if (dq_context == CTX_EXCLUSIVE) return 1; if (dqAvailable(&deferredCmdExclusive)) return 1; @@ -665,7 +665,7 @@ void drainIOThreadsQueue(void) { } /* Returns if there is an IO operation in progress for the given client. 
*/ -int clientIOInProgress(client *c) { +int clientHandlingThreadedIO(client *c) { return c->io_read_state != CLIENT_IDLE || c->io_write_state != CLIENT_IDLE || c->io_command_state != CLIENT_IDLE; } diff --git a/src/io_threads.h b/src/io_threads.h index 197ee8732d..cd9e5fc6e4 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -34,7 +34,7 @@ int canCommandBeOffloaded(struct serverCommand *cmd); int processIOThreadsResponses(void); void threadAdddeferredJob(int slot, job_handler handler, size_t len, void *data); void threadRespond(struct client *c, jobResponseType r); -int clientIOInProgress(struct client *c); +int clientHandlingThreadedIO(struct client *c); int postponeClientCommand(struct client *c); int isServerCronDeferred(void); void ioThreadsOnUnlinkClient(struct client *c); diff --git a/src/kvstore.c b/src/kvstore.c index b88f7c88ee..a8e653ab1c 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -66,7 +66,7 @@ struct _kvstore { unsigned long long bucket_count; /* Total number of buckets in this kvstore across hash tables. */ unsigned long long *hashtable_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until * given hashtable-index. */ - _Atomic size_t overhead_hashtable_lut; /* Overhead of all hashtables in bytes, Atomic as it may be update by the IO threads */ + _Atomic size_t overhead_hashtable_lut; /* Overhead of all hashtables in bytes, Atomic as it may be updated by the IO threads for rehashing. */ size_t overhead_hashtable_rehashing; /* Overhead of hash tables rehashing in bytes. 
*/ }; diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index e78c00a16b..b1290b2fee 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -242,8 +242,7 @@ void processClientsCommandsBatch(void) { /* Check if the command is about to be offloaded to IO threads */ static int isCommandBeingOffloaded(client *c) { - /* Check basic offloading conditions */ - if (canCommandBeOffloaded(c->parsed_cmd) != C_OK) { + if (!canCommandBeOffloaded(c->parsed_cmd)) { return 0; } diff --git a/src/module.c b/src/module.c index d8c1d7df83..a6dad3cccc 100644 --- a/src/module.c +++ b/src/module.c @@ -6485,7 +6485,8 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const } int deny_write_type = writeCommandsDeniedByDiskError(); - int obey_client = (getCurrentClient() && mustObeyClient(getCurrentClient())); + client *current_client = getCurrentClient(); + int obey_client = (current_client && mustObeyClient(current_client)); if (deny_write_type != DISK_ERROR_TYPE_NONE && !obey_client) { errno = ESPIPE; diff --git a/src/networking.c b/src/networking.c index 63a99a225e..9ea76cf006 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1957,7 +1957,7 @@ int freeClient(client *c) { /* If a client is protected, yet we need to free it right now, make sure * to at least use asynchronous freeing. 
*/ - if (c->flag.protected || c->flag.protected_rdb_channel || clientIOInProgress(c)) { + if (c->flag.protected || c->flag.protected_rdb_channel || clientHandlingThreadedIO(c)) { freeClientAsync(c); return 0; } @@ -2214,7 +2214,7 @@ int freeClientsInAsyncFreeQueue(void) { c->flag.protected_rdb_channel = 0; } - if (clientIOInProgress(c)) continue; + if (clientHandlingThreadedIO(c)) continue; if (c->flag.protected) continue; @@ -5858,7 +5858,7 @@ void evictClients(void) { client *c = ln->value; if (c->flag.close_asap) { /* We don't want to continue evicting clients in this case - * since it can cause multiple clients to be evicted unnecssarily */ + * since it can cause multiple clients to be evicted unnecessarily */ break; } sds ci = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); @@ -5868,7 +5868,7 @@ void evictClients(void) { if (freeClient(c) == 0) { /* The client is protected and will be closed later. * We don't want to continue evicting clients in this case - * since it can cause multiple clients to be evicted unnecssarily */ + * since it can cause multiple clients to be evicted unnecessarily */ break; } } else { diff --git a/src/rdb.c b/src/rdb.c index 51cd4d9c21..c89a649cfc 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1879,11 +1879,12 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { int deep_integrity_validation = server.sanitize_dump_payload == SANITIZE_DUMP_YES; if (server.sanitize_dump_payload == SANITIZE_DUMP_CLIENTS) { + client *current_client = getCurrentClient(); /* Skip sanitization when loading (an RDB), or getting a RESTORE command * from either the primary or a client using an ACL user with the skip-sanitize-payload flag. 
*/ - int skip = server.loading || (getCurrentClient() && (getCurrentClient()->flag.primary)); - if (!skip && getCurrentClient() && getCurrentClient()->user) - skip = !!(getCurrentClient()->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); + int skip = server.loading || (current_client && (current_client->flag.primary)); + if (!skip && current_client && current_client->user) + skip = !!(current_client->user->flags & USER_FLAG_SANITIZE_PAYLOAD_SKIP); deep_integrity_validation = !skip; } diff --git a/src/server.c b/src/server.c index 05095886f0..88f54fd9c0 100644 --- a/src/server.c +++ b/src/server.c @@ -3614,8 +3614,9 @@ static void propagatePendingCommands(void) { /* In case a command that may modify random keys was run *directly* * (i.e. not from within a script, MULTI/EXEC, RM_Call, etc.) we want * to avoid using a transaction (much like active-expire) */ - if (getCurrentClient() && getCurrentClient()->cmd && - getCurrentClient()->cmd->flags & CMD_TOUCHES_ARBITRARY_KEYS) { + client *current_client = getCurrentClient(); + if (current_client && current_client->cmd && + current_client->cmd->flags & CMD_TOUCHES_ARBITRARY_KEYS) { transaction = 0; } @@ -3905,22 +3906,23 @@ void call(client *c, int flags) { /* If the client has keys tracking enabled for client side caching, * make sure to remember the keys it fetched via this command. For read-only * scripts, don't process the script, only the commands it executes. */ + client *current_client = getCurrentClient(); if ((c->cmd->flags & CMD_READONLY) && (c->cmd->proc != evalRoCommand) && (c->cmd->proc != evalShaRoCommand) && (c->cmd->proc != fcallroCommand)) { /* We use the tracking flag of the original external client that * triggered the command, but we take the keys from the actual command * being executed. 
*/ - if (getCurrentClient() && (getCurrentClient()->flag.tracking) && - !(getCurrentClient()->flag.tracking_bcast)) { - trackingRememberKeys(getCurrentClient(), c); + if (current_client && (current_client->flag.tracking) && + !(current_client->flag.tracking_bcast)) { + trackingRememberKeys(current_client, c); } } if (!c->flag.blocked) { /* Modules may call commands in cron, in which case current_client * is not set. */ - if (getCurrentClient()) { - getCurrentClient()->commands_processed++; + if (current_client) { + current_client->commands_processed++; } server.stat_numcommands++; } diff --git a/tests/unit/io-threads.tcl b/tests/unit/io-threads.tcl index 3660ecc65b..03908c0b30 100644 --- a/tests/unit/io-threads.tcl +++ b/tests/unit/io-threads.tcl @@ -321,30 +321,34 @@ if {$::io_threads} { } test {Key expiry is postponed when read from main thread} { + r debug set-active-expire 0 # This test verifies that when a key with expiry is read from the IO thread, # its expiry deletion is postponed to the main-thread to prevent race conditions with the main-thread set updated_info_stats [r INFO] set initial_postponed [getInfoProperty $updated_info_stats io_threaded_postponed_jobs_to_mainthread] - # Set a key with a short expiry time (2 seconds) + # Set a key with a long expiry time (100 sec) r set key "val" - r expire key 2 - + r expire key 100 # Verify the key exists assert_equal "val" [r get key] + + # Set a key with a short expiry time (100 ms) + r set key2 "val2" + r pexpire key2 100 # Wait for the key to be expired - after 2000 + after 200 # Read the key (this should trigger expiry postponement) # The answer should be empty - assert_equal {} [r get key] + assert_equal {} [r get key2] # Check if postponed jobs counter increased set updated_info_stats [r INFO] set updated_postponed [getInfoProperty $updated_info_stats io_threaded_postponed_jobs_to_mainthread] # The postponed jobs counter should have increased if expiry was postponed - assert {$updated_postponed > 
$initial_postponed} + assert {$updated_postponed == $initial_postponed + 1} } test "test io-threads are runtime modifiable" { From 2ef2a4a8a6f1bcb82641ef131fd4898abbf0e026 Mon Sep 17 00:00:00 2001 From: Uri Yagelnik Date: Wed, 18 Jun 2025 12:41:29 +0000 Subject: [PATCH 4/5] server.cmd_time_snapshot change --- src/networking.c | 6 +++--- src/server.c | 12 ++++++++---- src/server.h | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/networking.c b/src/networking.c index 9ea76cf006..140ab1fb3c 100644 --- a/src/networking.c +++ b/src/networking.c @@ -5728,8 +5728,8 @@ void processEventsWhileBlocked(void) { * provide a fresher time than the one from when the script started (they * still won't get it from the call due to execution_nesting. For commands * during loading this doesn't matter. */ - mstime_t prev_cmd_time_snapshot = server.cmd_time_snapshot; - server.cmd_time_snapshot = server.mstime; + mstime_t prev_cmd_time_snapshot = cmd_time_snapshot; + cmd_time_snapshot = server.mstime; /* Note: when we are processing events while blocked (for instance during * busy Lua scripts), we set a global flag. When such flag is set, we @@ -5755,7 +5755,7 @@ void processEventsWhileBlocked(void) { ProcessingEventsWhileBlocked--; serverAssert(ProcessingEventsWhileBlocked >= 0); - server.cmd_time_snapshot = prev_cmd_time_snapshot; + cmd_time_snapshot = prev_cmd_time_snapshot; } /* Return 1 if the client read is handled using threaded I/O. diff --git a/src/server.c b/src/server.c index 88f54fd9c0..ec2ea74842 100644 --- a/src/server.c +++ b/src/server.c @@ -95,6 +95,7 @@ struct sharedObjectsStruct shared; __thread client *_current_client; /* The client that triggered the command execution (External or AOF). */ __thread client *_executing_client; /* The client executing the current command (possibly script or module). */ +__thread mstime_t cmd_time_snapshot; /* Time snapshot of the root execution nesting. */ /* Global vars that are actually used as constants. 
The following double * values are used for double on-disk serialization, and are initialized @@ -353,7 +354,7 @@ mstime_t commandTimeSnapshot(void) { * propagation to replicas / AOF consistent. See issue #1525 for more info. * Note that we cannot use the cached server.mstime because it can change * in processEventsWhileBlocked etc. */ - return server.cmd_time_snapshot; + return cmd_time_snapshot; } /* After an RDB dump or AOF rewrite we exit from children using _exit() instead of @@ -1362,7 +1363,7 @@ void enterExecutionUnit(int update_cached_time, long long us) { us = ustime(); } updateCachedTimeWithUs(0, us); - server.cmd_time_snapshot = server.mstime; + cmd_time_snapshot = server.mstime; } } @@ -2005,7 +2006,7 @@ void afterSleep(struct aeEventLoop *eventLoop, int numevents) { * e.g. somehow used by module timers. Don't update it while yielding to a * blocked command, call() will handle that and restore the original time. */ if (!ProcessingEventsWhileBlocked) { - server.cmd_time_snapshot = server.mstime; + cmd_time_snapshot = server.mstime; } adjustIOThreadsByEventLoad(numevents, 0); @@ -2216,7 +2217,7 @@ void initServerConfig(void) { initConfigValues(); updateCachedTime(1); - server.cmd_time_snapshot = server.mstime; + cmd_time_snapshot = server.mstime; getRandomHexChars(server.runid, CONFIG_RUN_ID_SIZE); server.runid[CONFIG_RUN_ID_SIZE] = '\0'; changeReplicationId(); @@ -3958,6 +3959,9 @@ void ioThreadCallCommand(void *data) { c->flag.executing_command = 1; setCurrentClient(c); setExecutingClient(c); + + /* Set command time snapshot for this thread context */ + cmd_time_snapshot = server.mstime; monotime monotonic_start = 0; if (monotonicGetType() == MONOTONIC_CLOCK_HW) { diff --git a/src/server.h b/src/server.h index 8703b85e62..86303d837b 100644 --- a/src/server.h +++ b/src/server.h @@ -1321,6 +1321,7 @@ typedef struct client { extern __thread client *_current_client; /* The client that triggered the command execution (External or AOF). 
*/ extern __thread client *_executing_client; /* The client executing the current command (possibly script or module). */ +extern __thread mstime_t cmd_time_snapshot; /* Time snapshot of the root execution nesting. */ #define getCurrentClient() (_current_client) #define setCurrentClient(c) (_current_client = (c)) @@ -2109,7 +2110,6 @@ struct valkeyServer { _Atomic int daylight_active; /* Currently in daylight saving time. */ mstime_t mstime; /* 'unixtime' in milliseconds. */ ustime_t ustime; /* 'unixtime' in microseconds. */ - mstime_t cmd_time_snapshot; /* Time snapshot of the root execution nesting. */ size_t blocking_op_nesting; /* Nesting level of blocking operation, used to reset blocked_last_cron. */ long long blocked_last_cron; /* Indicate the mstime of the last time we did cron jobs from a blocking operation */ /* Pubsub */ From 5b258fa7bfedea466f86c31b8fc083b54ee5d9ac Mon Sep 17 00:00:00 2001 From: Uri Yagelnik Date: Wed, 18 Jun 2025 16:17:54 +0000 Subject: [PATCH 5/5] Adress PR comments, removing CAN_BE_OFFLOADED flag Signed-off-by: Uri Yagelnik --- src/commands.def | 66 +++++++++++++++--------------- src/commands/get.json | 3 +- src/commands/hexists.json | 3 +- src/commands/hget.json | 3 +- src/commands/hgetall.json | 3 +- src/commands/hkeys.json | 3 +- src/commands/hlen.json | 3 +- src/commands/hmget.json | 3 +- src/commands/hrandfield.json | 3 +- src/commands/hscan.json | 3 +- src/commands/hstrlen.json | 3 +- src/commands/hvals.json | 3 +- src/commands/lindex.json | 3 +- src/commands/llen.json | 3 +- src/commands/lrange.json | 3 +- src/commands/zcard.json | 3 +- src/commands/zcount.json | 3 +- src/commands/zdiff.json | 3 +- src/commands/zinter.json | 3 +- src/commands/zintercard.json | 3 +- src/commands/zlexcount.json | 3 +- src/commands/zmscore.json | 3 +- src/commands/zrandmember.json | 3 +- src/commands/zrange.json | 3 +- src/commands/zrangebylex.json | 3 +- src/commands/zrangebyscore.json | 3 +- src/commands/zrank.json | 3 +- 
src/commands/zrevrange.json | 3 +- src/commands/zrevrangebylex.json | 3 +- src/commands/zrevrangebyscore.json | 3 +- src/commands/zrevrank.json | 3 +- src/commands/zscan.json | 3 +- src/commands/zscore.json | 3 +- src/commands/zunion.json | 3 +- src/db.c | 2 +- src/io_threads.c | 16 ++++---- src/io_threads.h | 4 +- src/kvstore.c | 2 +- src/lazyfree.c | 2 + src/memory_prefetch.c | 2 +- src/networking.c | 2 +- src/server.c | 1 - src/server.h | 13 +++--- 43 files changed, 91 insertions(+), 118 deletions(-) diff --git a/src/commands.def b/src/commands.def index 516862c5d5..7585f56f32 100644 --- a/src/commands.def +++ b/src/commands.def @@ -11229,21 +11229,21 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("geosearchstore","Queries a geospatial index for members inside an area of a box, a circle, or a polygon, optionally stores the result.","O(N+log(M)) where N is the number of elements in the grid-aligned bounding box area around the shape provided as the filter and M is the number of items inside the shape","6.2.0",CMD_DOC_NONE,NULL,NULL,"geo",COMMAND_GROUP_GEO,GEOSEARCHSTORE_History,2,GEOSEARCHSTORE_Tips,0,geosearchstoreCommand,-8,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_GEO,GEOSEARCHSTORE_Keyspecs,2,NULL,7),.args=GEOSEARCHSTORE_Args}, /* hash */ {MAKE_CMD("hdel","Deletes one or more fields and their values from a hash. 
Deletes the hash if no fields remain.","O(N) where N is the number of fields to be removed.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HDEL_History,1,HDEL_Tips,0,hdelCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_HASH,HDEL_Keyspecs,1,NULL,2),.args=HDEL_Args}, -{MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args}, -{MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args}, -{MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args}, +{MAKE_CMD("hexists","Determines whether a field exists in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HEXISTS_History,0,HEXISTS_Tips,0,hexistsCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HEXISTS_Keyspecs,1,NULL,2),.args=HEXISTS_Args}, +{MAKE_CMD("hget","Returns the value of a field in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGET_History,0,HGET_Tips,0,hgetCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HGET_Keyspecs,1,NULL,2),.args=HGET_Args}, +{MAKE_CMD("hgetall","Returns all fields and values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HGETALL_History,0,HGETALL_Tips,1,hgetallCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HGETALL_Keyspecs,1,NULL,1),.args=HGETALL_Args}, {MAKE_CMD("hincrby","Increments the integer value 
of a field in a hash by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBY_History,0,HINCRBY_Tips,0,hincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBY_Keyspecs,1,NULL,3),.args=HINCRBY_Args}, {MAKE_CMD("hincrbyfloat","Increments the floating point value of a field by a number. Uses 0 as initial value if the field doesn't exist.","O(1)","2.6.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HINCRBYFLOAT_History,0,HINCRBYFLOAT_Tips,0,hincrbyfloatCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HINCRBYFLOAT_Keyspecs,1,NULL,3),.args=HINCRBYFLOAT_Args}, -{MAKE_CMD("hkeys","Returns all fields in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HKEYS_History,0,HKEYS_Tips,1,hkeysCommand,2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HKEYS_Keyspecs,1,NULL,1),.args=HKEYS_Args}, -{MAKE_CMD("hlen","Returns the number of fields in a hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HLEN_History,0,HLEN_Tips,0,hlenCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HLEN_Keyspecs,1,NULL,1),.args=HLEN_Args}, -{MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args}, +{MAKE_CMD("hkeys","Returns all fields in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HKEYS_History,0,HKEYS_Tips,1,hkeysCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HKEYS_Keyspecs,1,NULL,1),.args=HKEYS_Args}, +{MAKE_CMD("hlen","Returns the number of fields in a 
hash.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HLEN_History,0,HLEN_Tips,0,hlenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HLEN_Keyspecs,1,NULL,1),.args=HLEN_Args}, +{MAKE_CMD("hmget","Returns the values of all fields in a hash.","O(N) where N is the number of fields being requested.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HMGET_History,0,HMGET_Tips,0,hmgetCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HMGET_Keyspecs,1,NULL,2),.args=HMGET_Args}, {MAKE_CMD("hmset","Sets the values of multiple fields.","O(N) where N is the number of fields being set.","2.0.0",CMD_DOC_DEPRECATED,"`HSET` with multiple field-value pairs","4.0.0","hash",COMMAND_GROUP_HASH,HMSET_History,0,HMSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HMSET_Keyspecs,1,NULL,2),.args=HMSET_Args}, -{MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args}, -{MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. 
N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args}, +{MAKE_CMD("hrandfield","Returns one or more random fields from a hash.","O(N) where N is the number of fields returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HRANDFIELD_History,0,HRANDFIELD_Tips,1,hrandfieldCommand,-2,CMD_READONLY,ACL_CATEGORY_HASH,HRANDFIELD_Keyspecs,1,NULL,2),.args=HRANDFIELD_Args}, +{MAKE_CMD("hscan","Iterates over fields and values of a hash.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSCAN_History,0,HSCAN_Tips,1,hscanCommand,-3,CMD_READONLY,ACL_CATEGORY_HASH,HSCAN_Keyspecs,1,NULL,5),.args=HSCAN_Args}, {MAKE_CMD("hset","Creates or modifies the value of a field in a hash.","O(1) for each field/value pair added, so O(N) to add N field/value pairs when the command is called with multiple field/value pairs.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSET_History,1,HSET_Tips,0,hsetCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSET_Keyspecs,1,NULL,2),.args=HSET_Args}, {MAKE_CMD("hsetnx","Sets the value of a field in a hash only when the field doesn't exist.","O(1)","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSETNX_History,0,HSETNX_Tips,0,hsetnxCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HASH,HSETNX_Keyspecs,1,NULL,3),.args=HSETNX_Args}, -{MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args}, -{MAKE_CMD("hvals","Returns 
all values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HVALS_History,0,HVALS_Tips,1,hvalsCommand,2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_HASH,HVALS_Keyspecs,1,NULL,1),.args=HVALS_Args}, +{MAKE_CMD("hstrlen","Returns the length of the value of a field.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HSTRLEN_History,0,HSTRLEN_Tips,0,hstrlenCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_HASH,HSTRLEN_Keyspecs,1,NULL,2),.args=HSTRLEN_Args}, +{MAKE_CMD("hvals","Returns all values in a hash.","O(N) where N is the size of the hash.","2.0.0",CMD_DOC_NONE,NULL,NULL,"hash",COMMAND_GROUP_HASH,HVALS_History,0,HVALS_Tips,1,hvalsCommand,2,CMD_READONLY,ACL_CATEGORY_HASH,HVALS_Keyspecs,1,NULL,1),.args=HVALS_Args}, /* hyperloglog */ {MAKE_CMD("pfadd","Adds elements to a HyperLogLog key. Creates the key if it doesn't exist.","O(1) to add every element.","2.8.9",CMD_DOC_NONE,NULL,NULL,"hyperloglog",COMMAND_GROUP_HYPERLOGLOG,PFADD_History,0,PFADD_Tips,0,pfaddCommand,-2,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_HYPERLOGLOG,PFADD_Keyspecs,1,NULL,2),.args=PFADD_Args}, {MAKE_CMD("pfcount","Returns the approximated cardinality of the set(s) observed by the HyperLogLog key(s).","O(1) with a very small average constant time when called with a single key. O(N) with N being the number of keys, and much bigger constant times, when called with multiple keys.","2.8.9",CMD_DOC_NONE,NULL,NULL,"hyperloglog",COMMAND_GROUP_HYPERLOGLOG,PFCOUNT_History,0,PFCOUNT_Tips,0,pfcountCommand,-2,CMD_READONLY|CMD_MAY_REPLICATE,ACL_CATEGORY_HYPERLOGLOG,PFCOUNT_Keyspecs,1,NULL,1),.args=PFCOUNT_Args}, @@ -11256,16 +11256,16 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("blpop","Removes and returns the first element in a list. Blocks until an element is available otherwise. 
Deletes the list if the last element was popped.","O(N) where N is the number of provided keys.","2.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,BLPOP_History,1,BLPOP_Tips,0,blpopCommand,-3,CMD_WRITE|CMD_BLOCKING,ACL_CATEGORY_LIST,BLPOP_Keyspecs,1,NULL,2),.args=BLPOP_Args}, {MAKE_CMD("brpop","Removes and returns the last element in a list. Blocks until an element is available otherwise. Deletes the list if the last element was popped.","O(N) where N is the number of provided keys.","2.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,BRPOP_History,1,BRPOP_Tips,0,brpopCommand,-3,CMD_WRITE|CMD_BLOCKING,ACL_CATEGORY_LIST,BRPOP_Keyspecs,1,NULL,2),.args=BRPOP_Args}, {MAKE_CMD("brpoplpush","Pops an element from a list, pushes it to another list and returns it. Block until an element is available otherwise. Deletes the list if the last element was popped.","O(1)","2.2.0",CMD_DOC_DEPRECATED,"`BLMOVE` with the `RIGHT` and `LEFT` arguments","6.2.0","list",COMMAND_GROUP_LIST,BRPOPLPUSH_History,1,BRPOPLPUSH_Tips,0,brpoplpushCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_BLOCKING,ACL_CATEGORY_LIST,BRPOPLPUSH_Keyspecs,2,NULL,3),.args=BRPOPLPUSH_Args}, -{MAKE_CMD("lindex","Returns an element from a list by its index.","O(N) where N is the number of elements to traverse to get to the element at index. This makes asking for the first or the last element of the list O(1).","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LINDEX_History,0,LINDEX_Tips,0,lindexCommand,3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_LIST,LINDEX_Keyspecs,1,NULL,2),.args=LINDEX_Args}, +{MAKE_CMD("lindex","Returns an element from a list by its index.","O(N) where N is the number of elements to traverse to get to the element at index. 
This makes asking for the first or the last element of the list O(1).","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LINDEX_History,0,LINDEX_Tips,0,lindexCommand,3,CMD_READONLY,ACL_CATEGORY_LIST,LINDEX_Keyspecs,1,NULL,2),.args=LINDEX_Args}, {MAKE_CMD("linsert","Inserts an element before or after another element in a list.","O(N) where N is the number of elements to traverse before seeing the value pivot. This means that inserting somewhere on the left end on the list (head) can be considered O(1) and inserting somewhere on the right end (tail) is O(N).","2.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LINSERT_History,0,LINSERT_Tips,0,linsertCommand,5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_LIST,LINSERT_Keyspecs,1,NULL,4),.args=LINSERT_Args}, -{MAKE_CMD("llen","Returns the length of a list.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LLEN_History,0,LLEN_Tips,0,llenCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_LIST,LLEN_Keyspecs,1,NULL,1),.args=LLEN_Args}, +{MAKE_CMD("llen","Returns the length of a list.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LLEN_History,0,LLEN_Tips,0,llenCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_LIST,LLEN_Keyspecs,1,NULL,1),.args=LLEN_Args}, {MAKE_CMD("lmove","Returns an element after popping it from one list and pushing it to another. Deletes the list if the last element was moved.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LMOVE_History,0,LMOVE_Tips,0,lmoveCommand,5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_LIST,LMOVE_Keyspecs,2,NULL,4),.args=LMOVE_Args}, {MAKE_CMD("lmpop","Returns multiple elements from a list after removing them. 
Deletes the list if the last element was popped.","O(N+M) where N is the number of provided keys and M is the number of elements returned.","7.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LMPOP_History,0,LMPOP_Tips,0,lmpopCommand,-4,CMD_WRITE,ACL_CATEGORY_LIST,LMPOP_Keyspecs,1,lmpopGetKeys,4),.args=LMPOP_Args}, {MAKE_CMD("lpop","Returns the first elements in a list after removing it. Deletes the list if the last element was popped.","O(N) where N is the number of elements returned","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPOP_History,1,LPOP_Tips,0,lpopCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_LIST,LPOP_Keyspecs,1,NULL,2),.args=LPOP_Args}, {MAKE_CMD("lpos","Returns the index of matching elements in a list.","O(N) where N is the number of elements in the list, for the average case. When searching for elements near the head or the tail of the list, or when the MAXLEN option is provided, the command may run in constant time.","6.0.6",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPOS_History,0,LPOS_Tips,0,lposCommand,-3,CMD_READONLY,ACL_CATEGORY_LIST,LPOS_Keyspecs,1,NULL,5),.args=LPOS_Args}, {MAKE_CMD("lpush","Prepends one or more elements to a list. 
Creates the key if it doesn't exist.","O(1) for each element added, so O(N) to add N elements when the command is called with multiple arguments.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPUSH_History,1,LPUSH_Tips,0,lpushCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_LIST,LPUSH_Keyspecs,1,NULL,2),.args=LPUSH_Args}, {MAKE_CMD("lpushx","Prepends one or more elements to a list only when the list exists.","O(1) for each element added, so O(N) to add N elements when the command is called with multiple arguments.","2.2.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LPUSHX_History,1,LPUSHX_Tips,0,lpushxCommand,-3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_LIST,LPUSHX_Keyspecs,1,NULL,2),.args=LPUSHX_Args}, -{MAKE_CMD("lrange","Returns a range of elements from a list.","O(S+N) where S is the distance of start offset from HEAD for small lists, from nearest end (HEAD or TAIL) for large lists; and N is the number of elements in the specified range.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LRANGE_History,0,LRANGE_Tips,0,lrangeCommand,4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_LIST,LRANGE_Keyspecs,1,NULL,3),.args=LRANGE_Args}, +{MAKE_CMD("lrange","Returns a range of elements from a list.","O(S+N) where S is the distance of start offset from HEAD for small lists, from nearest end (HEAD or TAIL) for large lists; and N is the number of elements in the specified range.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LRANGE_History,0,LRANGE_Tips,0,lrangeCommand,4,CMD_READONLY,ACL_CATEGORY_LIST,LRANGE_Keyspecs,1,NULL,3),.args=LRANGE_Args}, {MAKE_CMD("lrem","Removes elements from a list. 
Deletes the list if the last element was removed.","O(N+M) where N is the length of the list and M is the number of elements removed.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LREM_History,0,LREM_Tips,0,lremCommand,4,CMD_WRITE,ACL_CATEGORY_LIST,LREM_Keyspecs,1,NULL,3),.args=LREM_Args}, {MAKE_CMD("lset","Sets the value of an element in a list by its index.","O(N) where N is the length of the list. Setting either the first or the last element of the list is O(1).","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LSET_History,0,LSET_Tips,0,lsetCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_LIST,LSET_Keyspecs,1,NULL,3),.args=LSET_Args}, {MAKE_CMD("ltrim","Removes elements from both ends a list. Deletes the list if all elements were trimmed.","O(N) where N is the number of elements to be removed by the operation.","1.0.0",CMD_DOC_NONE,NULL,NULL,"list",COMMAND_GROUP_LIST,LTRIM_History,0,LTRIM_Tips,0,ltrimCommand,4,CMD_WRITE,ACL_CATEGORY_LIST,LTRIM_Keyspecs,1,NULL,3),.args=LTRIM_Args}, @@ -11348,36 +11348,36 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("bzpopmax","Removes and returns the member with the highest score from one or more sorted sets. Blocks until a member available otherwise. Deletes the sorted set if the last element was popped.","O(log(N)) with N being the number of elements in the sorted set.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,BZPOPMAX_History,1,BZPOPMAX_Tips,0,bzpopmaxCommand,-3,CMD_WRITE|CMD_FAST|CMD_BLOCKING,ACL_CATEGORY_SORTEDSET,BZPOPMAX_Keyspecs,1,NULL,2),.args=BZPOPMAX_Args}, {MAKE_CMD("bzpopmin","Removes and returns the member with the lowest score from one or more sorted sets. Blocks until a member is available otherwise. 
Deletes the sorted set if the last element was popped.","O(log(N)) with N being the number of elements in the sorted set.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,BZPOPMIN_History,1,BZPOPMIN_Tips,0,bzpopminCommand,-3,CMD_WRITE|CMD_FAST|CMD_BLOCKING,ACL_CATEGORY_SORTEDSET,BZPOPMIN_Keyspecs,1,NULL,2),.args=BZPOPMIN_Args}, {MAKE_CMD("zadd","Adds one or more members to a sorted set, or updates their scores. Creates the key if it doesn't exist.","O(log(N)) for each item added, where N is the number of elements in the sorted set.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZADD_History,3,ZADD_Tips,0,zaddCommand,-4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZADD_Keyspecs,1,NULL,6),.args=ZADD_Args}, -{MAKE_CMD("zcard","Returns the number of members in a sorted set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCARD_History,0,ZCARD_Tips,0,zcardCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZCARD_Keyspecs,1,NULL,1),.args=ZCARD_Args}, -{MAKE_CMD("zcount","Returns the count of members in a sorted set that have scores within a range.","O(log(N)) with N being the number of elements in the sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCOUNT_History,0,ZCOUNT_Tips,0,zcountCommand,4,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZCOUNT_Keyspecs,1,NULL,3),.args=ZCOUNT_Args}, -{MAKE_CMD("zdiff","Returns the difference between multiple sorted sets.","O(L + (N-K)log(N)) worst case where L is the total number of elements in all the sets, N is the size of the first set, and K is the size of the result set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZDIFF_History,0,ZDIFF_Tips,0,zdiffCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZDIFF_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZDIFF_Args}, +{MAKE_CMD("zcard","Returns the number of members in a sorted 
set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCARD_History,0,ZCARD_Tips,0,zcardCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZCARD_Keyspecs,1,NULL,1),.args=ZCARD_Args}, +{MAKE_CMD("zcount","Returns the count of members in a sorted set that have scores within a range.","O(log(N)) with N being the number of elements in the sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZCOUNT_History,0,ZCOUNT_Tips,0,zcountCommand,4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZCOUNT_Keyspecs,1,NULL,3),.args=ZCOUNT_Args}, +{MAKE_CMD("zdiff","Returns the difference between multiple sorted sets.","O(L + (N-K)log(N)) worst case where L is the total number of elements in all the sets, N is the size of the first set, and K is the size of the result set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZDIFF_History,0,ZDIFF_Tips,0,zdiffCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZDIFF_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZDIFF_Args}, {MAKE_CMD("zdiffstore","Stores the difference of multiple sorted sets in a key.","O(L + (N-K)log(N)) worst case where L is the total number of elements in all the sets, N is the size of the first set, and K is the size of the result set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZDIFFSTORE_History,0,ZDIFFSTORE_Tips,0,zdiffstoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZDIFFSTORE_Keyspecs,2,zunionInterDiffStoreGetKeys,3),.args=ZDIFFSTORE_Args}, {MAKE_CMD("zincrby","Increments the score of a member in a sorted set.","O(log(N)) where N is the number of elements in the sorted set.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINCRBY_History,0,ZINCRBY_Tips,0,zincrbyCommand,4,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZINCRBY_Keyspecs,1,NULL,3),.args=ZINCRBY_Args}, -{MAKE_CMD("zinter","Returns the intersect of multiple sorted sets.","O(N*K)+O(M*log(M)) worst case with N being the 
smallest input sorted set, K being the number of input sorted sets and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTER_History,0,ZINTER_Tips,0,zinterCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZINTER_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZINTER_Args}, -{MAKE_CMD("zintercard","Returns the number of members of the intersect of multiple sorted sets.","O(N*K) worst case with N being the smallest input sorted set, K being the number of input sorted sets.","7.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTERCARD_History,0,ZINTERCARD_Tips,0,zinterCardCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZINTERCARD_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZINTERCARD_Args}, +{MAKE_CMD("zinter","Returns the intersect of multiple sorted sets.","O(N*K)+O(M*log(M)) worst case with N being the smallest input sorted set, K being the number of input sorted sets and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTER_History,0,ZINTER_Tips,0,zinterCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZINTER_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZINTER_Args}, +{MAKE_CMD("zintercard","Returns the number of members of the intersect of multiple sorted sets.","O(N*K) worst case with N being the smallest input sorted set, K being the number of input sorted sets.","7.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTERCARD_History,0,ZINTERCARD_Tips,0,zinterCardCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZINTERCARD_Keyspecs,1,zunionInterDiffGetKeys,3),.args=ZINTERCARD_Args}, {MAKE_CMD("zinterstore","Stores the intersect of multiple sorted sets in a key.","O(N*K)+O(M*log(M)) worst case with N being the smallest input sorted set, K being the number of input sorted sets and M being the number of elements in the resulting sorted 
set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZINTERSTORE_History,0,ZINTERSTORE_Tips,0,zinterstoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZINTERSTORE_Keyspecs,2,zunionInterDiffStoreGetKeys,5),.args=ZINTERSTORE_Args}, -{MAKE_CMD("zlexcount","Returns the number of members in a sorted set within a lexicographical range.","O(log(N)) with N being the number of elements in the sorted set.","2.8.9",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZLEXCOUNT_History,0,ZLEXCOUNT_Tips,0,zlexcountCommand,4,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZLEXCOUNT_Keyspecs,1,NULL,3),.args=ZLEXCOUNT_Args}, +{MAKE_CMD("zlexcount","Returns the number of members in a sorted set within a lexicographical range.","O(log(N)) with N being the number of elements in the sorted set.","2.8.9",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZLEXCOUNT_History,0,ZLEXCOUNT_Tips,0,zlexcountCommand,4,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZLEXCOUNT_Keyspecs,1,NULL,3),.args=ZLEXCOUNT_Args}, {MAKE_CMD("zmpop","Returns the highest- or lowest-scoring members from one or more sorted sets after removing them. 
Deletes the sorted set if the last member was popped.","O(K) + O(M*log(N)) where K is the number of provided keys, N being the number of elements in the sorted set, and M being the number of elements popped.","7.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZMPOP_History,0,ZMPOP_Tips,0,zmpopCommand,-4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZMPOP_Keyspecs,1,zmpopGetKeys,4),.args=ZMPOP_Args}, -{MAKE_CMD("zmscore","Returns the score of one or more members in a sorted set.","O(N) where N is the number of members being requested.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZMSCORE_History,0,ZMSCORE_Tips,0,zmscoreCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZMSCORE_Keyspecs,1,NULL,2),.args=ZMSCORE_Args}, +{MAKE_CMD("zmscore","Returns the score of one or more members in a sorted set.","O(N) where N is the number of members being requested.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZMSCORE_History,0,ZMSCORE_Tips,0,zmscoreCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZMSCORE_Keyspecs,1,NULL,2),.args=ZMSCORE_Args}, {MAKE_CMD("zpopmax","Returns the highest-scoring members from a sorted set after removing them. Deletes the sorted set if the last member was popped.","O(log(N)*M) with N being the number of elements in the sorted set, and M being the number of elements popped.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZPOPMAX_History,0,ZPOPMAX_Tips,0,zpopmaxCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZPOPMAX_Keyspecs,1,NULL,2),.args=ZPOPMAX_Args}, {MAKE_CMD("zpopmin","Returns the lowest-scoring members from a sorted set after removing them. 
Deletes the sorted set if the last member was popped.","O(log(N)*M) with N being the number of elements in the sorted set, and M being the number of elements popped.","5.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZPOPMIN_History,0,ZPOPMIN_Tips,0,zpopminCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZPOPMIN_Keyspecs,1,NULL,2),.args=ZPOPMIN_Args}, -{MAKE_CMD("zrandmember","Returns one or more random members from a sorted set.","O(N) where N is the number of members returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANDMEMBER_History,0,ZRANDMEMBER_Tips,1,zrandmemberCommand,-2,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANDMEMBER_Keyspecs,1,NULL,2),.args=ZRANDMEMBER_Args}, -{MAKE_CMD("zrange","Returns members in a sorted set within a range of indexes.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGE_History,1,ZRANGE_Tips,0,zrangeCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANGE_Keyspecs,1,NULL,7),.args=ZRANGE_Args}, -{MAKE_CMD("zrangebylex","Returns members in a sorted set within a lexicographical range.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYLEX` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYLEX_History,0,ZRANGEBYLEX_Tips,0,zrangebylexCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZRANGEBYLEX_Args}, -{MAKE_CMD("zrangebyscore","Returns members in a sorted set within a range of scores.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","1.0.5",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYSCORE` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYSCORE_History,1,ZRANGEBYSCORE_Tips,0,zrangebyscoreCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZRANGEBYSCORE_Args}, +{MAKE_CMD("zrandmember","Returns one or more random members from a sorted set.","O(N) where N is the number of members returned","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANDMEMBER_History,0,ZRANDMEMBER_Tips,1,zrandmemberCommand,-2,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANDMEMBER_Keyspecs,1,NULL,2),.args=ZRANDMEMBER_Args}, +{MAKE_CMD("zrange","Returns members in a sorted set within a range of indexes.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGE_History,1,ZRANGE_Tips,0,zrangeCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANGE_Keyspecs,1,NULL,7),.args=ZRANGE_Args}, +{MAKE_CMD("zrangebylex","Returns members in a sorted set within a lexicographical range.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYLEX` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYLEX_History,0,ZRANGEBYLEX_Tips,0,zrangebylexCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZRANGEBYLEX_Args}, +{MAKE_CMD("zrangebyscore","Returns members in a sorted set within a range of scores.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","1.0.5",CMD_DOC_DEPRECATED,"`ZRANGE` with the `BYSCORE` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGEBYSCORE_History,1,ZRANGEBYSCORE_Tips,0,zrangebyscoreCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZRANGEBYSCORE_Args}, {MAKE_CMD("zrangestore","Stores a range of members from sorted set in a key.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements stored into the destination key.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANGESTORE_History,0,ZRANGESTORE_Tips,0,zrangestoreCommand,-5,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZRANGESTORE_Keyspecs,2,NULL,7),.args=ZRANGESTORE_Args}, -{MAKE_CMD("zrank","Returns the index of a member in a sorted set ordered by ascending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANK_History,1,ZRANK_Tips,0,zrankCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZRANK_Keyspecs,1,NULL,3),.args=ZRANK_Args}, +{MAKE_CMD("zrank","Returns the index of a member in a sorted set ordered by ascending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZRANK_History,1,ZRANK_Tips,0,zrankCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZRANK_Keyspecs,1,NULL,3),.args=ZRANK_Args}, {MAKE_CMD("zrem","Removes one or more members from a sorted set. Deletes the sorted set if all members were removed.","O(M*log(N)) with N being the number of elements in the sorted set and M the number of elements to be removed.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREM_History,1,ZREM_Tips,0,zremCommand,-3,CMD_WRITE|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZREM_Keyspecs,1,NULL,2),.args=ZREM_Args}, {MAKE_CMD("zremrangebylex","Removes members in a sorted set within a lexicographical range. 
Deletes the sorted set if all members were removed.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements removed by the operation.","2.8.9",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREMRANGEBYLEX_History,0,ZREMRANGEBYLEX_Tips,0,zremrangebylexCommand,4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZREMRANGEBYLEX_Keyspecs,1,NULL,3),.args=ZREMRANGEBYLEX_Args}, {MAKE_CMD("zremrangebyrank","Removes members in a sorted set within a range of indexes. Deletes the sorted set if all members were removed.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements removed by the operation.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREMRANGEBYRANK_History,0,ZREMRANGEBYRANK_Tips,0,zremrangebyrankCommand,4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZREMRANGEBYRANK_Keyspecs,1,NULL,3),.args=ZREMRANGEBYRANK_Args}, {MAKE_CMD("zremrangebyscore","Removes members in a sorted set within a range of scores. 
Deletes the sorted set if all members were removed.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements removed by the operation.","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREMRANGEBYSCORE_History,0,ZREMRANGEBYSCORE_Tips,0,zremrangebyscoreCommand,4,CMD_WRITE,ACL_CATEGORY_SORTEDSET,ZREMRANGEBYSCORE_Keyspecs,1,NULL,3),.args=ZREMRANGEBYSCORE_Args}, -{MAKE_CMD("zrevrange","Returns members in a sorted set within a range of indexes in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGE_History,0,ZREVRANGE_Tips,0,zrevrangeCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANGE_Keyspecs,1,NULL,4),.args=ZREVRANGE_Args}, -{MAKE_CMD("zrevrangebylex","Returns members in a sorted set within a lexicographical range in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYLEX` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYLEX_History,0,ZREVRANGEBYLEX_Tips,0,zrevrangebylexCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZREVRANGEBYLEX_Args}, -{MAKE_CMD("zrevrangebyscore","Returns members in a sorted set within a range of scores in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYSCORE` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYSCORE_History,1,ZREVRANGEBYSCORE_Tips,0,zrevrangebyscoreCommand,-4,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZREVRANGEBYSCORE_Args}, -{MAKE_CMD("zrevrank","Returns the index of a member in a sorted set ordered by descending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANK_History,1,ZREVRANK_Tips,0,zrevrankCommand,-3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZREVRANK_Keyspecs,1,NULL,3),.args=ZREVRANK_Args}, -{MAKE_CMD("zscan","Iterates over members and scores of a sorted set.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCAN_History,1,ZSCAN_Tips,1,zscanCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZSCAN_Keyspecs,1,NULL,5),.args=ZSCAN_Args}, -{MAKE_CMD("zscore","Returns the score of a member in a sorted set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCORE_History,0,ZSCORE_Tips,0,zscoreCommand,3,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZSCORE_Keyspecs,1,NULL,2),.args=ZSCORE_Args}, -{MAKE_CMD("zunion","Returns the union of multiple sorted sets.","O(N)+O(M*log(M)) with N being the sum of the sizes of the input sorted sets, and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZUNION_History,0,ZUNION_Tips,0,zunionCommand,-3,CMD_READONLY|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_SORTEDSET,ZUNION_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZUNION_Args}, +{MAKE_CMD("zrevrange","Returns 
members in a sorted set within a range of indexes in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements returned.","1.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` argument","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGE_History,0,ZREVRANGE_Tips,0,zrevrangeCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZREVRANGE_Keyspecs,1,NULL,4),.args=ZREVRANGE_Args}, +{MAKE_CMD("zrevrangebylex","Returns members in a sorted set within a lexicographical range in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.8.9",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYLEX` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYLEX_History,0,ZREVRANGEBYLEX_Tips,0,zrevrangebylexCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYLEX_Keyspecs,1,NULL,4),.args=ZREVRANGEBYLEX_Args}, +{MAKE_CMD("zrevrangebyscore","Returns members in a sorted set within a range of scores in reverse order.","O(log(N)+M) with N being the number of elements in the sorted set and M the number of elements being returned. If M is constant (e.g. 
always asking for the first 10 elements with LIMIT), you can consider it O(log(N)).","2.2.0",CMD_DOC_DEPRECATED,"`ZRANGE` with the `REV` and `BYSCORE` arguments","6.2.0","sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANGEBYSCORE_History,1,ZREVRANGEBYSCORE_Tips,0,zrevrangebyscoreCommand,-4,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZREVRANGEBYSCORE_Keyspecs,1,NULL,5),.args=ZREVRANGEBYSCORE_Args}, +{MAKE_CMD("zrevrank","Returns the index of a member in a sorted set ordered by descending scores.","O(log(N))","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZREVRANK_History,1,ZREVRANK_Tips,0,zrevrankCommand,-3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZREVRANK_Keyspecs,1,NULL,3),.args=ZREVRANK_Args}, +{MAKE_CMD("zscan","Iterates over members and scores of a sorted set.","O(1) for every call. O(N) for a complete iteration, including enough command calls for the cursor to return back to 0. N is the number of elements inside the collection.","2.8.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCAN_History,1,ZSCAN_Tips,1,zscanCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZSCAN_Keyspecs,1,NULL,5),.args=ZSCAN_Args}, +{MAKE_CMD("zscore","Returns the score of a member in a sorted set.","O(1)","1.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZSCORE_History,0,ZSCORE_Tips,0,zscoreCommand,3,CMD_READONLY|CMD_FAST,ACL_CATEGORY_SORTEDSET,ZSCORE_Keyspecs,1,NULL,2),.args=ZSCORE_Args}, +{MAKE_CMD("zunion","Returns the union of multiple sorted sets.","O(N)+O(M*log(M)) with N being the sum of the sizes of the input sorted sets, and M being the number of elements in the resulting sorted set.","6.2.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZUNION_History,0,ZUNION_Tips,0,zunionCommand,-3,CMD_READONLY,ACL_CATEGORY_SORTEDSET,ZUNION_Keyspecs,1,zunionInterDiffGetKeys,5),.args=ZUNION_Args}, {MAKE_CMD("zunionstore","Stores the union of multiple sorted sets in a key.","O(N)+O(M log(M)) with N being the sum of the sizes of the 
input sorted sets, and M being the number of elements in the resulting sorted set.","2.0.0",CMD_DOC_NONE,NULL,NULL,"sorted_set",COMMAND_GROUP_SORTED_SET,ZUNIONSTORE_History,0,ZUNIONSTORE_Tips,0,zunionstoreCommand,-4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_SORTEDSET,ZUNIONSTORE_Keyspecs,2,zunionInterDiffStoreGetKeys,5),.args=ZUNIONSTORE_Args}, /* stream */ {MAKE_CMD("xack","Returns the number of messages that were successfully acknowledged by the consumer group member of a stream.","O(1) for each message ID processed.","5.0.0",CMD_DOC_NONE,NULL,NULL,"stream",COMMAND_GROUP_STREAM,XACK_History,0,XACK_Tips,0,xackCommand,-4,CMD_WRITE|CMD_FAST,ACL_CATEGORY_STREAM,XACK_Keyspecs,1,NULL,3),.args=XACK_Args}, @@ -11400,7 +11400,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("decr","Decrements the integer value of a key by one. Uses 0 as initial value if the key doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,DECR_History,0,DECR_Tips,0,decrCommand,2,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STRING,DECR_Keyspecs,1,NULL,1),.args=DECR_Args}, {MAKE_CMD("decrby","Decrements a number from the integer value of a key. 
Uses 0 as initial value if the key doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,DECRBY_History,0,DECRBY_Tips,0,decrbyCommand,3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STRING,DECRBY_Keyspecs,1,NULL,2),.args=DECRBY_Args}, {MAKE_CMD("delifeq","Delete key if value matches string.","O(1)","9.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,DELIFEQ_History,0,DELIFEQ_Tips,0,delifeqCommand,3,CMD_FAST|CMD_WRITE,ACL_CATEGORY_STRING,DELIFEQ_Keyspecs,1,NULL,2),.args=DELIFEQ_Args}, -{MAKE_CMD("get","Returns the string value of a key.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GET_History,0,GET_Tips,0,getCommand,2,CMD_READONLY|CMD_FAST|CMD_CAN_BE_OFFLOADED,ACL_CATEGORY_STRING,GET_Keyspecs,1,NULL,1),.args=GET_Args}, +{MAKE_CMD("get","Returns the string value of a key.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GET_History,0,GET_Tips,0,getCommand,2,CMD_READONLY|CMD_FAST,ACL_CATEGORY_STRING,GET_Keyspecs,1,NULL,1),.args=GET_Args}, {MAKE_CMD("getdel","Returns the string value of a key after deleting the key.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GETDEL_History,0,GETDEL_Tips,0,getdelCommand,2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_STRING,GETDEL_Keyspecs,1,NULL,1),.args=GETDEL_Args}, {MAKE_CMD("getex","Returns the string value of a key after setting its expiration time.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GETEX_History,0,GETEX_Tips,0,getexCommand,-2,CMD_WRITE|CMD_FAST,ACL_CATEGORY_STRING,GETEX_Keyspecs,1,NULL,2),.args=GETEX_Args}, {MAKE_CMD("getrange","Returns a substring of the string stored at a key.","O(N) where N is the length of the returned string. 
The complexity is ultimately determined by the returned length, but because creating a substring from an existing string is very cheap, it can be considered O(1) for small strings.","2.4.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,GETRANGE_History,0,GETRANGE_Tips,0,getrangeCommand,4,CMD_READONLY,ACL_CATEGORY_STRING,GETRANGE_Keyspecs,1,NULL,3),.args=GETRANGE_Args}, diff --git a/src/commands/get.json b/src/commands/get.json index c6861f7914..693c1ac823 100644 --- a/src/commands/get.json +++ b/src/commands/get.json @@ -8,8 +8,7 @@ "function": "getCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "STRING" diff --git a/src/commands/hexists.json b/src/commands/hexists.json index 10d028bddd..f5ea405718 100644 --- a/src/commands/hexists.json +++ b/src/commands/hexists.json @@ -8,8 +8,7 @@ "function": "hexistsCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "HASH" diff --git a/src/commands/hget.json b/src/commands/hget.json index 01b0bed85b..a041143ec8 100644 --- a/src/commands/hget.json +++ b/src/commands/hget.json @@ -8,8 +8,7 @@ "function": "hgetCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "HASH" diff --git a/src/commands/hgetall.json b/src/commands/hgetall.json index 2e81ee24e6..9bbf835a34 100644 --- a/src/commands/hgetall.json +++ b/src/commands/hgetall.json @@ -7,8 +7,7 @@ "arity": 2, "function": "hgetallCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "HASH" diff --git a/src/commands/hkeys.json b/src/commands/hkeys.json index d72e73459f..917df1c9eb 100644 --- a/src/commands/hkeys.json +++ b/src/commands/hkeys.json @@ -7,8 +7,7 @@ "arity": 2, "function": "hkeysCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "HASH" diff --git a/src/commands/hlen.json b/src/commands/hlen.json index 
db0f364687..d4c13ac116 100644 --- a/src/commands/hlen.json +++ b/src/commands/hlen.json @@ -8,8 +8,7 @@ "function": "hlenCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "HASH" diff --git a/src/commands/hmget.json b/src/commands/hmget.json index fa96abcbd8..73fa9c311f 100644 --- a/src/commands/hmget.json +++ b/src/commands/hmget.json @@ -8,8 +8,7 @@ "function": "hmgetCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "HASH" diff --git a/src/commands/hrandfield.json b/src/commands/hrandfield.json index f59279b4a4..83abc74a9d 100644 --- a/src/commands/hrandfield.json +++ b/src/commands/hrandfield.json @@ -7,8 +7,7 @@ "arity": -2, "function": "hrandfieldCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "HASH" diff --git a/src/commands/hscan.json b/src/commands/hscan.json index 648c9b5a44..9e6099c2f2 100644 --- a/src/commands/hscan.json +++ b/src/commands/hscan.json @@ -7,8 +7,7 @@ "arity": -3, "function": "hscanCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "HASH" diff --git a/src/commands/hstrlen.json b/src/commands/hstrlen.json index 6aeb2f3301..82ac6dbe48 100644 --- a/src/commands/hstrlen.json +++ b/src/commands/hstrlen.json @@ -8,8 +8,7 @@ "function": "hstrlenCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "HASH" diff --git a/src/commands/hvals.json b/src/commands/hvals.json index 42061c18b8..55aeaaff92 100644 --- a/src/commands/hvals.json +++ b/src/commands/hvals.json @@ -7,8 +7,7 @@ "arity": 2, "function": "hvalsCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "HASH" diff --git a/src/commands/lindex.json b/src/commands/lindex.json index 5e19ea8b36..a589d52fc9 100644 --- a/src/commands/lindex.json +++ b/src/commands/lindex.json @@ -7,8 +7,7 @@ 
"arity": 3, "function": "lindexCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "LIST" diff --git a/src/commands/llen.json b/src/commands/llen.json index ac743d5cf7..846aa40867 100644 --- a/src/commands/llen.json +++ b/src/commands/llen.json @@ -8,8 +8,7 @@ "function": "llenCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "LIST" diff --git a/src/commands/lrange.json b/src/commands/lrange.json index 95ca2e5128..303d2f60b8 100644 --- a/src/commands/lrange.json +++ b/src/commands/lrange.json @@ -7,8 +7,7 @@ "arity": 4, "function": "lrangeCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "LIST" diff --git a/src/commands/zcard.json b/src/commands/zcard.json index 11bf3052f6..58683a4874 100644 --- a/src/commands/zcard.json +++ b/src/commands/zcard.json @@ -8,8 +8,7 @@ "function": "zcardCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zcount.json b/src/commands/zcount.json index 721f7f7e79..0fdebd7dff 100644 --- a/src/commands/zcount.json +++ b/src/commands/zcount.json @@ -8,8 +8,7 @@ "function": "zcountCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zdiff.json b/src/commands/zdiff.json index c49b8d5c61..912d5c6d05 100644 --- a/src/commands/zdiff.json +++ b/src/commands/zdiff.json @@ -8,8 +8,7 @@ "function": "zdiffCommand", "get_keys_function": "zunionInterDiffGetKeys", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zinter.json b/src/commands/zinter.json index abef66f8fc..4828e21d6c 100644 --- a/src/commands/zinter.json +++ b/src/commands/zinter.json @@ -8,8 +8,7 @@ "function": "zinterCommand", "get_keys_function": "zunionInterDiffGetKeys", 
"command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zintercard.json b/src/commands/zintercard.json index b02fb93db0..7fdab3ed64 100644 --- a/src/commands/zintercard.json +++ b/src/commands/zintercard.json @@ -8,8 +8,7 @@ "function": "zinterCardCommand", "get_keys_function": "zunionInterDiffGetKeys", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zlexcount.json b/src/commands/zlexcount.json index 83d0585daa..8bf2884c93 100644 --- a/src/commands/zlexcount.json +++ b/src/commands/zlexcount.json @@ -8,8 +8,7 @@ "function": "zlexcountCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zmscore.json b/src/commands/zmscore.json index a8013f7c3b..6a036fe0be 100644 --- a/src/commands/zmscore.json +++ b/src/commands/zmscore.json @@ -8,8 +8,7 @@ "function": "zmscoreCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrandmember.json b/src/commands/zrandmember.json index 193daf8d1d..13abc9aa3c 100644 --- a/src/commands/zrandmember.json +++ b/src/commands/zrandmember.json @@ -7,8 +7,7 @@ "arity": -2, "function": "zrandmemberCommand", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrange.json b/src/commands/zrange.json index a2e37c92ec..dc7af8dc14 100644 --- a/src/commands/zrange.json +++ b/src/commands/zrange.json @@ -13,8 +13,7 @@ ] ], "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrangebylex.json b/src/commands/zrangebylex.json index 1d05e93f7a..5949b87166 100644 --- a/src/commands/zrangebylex.json +++ b/src/commands/zrangebylex.json @@ -12,8 +12,7 @@ "DEPRECATED" ], "command_flags": [ 
- "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrangebyscore.json b/src/commands/zrangebyscore.json index 9b1b41ff75..c89607e104 100644 --- a/src/commands/zrangebyscore.json +++ b/src/commands/zrangebyscore.json @@ -18,8 +18,7 @@ "DEPRECATED" ], "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrank.json b/src/commands/zrank.json index 8f3048d3a4..f5f427c66d 100644 --- a/src/commands/zrank.json +++ b/src/commands/zrank.json @@ -14,8 +14,7 @@ ], "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrange.json b/src/commands/zrevrange.json index 1bce02766c..a143f72153 100644 --- a/src/commands/zrevrange.json +++ b/src/commands/zrevrange.json @@ -12,8 +12,7 @@ "DEPRECATED" ], "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrangebylex.json b/src/commands/zrevrangebylex.json index f28b9a71e4..d1d8100d1e 100644 --- a/src/commands/zrevrangebylex.json +++ b/src/commands/zrevrangebylex.json @@ -12,8 +12,7 @@ "DEPRECATED" ], "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrangebyscore.json b/src/commands/zrevrangebyscore.json index 5e88f0c9ca..0eb9e86956 100644 --- a/src/commands/zrevrangebyscore.json +++ b/src/commands/zrevrangebyscore.json @@ -18,8 +18,7 @@ "DEPRECATED" ], "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zrevrank.json b/src/commands/zrevrank.json index c529391bfa..39897cae33 100644 --- a/src/commands/zrevrank.json +++ b/src/commands/zrevrank.json @@ -14,8 +14,7 @@ ], "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git 
a/src/commands/zscan.json b/src/commands/zscan.json index 477468476f..7948e393a5 100644 --- a/src/commands/zscan.json +++ b/src/commands/zscan.json @@ -13,8 +13,7 @@ ] ], "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zscore.json b/src/commands/zscore.json index 6da7a38058..502247051e 100644 --- a/src/commands/zscore.json +++ b/src/commands/zscore.json @@ -8,8 +8,7 @@ "function": "zscoreCommand", "command_flags": [ "READONLY", - "FAST", - "CAN_BE_OFFLOADED" + "FAST" ], "acl_categories": [ "SORTEDSET" diff --git a/src/commands/zunion.json b/src/commands/zunion.json index 26bf0b1605..1ce3dc5ee1 100644 --- a/src/commands/zunion.json +++ b/src/commands/zunion.json @@ -8,8 +8,7 @@ "function": "zunionCommand", "get_keys_function": "zunionInterDiffGetKeys", "command_flags": [ - "READONLY", - "CAN_BE_OFFLOADED" + "READONLY" ], "acl_categories": [ "SORTEDSET" diff --git a/src/db.c b/src/db.c index d60a1f4347..a3fba7d253 100644 --- a/src/db.c +++ b/src/db.c @@ -2012,7 +2012,7 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, if (!inMainThread()) { postpone_expired_key_ctx ctx = {.dict_index = dict_index, .db = db, .key = key}; if (!static_key) incrRefCount(key); - threadAdddeferredJob(dict_index, handlePostponeExpiredKey, sizeof(ctx), &ctx); + threadAddDeferredJob(dict_index, handlePostponeExpiredKey, sizeof(ctx), &ctx); return KEY_EXPIRED; } diff --git a/src/io_threads.c b/src/io_threads.c index f671398588..f1eb81036f 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -418,10 +418,10 @@ static int isServerExclusiveCmd(struct serverCommand *cmd, int slot) { /* Returns if the given command requires exclusive access to the given slot. 
*/ static int isSlotExclusiveCmd(struct serverCommand *cmd, int slot) { - /* No exclusivity required */ - if (cmd->flags & CMD_CAN_BE_OFFLOADED) return 0; + /* No exclusivity required for commands that can be offloaded */ + if (CMD_CAN_BE_OFFLOADED(cmd)) return 0; - /* Not slot exclusive rather DB exclusive */ + /* Not slot exclusive rather server exclusive */ if (isServerExclusiveCmd(cmd, slot)) return 0; return 1; @@ -557,7 +557,7 @@ static void deferServerCron(void *data) { } /* Add a deferred job to the thread-local job list */ -void threadAdddeferredJob(int slot, job_handler handler, size_t data_size, void *data) { +void threadAddDeferredJob(int slot, job_handler handler, size_t data_size, void *data) { /* Allocate memory for job structure plus data using flexible array member */ listNode *job_node = createJobNode(slot, handler, data_size, data); listLinkNodeTail(thread_deferred_jobs, job_node); @@ -970,7 +970,9 @@ static int isCommandPostpone(client *c) { /* Check if a command can be offloaded to IO threads. * Returns 1 if the command can be offloaded, 0 otherwise. */ -int canCommandBeOffloaded(struct serverCommand *cmd) { +int canCommandBeOffloaded(int slot, struct serverCommand *cmd) { + if (slot == -1) return 0; + if (!server.cluster_enabled) { return 0; /* Avoid offloading commands in non cluster mode. */ } @@ -988,7 +990,7 @@ int canCommandBeOffloaded(struct serverCommand *cmd) { return 0; /* Modules are loaded and module command offloading is disabled. 
*/ } - if (!(cmd->flags & CMD_CAN_BE_OFFLOADED)) { + if (!CMD_CAN_BE_OFFLOADED(cmd)) { return 0; } @@ -1000,7 +1002,7 @@ int canCommandBeOffloaded(struct serverCommand *cmd) { } int trySendProcessCommandToIOThreads(client *c) { - if (!canCommandBeOffloaded(c->cmd)) { + if (!canCommandBeOffloaded(c->slot, c->cmd)) { return C_ERR; } diff --git a/src/io_threads.h b/src/io_threads.h index cd9e5fc6e4..54a38653db 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -30,9 +30,9 @@ void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); int trySendAcceptToIOThreads(struct connection *conn); int trySendProcessCommandToIOThreads(struct client *c); -int canCommandBeOffloaded(struct serverCommand *cmd); +int canCommandBeOffloaded(int slot, struct serverCommand *cmd); int processIOThreadsResponses(void); -void threadAdddeferredJob(int slot, job_handler handler, size_t len, void *data); +void threadAddDeferredJob(int slot, job_handler handler, size_t len, void *data); void threadRespond(struct client *c, jobResponseType r); int clientHandlingThreadedIO(struct client *c); int postponeClientCommand(struct client *c); diff --git a/src/kvstore.c b/src/kvstore.c index a8e653ab1c..e8b8a52ab0 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -250,7 +250,7 @@ void kvstoreHashtableRehashingCompleted(hashtable *ht) { /* If not in main-thread postpone the update of kvs rehashing info to be done later by the main-thread -*/ if (!inMainThread()) { - threadAdddeferredJob(-1, kvstoreHashtableUpdateRehashingInfo, sizeof(ctx), &ctx); + threadAddDeferredJob(-1, kvstoreHashtableUpdateRehashingInfo, sizeof(ctx), &ctx); } else { kvstoreHashtableUpdateRehashingInfo(&ctx); } diff --git a/src/lazyfree.c b/src/lazyfree.c index 6d43c2c8f6..1c3e4f1c68 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -169,6 +169,8 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { /* Free an object, if the object is huge enough, free it in async way. 
*/ void freeObjAsync(robj *key, robj *obj, int dbid) { + if (!inMainThread()) decrRefCount(obj); + size_t free_effort = lazyfreeGetFreeEffort(key, obj, dbid); /* Note that if the object is shared, to reclaim it now it is not * possible. This rarely happens, however sometimes the implementation diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index b1290b2fee..11675e7507 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -242,7 +242,7 @@ void processClientsCommandsBatch(void) { /* Check if the command is about to be offloaded to IO threads */ static int isCommandBeingOffloaded(client *c) { - if (!canCommandBeOffloaded(c->parsed_cmd)) { + if (!canCommandBeOffloaded(c->slot, c->parsed_cmd)) { return 0; } diff --git a/src/networking.c b/src/networking.c index 140ab1fb3c..33c2621a0e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -836,7 +836,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { /* Postpone error updates if its io-thread */ if (!inMainThread()) { delayedErrorStatsUpdateCtx ctx = {.c = c, .s = sdsnewlen(s, len), .len = len, .flags = flags}; - threadAdddeferredJob(-1, afterErrorReplyDelayed, sizeof(ctx), &ctx); + threadAddDeferredJob(-1, afterErrorReplyDelayed, sizeof(ctx), &ctx); return; } diff --git a/src/server.c b/src/server.c index ec2ea74842..2a40ade05f 100644 --- a/src/server.c +++ b/src/server.c @@ -3954,7 +3954,6 @@ void call(client *c, int flags) { * It executes the command and writes the response back to the client. 
*/ void ioThreadCallCommand(void *data) { client *c = (client *)data; - serverAssert(c->cmd->flags & CMD_CAN_BE_OFFLOADED); const long long call_timer = ustime(); c->flag.executing_command = 1; setCurrentClient(c); diff --git a/src/server.h b/src/server.h index 86303d837b..1b68d6c224 100644 --- a/src/server.h +++ b/src/server.h @@ -252,7 +252,14 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CMD_ALLOW_BUSY ((1ULL << 26)) #define CMD_MODULE_GETCHANNELS (1ULL << 27) /* Use the modules getchannels interface. */ #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28) -#define CMD_CAN_BE_OFFLOADED (1ULL << 29) /* Command can be offloaded to worker IO threads. */ + +/* Nonzero when cmd may be offloaded: CMD_READONLY is set and none of CMD_NO_MANDATORY_KEYS, CMD_MAY_REPLICATE, CMD_BLOCKING is. Note: cmd is evaluated more than once, so pass a side-effect-free expression. */ +#define CMD_CAN_BE_OFFLOADED(cmd) \ + (((cmd)->flags & CMD_READONLY) && \ + !((cmd)->flags & CMD_NO_MANDATORY_KEYS) && \ + !((cmd)->flags & CMD_MAY_REPLICATE) && \ + !((cmd)->flags & CMD_BLOCKING)) + /* Command flags. Please don't forget to add command flag documentation in struct * serverCommand in this file. */ @@ -2476,10 +2483,6 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * CMD_TOUCHES_ARBITRARY_KEYS: The command may touch (and cause lazy-expire) * arbitrary key (i.e not provided in argv) * - * CMD_CAN_BE_OFFLOADED: The command can be safely offloaded to worker IO threads. - * Currently only simple read commands that don't have side effects - * are eligible for offloading. Commands with this flag should be - * idempotent and not modify any server state. * * The following additional flags are only used in order to put commands * in a specific ACL category. Commands can have multiple ACL categories.