diff --git a/00-RELEASENOTES b/00-RELEASENOTES
index 36317ca35..81ff184fe 100644
--- a/00-RELEASENOTES
+++ b/00-RELEASENOTES
@@ -1,85 +1,16 @@
-Redis 2.6 release notes
+Hello! This file is just a placeholder, since this is the "unstable" branch
+of Redis, the place where all the development happens.
-Migrating from 2.4 to 2.6
-=========================
+There are no release notes for this branch; it gets forked into another branch
+every time there is a partial feature freeze in order to eventually create
+a new stable release.
-Redis 2.4 is mostly a strict subset of 2.6. However there are a few things
-that you should be aware of:
+Usually "unstable" is stable enough for you to use it in development environments;
+however, you should never use it in production environments. It is possible
+to download the latest stable release here:
-* You can't use .rdb and AOF files generated with 2.6 into a 2.4 instance.
-* 2.6 slaves can be attached to 2.4 masters, but not the contrary, and only
- for the time needed to perform the version upgrade.
+ http://download.redis.io/releases/redis-stable.tar.gz
-There are also a few API differences, that are unlikely to cause problems,
-but it is better to keep them in mind:
+More information is available at http://redis.io
-* SORT now will refuse to sort in numerical mode elements that can't be parsed
- as numbers.
-* EXPIREs now all have millisecond resolution (but this is very unlikely to
- break code that was not conceived exploting the previous resolution error
- in some way.)
-* INFO output is a bit different now, and contains empty lines and comments
- starting with '#'. All the major clients should be already fixed to work
- with the new INFO format.
-
-Also the following redis.conf and CONFIG GET / SET parameters changed name:
-
- * hash-max-zipmap-entries, now replaced by hash-max-ziplist-entries
- * hash-max-zipmap-value, now replaced by hash-max-ziplist-value
- * glueoutputbuf was now completely removed as it does not make sense
-
----------
-CHANGELOG
----------
-
-What's new in Redis 2.6.0
-=========================
-
-UPGRADE URGENCY: We suggest new users to start with 2.6.0, and old users to
- upgrade after some testing of the application with the new
- Redis version.
-
-* Server side Lua scripting, see http://redis.io/commands/eval
-* Virtual Memory removed (was deprecated in 2.4)
-* Hardcoded limits about max number of clients removed.
-* AOF low level semantics is generally more sane, and especially when used
- in slaves.
-* Milliseconds resolution expires, also added new commands with milliseconds
- precision (PEXPIRE, PTTL, ...).
-* Clients max output buffer soft and hard limits. You can specifiy different
- limits for different classes of clients (normal,pubsub,slave).
-* AOF is now able to rewrite aggregate data types using variadic commands,
- often producing an AOF that is faster to save, load, and is smaller in size.
-* Every redis.conf directive is now accepted as a command line option for the
- redis-server binary, with the same name and number of arguments.
-* Hash table seed randomization for protection against collisions attacks.
-* Performances improved when writing large objects to Redis.
-* Significant parts of the core refactored or rewritten. New internal APIs
- and core changes allowed to develop Redis Cluster on top of the new code,
- however for 2.6 all the cluster code was removed, and will be released with
- Redis 3.0 when it is more complete and stable.
-* Redis ASCII art logo added at startup.
-* Crash report on memory violation or failed asserts improved significantly
- to make debugging of hard to catch bugs simpler.
-* redis-benchmark improvements: ability to run selected tests,
- CSV output, faster, better help.
-* redis-cli improvements: --eval for comfortable development of Lua scripts.
-* SHUTDOWN now supports two optional arguments: "SAVE" and "NOSAVE".
-* INFO output split into sections, the command is now able to just show
- pecific sections.
-* New statistics about how many time a command was called, and how much
- execution time it used (INFO commandstats).
-* More predictable SORT behavior in edge cases.
-* INCRBYFLOAT and HINCRBYFLOAT commands.
-
--------------------------------------------------------------------------------
-
-Credits: Where not specified the implementation and design are done by
-Salvatore Sanfilippo and Pieter Noordhuis. Thanks to VMware for making all
-this possible. Also many thanks to all the other contributors and the amazing
-community we have.
-
-See commit messages for more credits.
-
-Cheers,
-Salvatore
+Happy hacking!
diff --git a/COPYING b/COPYING
index c8665ba67..a58de44dd 100644
--- a/COPYING
+++ b/COPYING
@@ -1,4 +1,4 @@
-Copyright (c) 2006-2012, Salvatore Sanfilippo
+Copyright (c) 2006-2014, Salvatore Sanfilippo
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
diff --git a/Changelog b/Changelog
deleted file mode 100644
index f72746663..000000000
--- a/Changelog
+++ /dev/null
@@ -1,1032 +0,0 @@
-2010-07-01 gitignore modified (antirez)
-2010-06-22 redis.c split into many different C files. (antirez)
-2010-06-16 more pub/sub tests (Pieter Noordhuis)
-2010-06-15 initial basic pub/sub tests (Pieter Noordhuis)
-2010-06-15 fix BLPOP/BRPOP to use the wrapped function for list length (Pieter Noordhuis)
-2010-06-15 tests for BLPOP/BRPOP via an option in the tcl client that defers reading the reply (Pieter Noordhuis)
-2010-06-14 TODO updated (antirez)
-2010-06-14 Merge branch 'ltrim-tests' of git://github.com/pietern/redis (antirez)
-2010-06-14 rename "list" to "linkedlist" to be more verbose (Pieter Noordhuis)
-2010-06-14 allow running the test suite against an external Redis instance, without auto spawning (antirez)
-2010-06-14 change ltrim tests to cover all min/max cases and add stronger stresser (Pieter Noordhuis)
-2010-06-13 Fixed deps in makefile and mkreleasehdr.sh script to really take advantage of the new trick to avoid recompilation of redis.c on git sha1 or dirty status change (antirez)
-2010-06-13 hopefully faster recompiling with a trick (antirez)
-2010-06-13 fixed a bug in rdbLoadObject abount specially encoded objects (antirez)
-2010-06-13 use raw strings when loading a hash from the rdb into a zipmap (Pieter Noordhuis)
-2010-06-12 Merge branch 'expire' of git://github.com/pietern/redis (antirez)
-2010-06-11 Merge branch 'lists' of git://github.com/pietern/redis (antirez)
-2010-06-11 LPUSHX, RPUSHX, LINSERT only work on non-empty lists, so there are no clients waiting for a push (Pieter Noordhuis)
-2010-06-11 make LINSERT return -1 when the value could not be inserted (Pieter Noordhuis)
-2010-06-11 check if the list encoding needs to be changed on LPUSHX, RPUSHX, LINSERT (Pieter Noordhuis)
-2010-06-11 make sure the value to insert is string encoded (Pieter Noordhuis)
-2010-06-11 rename vars, move arguments, add comments (Pieter Noordhuis)
-2010-06-11 always iterate from head to tail on LINSERT (Pieter Noordhuis)
-2010-06-11 use REDIS_TAIL to insert AFTER an entry and REDIS_HEAD to insert BEFORE an entry (Pieter Noordhuis) -2010-06-11 move listTypeInsert to be grouped with other wrapper functions (Pieter Noordhuis) -2010-06-11 squashed merge from robey/twitter3: LINSERT BEFORE|AFTER, LPUSHX, RPUSHX (Pieter Noordhuis) -2010-06-09 remove pop function and the sds dependency; can be implemented using get+delete (Pieter Noordhuis) -2010-06-07 compute swappability for ziplist encoded lists (Pieter Noordhuis) -2010-06-07 reuse the sds from the main dictionary in the expiration dictionary (Pieter Noordhuis) -2010-06-07 TODO updated (antirez) -2010-06-07 encode integers while loading an hash (antirez) -2010-06-05 Merge branch 'lists' of git://github.com/pietern/redis (antirez) -2010-06-05 fixed two leaks for the dual encoded lists (Pieter Noordhuis) -2010-06-04 TODO updated (antirez) -2010-06-04 DISCSARD now unwatches all keys, as it should (antirez) -2010-06-04 generated tests for different encodings to avoid test code duplication (Pieter Noordhuis) -2010-06-04 refactor list tests to test both encodings; implemented assert functions (Pieter Noordhuis) -2010-06-04 renamed hash wrapper functions to match wrapper function naming convention: "Type" (Pieter Noordhuis) -2010-06-04 Merge branch 'lists' of git://github.com/pietern/redis (antirez) -2010-06-04 Merge branch 'smallkeys' (antirez) -2010-06-04 safety assert in listTypeNext (Pieter Noordhuis) -2010-06-04 renamed list wrapper functions to be more verbose (Pieter Noordhuis) -2010-06-04 add thresholds for converting a ziplist to a real list (Pieter Noordhuis) -2010-06-04 merge antirez/smallkeys (Pieter Noordhuis) -2010-06-03 test restored (antirez) -2010-06-03 memory leak introduced in the latest big changes fixed (antirez) -2010-06-03 Fixed VM bugs introduced with the top level keys as sds strings changes (antirez) -2010-06-03 top level keys are no longer redis objects but sds strings. 
There are still a few bugs to fix when VM is enabled (antirez) -2010-06-03 update Makefile to include ziplist.o (Pieter Noordhuis) -2010-06-03 use ziplists in SORT STORE until the thresholds are determined (Pieter Noordhuis) -2010-06-03 Merge branch 'testsuite' of git://github.com/pietern/redis (antirez) -2010-06-03 Merge branch 'testsuite' of git://github.com/pietern/redis into smallkeys (antirez) -2010-06-03 tag memory leak check on kill server as "leaks" (Pieter Noordhuis) -2010-06-03 tag test with sleep() as slow (Pieter Noordhuis) -2010-06-03 make sure the config it returned when called without code (Pieter Noordhuis) -2010-06-03 tag more slow tests (Pieter Noordhuis) -2010-06-03 change how arguments are passed from the AOF tests (Pieter Noordhuis) -2010-06-03 scope res variable outside test (Pieter Noordhuis) -2010-06-02 tags for existing tests (Pieter Noordhuis) -2010-06-02 pass tags to filter and match via arguments (Pieter Noordhuis) -2010-06-02 basic support to tag tests (Pieter Noordhuis) -2010-06-02 changed how server.tcl accepts options to support more directives without requiring more arguments to the proc (Pieter Noordhuis) -2010-06-02 removed obsolete code (Pieter Noordhuis) -2010-06-02 catch exceptions in the server proc, to be able to kill the entire chain of running servers (Pieter Noordhuis) -2010-06-02 Merge branch 'master' into smallkeys (antirez) -2010-06-02 smarter swapout policy on AOF too (antirez) -2010-06-02 better swapout policy while loading RDB file (antirez) -2010-06-02 minor code comment change (antirez) -2010-06-01 use integer types from stdint.h to be more verbose on the size in bytes of encoded elements. update list length to use 2 bytes instead of 1. (Pieter Noordhuis) -2010-06-01 added stress test for heavy i/o in ziplists (Pieter Noordhuis) -2010-06-01 fix signedness errors in ziplist testing code (Pieter Noordhuis) -2010-06-01 minor code movements and free object pull restored to 1 million (antirez) -2010-06-01 TODO updated with syslog plans for 2.2 (antirez) -2010-06-01 Debug message was printing stuff that are sometimes not initialized/valid (antirez) -2010-06-01 Merge branch 'smallkeys' of github.com:antirez/redis into smallkeys (antirez) -2010-06-01 fixed a few comments (antirez) -2010-06-01 fixed bugs introduced in the rewrite of the new VM engine (antirez) -2010-05-31 support rewriting the AOF with dual list encoding (Pieter Noordhuis) -2010-05-31 small refactor of fwrite* commands for AOF rewrite to allow writing a bulk long long (Pieter Noordhuis) -2010-05-31 use list wrapper functions in computing the dataset digest (Pieter Noordhuis) -2010-05-31 ziplistNext should work as expected when called with a pointer to ZIP_END (Pieter Noordhuis) -2010-05-31 update SORT to work with the dual list encoding (Pieter Noordhuis) -2010-05-31 function to create a new ziplist encoded list (Pieter Noordhuis) -2010-05-31 fixed missing incrRefCount (antirez) -2010-05-31 support rdb saving/loading with dual list encoding (Pieter Noordhuis) -2010-05-31 fixed signedness and disambiguate variable names (Pieter Noordhuis) -2010-05-31 added rdb save function to directly save long long values (Pieter Noordhuis) -2010-05-31 update RPOPLPUSH to support dual encoding (Pieter Noordhuis) -2010-05-31 update list iteration semantic to work as expected (i.e. 
"while(lNext(..))") (Pieter Noordhuis) -2010-05-31 ziplistDelete no longer needs a direction now ziplistPrev is fixed (Pieter Noordhuis) -2010-05-31 ziplistPrev should return the tail when the argument is ZIP_END (Pieter Noordhuis) -2010-05-31 first step of VM rewrite. blocking VM tests passing, more work needed in the async side (antirez) -2010-05-31 Merge branch 'no-appendfsync-on-rewrite' (antirez) -2010-05-30 fix LREM to remove *all* occurances when a zero argument is given (Pieter Noordhuis) -2010-05-30 fixed LINDEX to always return bulk response (Pieter Noordhuis) -2010-05-30 the tail offset must be an integer pointer to hold a 32-bit offset (Pieter Noordhuis) -2010-05-30 update LREM to support dual encoding via extra iteration primitives (Pieter Noordhuis) -2010-05-30 support dual encoding in LTRIM (Pieter Noordhuis) -2010-05-30 update LRANGE to use basic iteration code to support dual encoding (Pieter Noordhuis) -2010-05-30 inline support for dual encoding in the LINDEX and LSET commands (Pieter Noordhuis) -2010-05-30 generic pop and length function for ziplist encoding (Pieter Noordhuis) -2010-05-30 generic push function that supports the dual encoding (Pieter Noordhuis) -2010-05-30 change delete function to accept a direction argument, so "p" can be properly updated (Pieter Noordhuis) -2010-05-30 expose extra functionality from ziplist.c (Pieter Noordhuis) -2010-05-30 code style consistency fixes (Pieter Noordhuis) -2010-05-29 ziplistIndex now accepts negative indices (Pieter Noordhuis) -2010-05-29 fix compile warnings (Pieter Noordhuis) -2010-05-29 use simpler encoding for the length of the previous entry (Pieter Noordhuis) -2010-05-29 replace functions to get pointers to head and tail by macros (Pieter Noordhuis) -2010-05-29 function to insert an element at an arbitrary position in the list (Pieter Noordhuis) -2010-05-29 extract a generic delete function that can be used in pop and delete(range) (Pieter Noordhuis) -2010-05-29 use the entry struct in zipRawEntryLength (Pieter Noordhuis) -2010-05-29 rename argument names to s* to disambiguate from e* (Pieter Noordhuis) -2010-05-29 change ziplistRepr to use the entry struct (Pieter Noordhuis) -2010-05-29 modify compare function to check if the encoding is equal before comparing (Pieter Noordhuis) -2010-05-29 use a struct to retrieve all details for an entry (Pieter Noordhuis) -2010-05-29 initial implementation for making the ziplist doubly linked (Pieter Noordhuis) -2010-05-29 fix some warnings (Pieter Noordhuis) -2010-05-29 add function to retrieve ziplist size in bytes (Pieter Noordhuis) -2010-05-22 fix compare function of ziplist to only load integer from ziplist when it is encoded as integer (Pieter Noordhuis) -2010-05-22 add function to retrieve length of ziplist (Pieter Noordhuis) -2010-05-22 re-introduce ZIP_BIGLEN for clarity (Pieter Noordhuis) -2010-05-22 added header ziplist.h (Pieter Noordhuis) -2010-05-22 code to compare strings with entries in ziplist, regardless of their encoding (Pieter Noordhuis) -2010-05-22 updated iteration code to work well with different encodings (Pieter Noordhuis) -2010-05-22 move code from zip.c to ziplist.c (Pieter Noordhuis) -2010-05-22 partial revert of c80df5 because ziplist functions are starting to divert too much from zipmap functions (Pieter Noordhuis) -2010-05-22 initial work for integer encoding in ziplists (Pieter Noordhuis) -2010-05-22 move length housekeeping to a macro (Pieter Noordhuis) -2010-05-21 allow entries to be deleted in place when iterating over a ziplist (Pieter 
Noordhuis) -2010-05-21 allow pointer to be stored to current element when iterating over ziplist (Pieter Noordhuis) -2010-05-21 rename ziplistDelete to ziplistDeleteRange (Pieter Noordhuis) -2010-05-21 code to delete an inner range from the ziplist (Pieter Noordhuis) -2010-05-21 check if *value is non-NULL before setting it (Pieter Noordhuis) -2010-05-21 change iteration code to avoid allocating a new sds for each traversed entry (Pieter Noordhuis) -2010-05-21 code to iterate over a ziplist (Pieter Noordhuis) -2010-05-21 implementation for a ziplist with push and pop support (Pieter Noordhuis) -2010-05-21 extracted general methods to zip.c for reuse in other zip* structures (Pieter Noordhuis) -2010-05-28 command table size calculated with sizeof (antirez) -2010-05-28 use qsort and bsearch to lookup commands in O(log(N)) instead of O(N) (Pieter Noordhuis) -2010-05-28 Merge branch 'cli-stdin' of git://github.com/pietern/redis (antirez) -2010-05-28 Fixed ZINCR Nan bugs leading to server crash and added tests (antirez) -2010-05-28 redis.conf new features the new option, a minor typo preventing the compilation fixed (antirez) -2010-05-28 don't fsync after a rewrite if appendfsync is set to no. use aof_fsycn instead of fsync where appropriate (antirez) -2010-05-28 added new option no-appendfsync-on-rewrite to avoid blocking on fsync() in the main thread while a background process is doing big I/O (antirez) -2010-05-28 Added Git sha1 and dirty status in redis-server -v output (antirez) -2010-05-28 changed the message in the Makefile with the new command like to run the test suite (antirez) -2010-05-27 Fixed typo. (Vincent Palmer) -2010-05-27 new multi/exec tests (antirez) -2010-05-26 build command outside while loop (Pieter Noordhuis) -2010-05-26 require the flag "-c" to be used for redis-cli to read the last argument from stdin (Pieter Noordhuis) -2010-05-26 Merge branch 'master' into nested-multi (antirez) -2010-05-26 Fix EXEC bug that was leaving the client in dirty status when used with WATCH (antirez) -2010-05-26 raise error on nested MULTI and WATCH inside multi (antirez) -2010-05-25 allow regular sets to be passed to zunionstore/zinterstore (Pieter Noordhuis) -2010-05-25 Version is now 2.1.1 (antirez) -2010-05-25 RENAME is now WATCH-aware (antirez) -2010-05-25 TODO updated (antirez) -2010-05-25 WATCH is now able to detect keys removed by FLUSHALL and FLUSHDB (antirez) -2010-05-25 WATCH tests (antirez) -2010-05-25 minor bug fixed in WATCH (antirez) -2010-05-25 WATCH for MULTI/EXEC (CAS alike concurrency) (antirez) -2010-05-25 gitignore updated (antirez) -2010-05-21 Master is now already unfreezed, unstable, and ready to hacking sessions! 
(antirez) -2010-05-21 Merge branch 'solaris' of git://github.com/pietern/redis (antirez) -2010-05-21 Changelog updated (antirez) -2010-05-21 redis version is now 1.3.14 (aka 2.0.0 RC1) (antirez) -2010-05-21 html doc updated (antirez) -2010-05-21 by default test with valgrind does not show full leak info (antirez) -2010-05-21 minor fix for the skiplist code, resulting in a false positive with valgrind, and in general into a useless small allocation (antirez) -2010-05-21 Merge branch 'master' of git@github.com:antirez/redis (antirez) -2010-05-21 tests suite initial support for valgrind, fixed the old test suite until the new one is able to target a specific host/port (antirez) -2010-05-21 include solaris fixes in sha1.c (Pieter Noordhuis) -2010-05-20 Don't exit with error in tests temp file cleanup if there are no files to clean (antirez) -2010-05-20 fix memory leak on 32-bit builds (Pieter Noordhuis) -2010-05-20 Merge branch 'master' of github.com:antirez/redis (antirez) -2010-05-20 Fix for DEBUG DIGEST (antirez) -2010-05-20 Merge branch 'test_vm' of git://github.com/pietern/redis (antirez) -2010-05-20 code to enable running tests with the vm enabled (Pieter Noordhuis) -2010-05-20 minor change to shutdown (antirez) -2010-05-20 shutdown on SIGTERM (antirez) -2010-05-20 Merge http://github.com/ngmoco/redis (antirez) -2010-05-20 fix compile error on solaris (Pieter Noordhuis) -2010-05-20 added regression for zipmap bug (antirez) -2010-05-20 fix lookup of keys with length larger than ZIPMAP_BIGLEN (Pieter Noordhuis) -2010-05-19 TODO updated (antirez) -2010-05-19 initial tests for AOF (and small changes to server.tcl to support these) (Pieter Noordhuis) -2010-05-19 Merge branch 'master' into integration (Pieter Noordhuis) -2010-05-19 Fix for 'CONFIG SET appendonly no' (antirez) -2010-05-19 It's now possible to turn off and on the AOF via CONFIG (antirez) -2010-05-18 git hash 00000000 in reelase.h when git is not found enabled again after some shell scripting fix that is now compatible with most shells (antirez) -2010-05-18 build fixed when simpler shells are used to create release.h (antirez) -2010-05-18 use git diff when generating release.h to check for dirty status (antirez) -2010-05-18 Solaris fixes (antirez) -2010-05-18 html doc rebuild (antirez) -2010-05-18 buliding of release.h moved into an external script. Avoided recompialtion of redis.c if git sha1 is the same as the previous one (antirez) -2010-05-17 create release.h in make process and add this information to INFO listing (Pieter Noordhuis) -2010-05-16 Redis version is now 1.3.12 (antirez) -2010-05-16 redis version is now 1.3.11 (antirez) -2010-05-16 random refactoring and speedups (antirez) -2010-05-16 faster INCR with very little efforts... 
(antirez) -2010-05-15 print warnings in redis log when a test raises an exception (very likely to be caused by something like a failed assertion) (Pieter Noordhuis) -2010-05-15 Merge branch 'redis-cli-fix' of http://github.com/tizoc/redis (antirez) -2010-05-15 added pid info to the check memory leaks test, so that those tests don't appear to be duplicated (antirez) -2010-05-15 Merge branch 'integration' of git://github.com/pietern/redis (antirez) -2010-05-14 more endianess detection fix for SHA1 (antirez) -2010-05-14 fixed a warning seen with some GCC version under Linux (antirez) -2010-05-14 initial rough integration test for replication (Pieter Noordhuis) -2010-05-14 store entire server object on the stack instead of just the client (Pieter Noordhuis) -2010-05-14 proc to retrieve values from INFO properties (Pieter Noordhuis) -2010-05-14 one more fix for endianess detection (antirez) -2010-05-14 Fixed sha1.c compilation on Linux, due to endianess detection lameness (antirez) -2010-05-14 ZUNION,ZINTER -> ZUNIONSTORE,ZINTERSTORE (antirez) -2010-05-14 minor fixes to the new test suite, html doc updated (antirez) -2010-05-14 wait for redis-server to be settled and ready for connections (Pieter Noordhuis) -2010-05-14 fix cleaning up tmp folder (Pieter Noordhuis) -2010-05-14 update makefile to use the new test suite (Pieter Noordhuis) -2010-05-14 check for memory leaks before killing a server (Pieter Noordhuis) -2010-05-14 extract code to kill a server to a separate proc (Pieter Noordhuis) -2010-05-14 start servers on different ports to prevent conflicts (Pieter Noordhuis) -2010-05-14 use DEBUG DIGEST in new test suite (Pieter Noordhuis) -2010-05-14 split test suite into multiple files; runs redis-server in isolation (Pieter Noordhuis) -2010-05-14 use DEBUG DIGEST in the test instead of a function that was doing a similar work, but in a much slower and buggy way (antirez) -2010-05-14 Don't rely on cliReadReply being able to return on shutdown (Bruno Deferrari) -2010-05-14 If command is a shutdown, ignore errors on reply (Bruno Deferrari) -2010-05-14 DEBUG DIGEST implemented, in order to improve the ability to test persistence and replication consistency (antirez) -2010-05-13 Add SIGTERM shutdown handling. (Ashley Martens) -2010-05-13 makefile deps updated (antirez) -2010-05-13 conflicts resolved (antirez) -2010-05-13 feed SETEX as SET and EXPIREAT to AOF (Pieter Noordhuis) -2010-05-13 very strong speedup in saving time performance when there are many integers in the dataset. Instead of decoding the object before to pass them to the rdbSaveObject layer we check asap if the object is integer encoded and can be written on disk as an integer. (antirez) -2010-05-13 include limits.h otherwise no double precison macros (antirez) -2010-05-13 explicitly checks with ifdefs if our floating point and long long assumptions are verified (antirez) -2010-05-13 Yet another version of the double saving code, with comments explaining what's happening there (antirez) -2010-05-12 added overflow check in the double -> long long conversion trick to avoid integer overflows. I think this was not needed in practical terms, but it is safer (antirez) -2010-05-12 use withscores when performing the dataset digest (antirez) -2010-05-12 If a float can be casted to a long long without rounding loss, we can use the integer conversion function to write the score on disk. 
This is a seriuous speedup (antirez) -2010-05-12 fixed compilation warnings in the AOF sanity check tool (antirez) -2010-05-12 Merge branch 'vm-speedup' (antirez) -2010-05-11 fix to return error when calling INCR on a non-string type (Pieter Noordhuis) -2010-05-11 load objects encoded from disk directly without useless conversion (antirez) -2010-05-11 fixed a problem leading to crashes, as keys can't be currently specially encoded, so we can't encode integers at object loading time... For now this can be fixed passing a few flags, or later can be fixed allowing encoded keys as well (antirez) -2010-05-11 long long to string conversion speedup applied in other places as well. Still the code has bugs, fixing right now... (antirez) -2010-05-11 hand written code to turn a long long into a string -> very big speed win (antirez) -2010-05-11 added specialized function to compare string objects for perfect match that is optimized for this task (antirez) -2010-05-11 better use of encoding inforamtion in dictEncObjKeyCompare (antirez) -2010-05-10 CONFIG now can change appendfsync policy at run time (antirez) -2010-05-10 CONFIG command now supports hot modification of RDB saving parameters. (antirez) -2010-05-10 while loading the rdb file don't add the key to the dictionary at all if it's already expired, instead of removing it just after the insertion. (antirez) -2010-05-10 Merge branch 'check-aof' of git://github.com/pietern/redis (antirez) -2010-05-08 minor changes to improve code readability (antirez) -2010-05-08 swap objects out directly while loading an RDB file if we detect we can't stay in the vm max memory limits anyway (antirez) -2010-05-07 change command names no longer used to zunion/zinter (Pieter Noordhuis) -2010-05-07 DEBUG POPULATE command for fast creation of test databases (antirez) -2010-05-07 update TODO (Pieter Noordhuis) -2010-05-07 swap arguments in blockClientOnSwappedKeys to be consistent (Pieter Noordhuis) -2010-05-07 added function that preloads all keys needed to execute a MULTI/EXEC block (Pieter Noordhuis) -2010-05-07 add sanity check to zunionInterBlockClientOnSwappedKeys, as the number of keys used is provided as argument to the function (Pieter Noordhuis) -2010-05-07 make prototype of custom function to preload keys from the vm match the prototype of waitForMultipleSwappedKeys (Pieter Noordhuis) -2010-05-07 extract preloading of multiple keys according to the command prototype to a separate function (Pieter Noordhuis) -2010-05-07 make append only filename configurable (Pieter Noordhuis) -2010-05-07 don't load value from VM for EXISTS (Pieter Noordhuis) -2010-05-07 swap file name pid expansion removed. Not suited for mission critical software... (antirez) -2010-05-07 Swap file is now locked (antirez) -2010-05-06 Merge branch 'master' into aof-speedup (antirez) -2010-05-06 log error and quit when the AOF contains an unfinished MULTI (antirez) -2010-05-06 log error and quit when the AOF contains an unfinished MULTI (Pieter Noordhuis) -2010-05-06 Merge branch 'master' into check-aof (Pieter Noordhuis) -2010-05-06 hincrby should report an error when called against a hash key that doesn't contain an integer (Pieter Noordhuis) -2010-05-06 AOF writes are now accumulated into a buffer and flushed into disk just before re-entering the event loop. A lot less writes but still this guarantees that AOF is written before the client gets a positive reply about a write operation, as no reply is trasnmitted before re-entering into the event loop. 
(antirez) -2010-05-06 clarified a few messages in redis.conf (antirez) -2010-05-05 ask for confirmation before AOF is truncated (Pieter Noordhuis) -2010-05-05 str can be free'd outside readString (Pieter Noordhuis) -2010-05-05 moved argument parsing around (Pieter Noordhuis) -2010-05-05 ignore redis-check-aof binary (Pieter Noordhuis) -2010-05-05 allow AOF to be fixed by truncating to the portion of the file that is valid (Pieter Noordhuis) -2010-05-05 tool to check if AOF is valid (Pieter Noordhuis) -2010-05-02 included fmacros.h in linenose.c to avoid compilation warnings on Linux (antirez) -2010-05-02 compilation fix for mac os x (antirez) -2010-05-02 Merge branch 'master' of git@github.com:antirez/redis (antirez) -2010-05-02 On Linux now fdatasync() is used insetad of fsync() in order to flush the AOF file kernel buffers (antirez) -2010-04-30 More tests for APPEND and tests for SUBSTR (antirez) -2010-04-30 linenoise.c updated, now redis-cli can be used in a pipe (antirez) -2010-04-29 redis-cli minor fix (less segfault is better) (antirez) -2010-04-29 New MONITOR output format with timestamp, every command in a single line, string representations (antirez) -2010-04-29 redis-cli INFO output format is now raw again (antirez) -2010-04-29 Added more information about slave election in Redis Cluster alternative doc (antirez) -2010-04-29 Redis cluster version 2 (antirez) -2010-04-27 Fixed a redis-cli bug, was using free instead of zfree call (antirez) -2010-04-27 AOF is now rewritten on slave after SYNC with master. Thanks to @_km for finding this bug and any others' (antirez) -2010-04-27 redis-cli is now using only the new protocol (antirez) -2010-04-27 Minimal support for subscribe/psubscribe in redis-cli (antirez) -2010-04-26 don't output the newline when stdout is not a tty (antirez) -2010-04-26 redis-cli now is able to also output the string representation instead of the raw string. Much better for debugging (antirez) -2010-04-26 Initial support for quoted strings in redis-cli (antirez) -2010-04-23 SETEX implemented (antirez) -2010-04-23 Pub/Sub API change: now messages received via pattern matching have a different message type and an additional field representing the original pattern the message matched (antirez) -2010-04-22 typo fixed, reloaded (antirez) -2010-04-22 typo fixed (antirez) -2010-04-22 REDIS-CLUSTER doc updated (antirez) -2010-04-22 Virtual memory design document removed, no longer needed as we have a full specification and implementation (antirez) -2010-04-22 new units for bytes specification (antirez) -2010-04-22 Now in redis.conf it is possible to specify units where appropriate instead of amounts of bytes, like 2Gi or 4M and so forth (antirez) -2010-04-21 binary safe keys ready implementation of RANDOMKEYS (antirez) -2010-04-21 Now that's the right 1.3.10 (antirez) -2010-04-21 Revert "fsync always now uses O_DIRECT on Linux" (antirez) -2010-04-21 Revert "define __USE_GNU to get O_DIRECT" (antirez) -2010-04-21 Merge branch 'master' of github.com:antirez/redis (antirez) -2010-04-21 Revert "version 1.3.10" (antirez) -2010-04-21 version 1.3.10 (antirez) -2010-04-20 define __USE_GNU to get O_DIRECT (antirez) -2010-04-20 fsync always now uses O_DIRECT on Linux (antirez) -2010-04-20 More precise memory used guesswork in zmalloc.c (antirez) -2010-04-19 Fix for MULTI/EXEC and Replication/AOF: now the block is correctly sent as MULTI/..writing operations../EXEC. 
Ok for slaves but more work needed for the AOF as it should be a write-all-or-nothing business (antirez) -2010-04-19 running the test using tcl8.5 directly instead of tclsh that too often it's a symlink to 8.4 (antirez) -2010-04-19 Added package require Tcl 8.5 in redis.tcl so it will show a clear error when the test suit is attempted to run under 8.4 (antirez) -2010-04-18 Fix for a SORT bug introduced with commit 16fa22f1, regression test added (antirez) -2010-04-18 Guru mediation -> meditation (antirez) -2010-04-16 check eptr inline (Pieter Noordhuis) -2010-04-16 refactor code that retrieves value from object or replies to client (Pieter Noordhuis) -2010-04-17 Merge branch 'hash' of git://github.com/pietern/redis (antirez) -2010-04-17 redisAssert(0) => redisPanic("something meaningful") (antirez) -2010-04-17 make sure that the resulting value in hincrby is encoded when possible (Pieter Noordhuis) -2010-04-17 increment dirty counter after hmset (Pieter Noordhuis) -2010-04-17 strip tryObjectEncoding from hashSet, to enable the arguments being encoded in-place (Pieter Noordhuis) -2010-04-17 Added support for Guru Mediation, and raising a guru mediation if refCount <= 0 but decrRefCount is called against such an object (antirez) -2010-04-16 fix small error and memory leaks in SORT (Pieter Noordhuis) -2010-04-16 SORT/GET test added (antirez) -2010-04-16 Added tests for GET/BY against hashes fields (antirez) -2010-04-16 Merge branch 'hash-refactor' of git://github.com/pietern/redis (antirez) -2010-04-16 check object type in lookupKeyByPattern (Pieter Noordhuis) -2010-04-16 make sortCommand aware that lookupKeyByPattern always increased the refcount of the returned value (Pieter Noordhuis) -2010-04-16 revert 0c390a to stop using tricks with o->refcount (Pieter Noordhuis) -2010-04-16 store the hash iterator on the heap instead of the stack (Pieter Noordhuis) -2010-04-16 drop inline directive (Pieter Noordhuis) -2010-04-16 rename hashReplace to hashSet (Pieter Noordhuis) -2010-04-16 added dictFetchValue() to dict.c to make hash table API a bit less verbose in the common cases (antirez) -2010-04-03 Don't set expire to keys with ttl=0, remove them immediately. (antirez) -2010-04-15 make sure that cmpobj is in decoded form when sorting by ALPHA (this solves edge case from previous commit where (!sortby && alpha) == 1) (Pieter Noordhuis) -2010-04-15 enable hash dereference in SORT on BY and GET (Pieter Noordhuis) -2010-04-15 use shared replies for hset (Pieter Noordhuis) -2010-04-15 set refcount of string objects retrieved from zipmaps to 0, so we don't have to touch the refcount of the objects inside dicts (Pieter Noordhuis) -2010-04-15 added HSETNX (Pieter Noordhuis) -2010-04-14 refactor of hash commands to use specialized api that abstracts zipmap and dict apis (Pieter Noordhuis) -2010-04-13 move retrieval of long up to prevent an empty hash from being created (Pieter Noordhuis) -2010-04-15 more advanced leaks detection in test redis (antirez) -2010-04-15 ability to select port/host from make test (antirez) -2010-04-15 Active rehashing (antirez) -2010-04-15 Incrementally rehahsing hash table! Thanks to Derek Collison and Pieter Noordhuis for feedbacks/help (antirez) -2010-04-14 Does not allow commands other than Pub/Sub commands when there is at least one pattern (antirez) -2010-04-13 Fixed a tiny memory leak when loading the configuration file. (Alex McHale) -2010-04-13 Merge branch 'hmget' of git://github.com/pietern/redis (antirez) -2010-03-29 Validate numeric inputs. 
(Alex McHale) -2010-03-24 Remove trailing whitespace. (Alex McHale) -2010-04-12 Now all the commands returning a multi bulk reply against non existing keys will return an empty multi bulk, not a nil one (antirez) -2010-04-12 implemented HMGET (Pieter Noordhuis) -2010-04-12 implemented HMSET (Pieter Noordhuis) -2010-04-12 Sharing of small integer objects: may save a lot of memory with datasets having many of this (antirez) -2010-04-10 dict.c fixed to play well with enabling/disabling of the hash table (antirez) -2010-04-09 removed a no longer true assert in the VM code (antirez) -2010-04-09 shareobjects feautres killed - no gains most of the time, but VM complexities (antirez) -2010-04-09 use directly the real key object in VM I/O jobs to match by pointer, and to handle different keys with the same name living in different DBs, but being at the same moment in the IO job queues (antirez) -2010-04-08 last change reverted as it was unstable... more testing needed (antirez) -2010-04-08 Prevent hash table resize while there are active child processes in order to play well with copy on write (antirez) -2010-04-08 Merge branch 'issue_218' of git://github.com/pietern/redis (antirez) -2010-04-08 -1 not needed... (antirez) -2010-04-08 Skiplist theoretical fix (antirez) -2010-04-07 Now when a child is terminated by a signal, the signal number is logged as well (antirez) -2010-04-07 First version of evented Redis Tcl client, that will be used for BLPOP and Pub/Sub tests (antirez) -2010-04-05 use long long reply type for HINCRBY (Pieter Noordhuis) -2010-04-05 last argument is never encoded for HINCRBY (Pieter Noordhuis) -2010-04-02 Now PUBLISH commands are replicated to slaves (antirez) -2010-04-01 use the right object when cleaning up after zunion/zinter (fixes issue 216) (Pieter Noordhuis) -2010-04-01 Merge branch 'zipmap' of git://github.com/pietern/redis (antirez) -2010-04-01 reduce code complexity because zipmapLen now is O(1) (Pieter Noordhuis) -2010-04-01 update the zipmap entry in-place instead of appending it (Pieter Noordhuis) -2010-04-01 updated zipmap documentation to match the implementation (Pieter Noordhuis) -2010-04-01 allow 4 free trailing bytes for each value (Pieter Noordhuis) -2010-04-01 Pub/Sub pattern matching capabilities (antirez) -2010-04-01 use function to determine length of a single entry (Pieter Noordhuis) -2010-03-31 Deny EXEC under out of memory (antirez) -2010-03-29 No timeouts nor other commands for clients in a Pub/Sub context (antirez) -2010-03-29 free hash table entries about no longer active classes, so that PUBSUB can be abused with millions of different classes (antirez) -2010-03-29 Fixed a refcount stuff leading to PUBSUB crashes (antirez) -2010-03-29 fmacros added to linenoise, avoiding all the nice warnings... 
(antirez) -2010-03-29 First pubsub fix (antirez) -2010-03-29 PUBSUB implemented (antirez) -2010-03-29 Redis version is now 1.3.8 (antirez) -2010-03-28 removed references in code to ZIPMAP_EMPTY (Pieter Noordhuis) -2010-03-28 use first byte of zipmap to store length (Pieter Noordhuis) -2010-03-28 implemented strategy that doesn't use free blocks in zipmaps (Pieter Noordhuis) -2010-03-26 Merge branch 'hincrby' of git://github.com/pietern/redis (antirez) -2010-03-26 removed unnecessary refcount increase that caused the HINCRBY memleak (Pieter Noordhuis) -2010-03-26 implements HINCRBY and tests (todo: find and fix small memleak) (Pieter Noordhuis) -2010-03-26 Removed a useless if spotted by Pieter Noordhuis (antirez) -2010-03-26 Fixed a critical replication bug: binary values issued with the multi bulk protocol caused a protocol desync with slaves. (antirez) -2010-03-24 Fixed the reply about denied write commands under maxmemory reached condition: now the error will no longer lead to a client-server protocol desync (antirez) -2010-03-24 CONFIG command implemened -- just a start but already useful (antirez) -2010-03-24 redis-cli prompt is now redis> (antirez) -2010-03-23 with --help states that you can use - as config file name to feed config via stdin (antirez) -2010-03-23 New INFO field: expired_keys (antirez) -2010-03-23 the Cron timer function is now called 10 times per second instead of 1 time per second to make Redis more responsibe to BGSAVE and to delete expired keys more incrementally (antirez) -2010-03-23 Use linenoise for line editing on redis-cli. (Michel Martens) -2010-03-23 Fix authentication for redis-cli on non-interactive mode. (Michel Martens) -2010-03-23 key deletion on empty value fix + some refactoring (antirez) -2010-03-23 Empty value trigger key removal in all the operations (antirez) -2010-03-22 Merged gnrfan patches fixing issues 191, 193, 194 (antirez) -2010-03-22 Merge branch 'issue_193' of git://github.com/gnrfan/redis (antirez) -2010-03-22 Merge branch 'issue_191' of git://github.com/gnrfan/redis (antirez) -2010-03-22 Redis master version is now 1.3.7 (antirez) -2010-03-19 support for include directive in config parser (Jeremy Zawodny) -2010-03-19 Removed a stupid overriding of config values due to a wrong cut&paste (antirez) -2010-03-19 VM hash type swappability implemented. Handling of failed pthread_create() call. (antirez) -2010-03-19 Solving issue #191 on Google Code: -v and --version should print the version of Redis (Antonio Ognio) -2010-03-19 Solves issue #194 on Google Code: --help parameter to redis-srver prints the usage message (Antonio Ognio) -2010-03-19 Fixing issue 193 (Antonio Ognio) -2010-03-18 increment server.dirty on HDEL (antirez) -2010-03-18 Redis 1.3.6 (antirez) -2010-03-18 test-redis.tcl dataset digest function Hash support (antirez) -2010-03-18 zipmap fix for large values (antirez) -2010-03-18 Optimization fixed and re-activated (antirez) -2010-03-18 reverted an optimization that makes Redis not stable (antirez) -2010-03-18 Fixed redis-cli auth code (antirez) -2010-03-17 HDEL fix, an optimization for comparison of objects in hash table lookups when they are integer encoding (antirez) -2010-03-17 Version is now 1.3.5 (antirez) -2010-03-17 Merged Pietern patch for VM key args helper function. 
Fixed an obvious bug in the redis-cli passwd auth stuff (antirez) -2010-03-17 Merge branch 'aggregates' of git://github.com/pietern/redis (antirez) -2010-03-17 Added Authentication to redis-cli.c using -a switch Update usage fixed Makefile to delete redis-check-dump during make clean (root) -2010-03-17 HEXISTS and tests implemented (antirez) -2010-03-17 More hash tests (antirez) -2010-03-17 better HSET test (antirez) -2010-03-17 Fixed a bug in HSET, a memory leak, and a theoretical bug in dict.c (antirez) -2010-03-17 More Hash tests (antirez) -2010-03-13 added preloading keys from VM when using ZINTER or ZUNION (Pieter Noordhuis) -2010-03-13 added explicit AGGREGATE [SUM|MIN|MAX] option to ZUNION/ZINTER (Pieter Noordhuis) -2010-03-16 HGET fix for integer encoded field against zipmap encoded hash (antirez) -2010-03-16 zrevrank support in redis-cli (antirez) -2010-03-16 HKEYS / HVALS / HGETALL (antirez) -2010-03-16 Solved a memory leak with Hashes (antirez) -2010-03-15 pretty big refactoring (antirez) -2010-03-15 An interesting refactoring + more expressive internal API (antirez) -2010-03-15 Fixed the same problem in ZREVRANK (antirez) -2010-03-15 Fixed a ZRANK bug (antirez) -2010-03-15 zipmap to hash conversion in HSET (antirez) -2010-03-14 max zipmap entries and max zipmap value parameters added into INFO output (antirez) -2010-03-14 HDEL and some improvement in DEBUG OBJECT command (antirez) -2010-03-14 Append only file support for hashes (antirez) -2010-03-13 utility to check rdb files for unprocessable opcodes (Pieter Noordhuis) -2010-03-12 A minor fix and a few debug messages removed (antirez) -2010-03-12 Applied the replication bug patch provided by Jeremy Zawodny, removing temp file collision after the slave got the dump.rdb file in the SYNC stage (antirez) -2010-03-11 Fix for HGET against non Hash type, debug messages used to understand a bit better a corrupted rdb file (antirez) -2010-03-09 fix: use zmalloc instead of malloc (Pieter Noordhuis) -2010-03-09 Merged zsetops branch from Pietern (antirez) -2010-03-09 Merged ZREMBYRANK from Pietern (antirez) -2010-03-09 Merged ZREVRANK from Pietern (antirez) -2010-03-09 use a struct to store both a dict and its weight for ZUNION and ZINTER, so qsort can be applied (Pieter Noordhuis) -2010-03-09 Hash auto conversion from zipmap to hash table, type fixed for hashes, hash loading from disk (antirez) -2010-03-09 replaced ZMERGE by ZUNION and ZINTER. note: key preloading by the VM does not yet work (Pieter Noordhuis) -2010-03-08 Hashes saving / fixes (antirez) -2010-03-08 use ZMERGE as starting point (Pieter Noordhuis) -2010-03-07 HSET fixes, now the new pointer is stored back in the object pointer field (antirez) -2010-03-07 added ZREVRANK (Pieter Noordhuis) -2010-03-06 Fix for replicaiton with over 2GB dump file initial SYNC stage (antirez) -2010-03-06 first implementation of HSET/HSET. More work needed (antirez) -2010-03-05 zipmaps functions to get, iterate, test for existence. 
Initial works for Hash data type (antirez) -2010-03-04 redis-benchmark now implements Set commands benchmarks (antirez) -2010-03-04 zipmap iteration code (antirez) -2010-03-04 moved code to delete a single node from a zset to a separate function (Pieter Noordhuis) -2010-03-04 rename zslDeleteRange to zslDeleteRangeByScore (to differentiate between deleting using score or rank) (Pieter Noordhuis) -2010-03-04 use 1-based rank across zsl*Rank functions consistently (Pieter Noordhuis) -2010-03-04 implemented ZREMBYRANK (Pieter Noordhuis) -2010-03-04 A fix for initialization of augmented skip lists (antirez) -2010-03-04 A fix for an invalid access when VM is disabled (antirez) -2010-03-04 Merge branch 'zsl-get-rank' of git://github.com/pietern/redis (antirez) -2010-03-04 redis-cli now runs in interactive mode if no command is provided (antirez) -2010-03-04 merged memory reduction patch (Pieter Noordhuis) -2010-03-04 Now list push commands return the length of the new list, thanks to Gustavo Picon (antirez) -2010-03-04 first check if starting point is trivial (head or tail) before applying log(N) search (Pieter Noordhuis) -2010-03-04 use rank to find starting point for ZRANGE and ZREVRANGE (Pieter Noordhuis) -2010-03-04 lookup rank of a zset entry in a different function (Pieter Noordhuis) -2010-03-04 SUBSTR fix for integer encoded vals (antirez) -2010-03-04 fix ZRANK (realize that rank is 1-based due to the skip list header) (Pieter Noordhuis) -2010-03-03 initial implementation of SUBSTR (antirez) -2010-03-03 TODO updated (antirez) -2010-03-03 fpurge call removed from redis-cli (antirez) -2010-03-03 ZRANK stress tester (antirez) -2010-03-03 use less memory as element->span[0] will always be 1; any level 0 skip list is essentially a linked list (Pieter Noordhuis) -2010-03-03 rank is very unlikely to overflow integer range (Pieter Noordhuis) -2010-03-03 x->backward never equals zsl->header (Pieter Noordhuis) -2010-03-03 initial implementation for augmented zsets and the zrank command (Pieter Noordhuis) -2010-03-03 zipampDel() implemented (antirez) -2010-03-03 added quit and exit commands to redis-cli in order to quit the interactive mode (antirez) -2010-03-03 Merge remote branch 'djanowski/interactive' (antirez) -2010-03-02 Add support for MULTI/EXEC. (Damian Janowski & Michel Martens) -2010-03-02 Remove trailing newline in interactive mode. (Damian Janowski & Michel Martens) -2010-03-02 minor fix for a Linux warning (antirez) -2010-03-02 Add interactive mode to redis-cli. (Michel Martens & Damian Janowski) -2010-03-02 Better to increment the version minor number when a VM bug is fixed... it will be simpler to understand what's going on when users will report problems with the INFO trace. (antirez) -2010-03-02 Fixed a subtle VM bug... was not flushing the buffer so the child process read truncated data (antirez) -2010-03-01 KEYS now returns a multi bulk reply (antirez) -2010-02-27 Add DISCARD command to discard queued MULTI commands. 
(antirez) -2010-03-01 Swappability bug due to a typo fixed thanks to code review by Felix Geisendörfer @felixge (antirez) -2010-02-28 minor fixes for zipmap.c (antirez) -2010-02-27 first zipmap fix of a long sequence in the days to come ;) (antirez) -2010-02-27 initial zipmap.c implementation (antirez) -2010-02-27 Bug #169 fixed (BLOP/BRPOP interrupted connections are not cleared from the queue) (antirez) -2010-02-22 Fixed 32bit make target to work on Linux out of the box (antirez) -2010-02-19 A problem with replication with multiple slaves connectiong to a single master fixed. It was due to a typo, and reported on github by the user micmac. Also the copyright year fixed from many files. (antirez) -2010-02-10 Saner VM defaults for redis.conf (antirez) -2010-02-09 VM now is able to block clients on swapped keys for all the commands (antirez) -2010-02-07 ZCOUNT and ZRANGEBYSCORE new tests (antirez) -2010-02-07 ZRANGEBYSCORE now supports open intervals, prefixing double values with a open paren. Added ZCOUNT that can count the elements inside an interval of scores, this supports open intervals too (antirez) -2010-02-07 WITHSCORES in ZRANGEBYSCORE thanks to Sam Hendley (antirez) -2010-02-06 Added "withscores" option to zrangebyscore command. Based on withscores support in zrange function, ugliest part was the argument parsing to handle using it with the limit option. (Sam Hendley) -2010-02-06 DEBUG OBJECT provide info about serialized object length even when VM is disabled (antirez) -2010-02-06 multi bulk requests in redis-benchmark, default fsync policy changed to everysec, added a prefix character for DEBUG logs (antirez) -2010-02-04 APPEND tests (antirez) -2010-02-04 APPEND command (antirez) -2010-02-02 Faster version of the function hashing possibly encoded objects, leading to a general speed gain when working with Sets of integers (antirez) -2010-02-02 faster Set loading time from .rdb file resizing the hash table to the right size before loading elements (antirez) -2010-02-02 Log time taken to load the DB at startup, in seconds (antirez) -2010-01-31 Fixed VM corruption due to child fclosing the VM file directly or indirectly calling exit(), now replaced with _exit() in all the sensible places. Masked a few signals from IO threads. (antirez) -2010-01-28 loading side of the threaded VM (antirez) -2010-01-26 TODO cahnges (antirez) -2010-01-23 Fixed memory human style memory reporting, removed server.usedmemory, now zmalloc_used_memory() is used always. (antirez) -2010-01-22 VM tuning thanks to redis-stat vmstat. Now it performs much better under high load (antirez) -2010-01-21 Changelog updated (antirez) -2010-01-21 REDIS_MAX_COMPLETED_JOBS_PROCESSED is now in percentage, not number of jobs. Moved a debugging message a few lines forward as it was called where a few logged parameters where invalid, leading to a crash (antirez) -2010-01-20 fixed a deadlock caused by too much finished processes in queue so that I/O clients writing to the wirte side of the pipe used to awake the main thread where blocking. Then a BGSAVE started waiting for the last active thread to finish, condition impossible because all the I/O threads where blocking on threads. Takes this as a note to myself... (antirez) -2010-01-20 ae.c event loop does no longer support exception notifications, as they are fully pointless. Also a theoretical bug that never happens in practice fixed. (antirez) -2010-01-19 commercial tools stuff removed from the Redis makefile. 
cotools are now migrated into a different repos (antirez) -2010-01-19 removed a bug in the function to cancel an I/O job (antirez) -2010-01-17 static symbols update (antirez) -2010-01-16 removed support for REDIS_HELGRIND_FRIENDLY since Helgrind 3.5.0 is friendly enough even with many threads created and destroyed (antirez) -2010-01-15 now redis-cli understands -h (antirez) -2010-01-15 Create swap file only if not exists (antirez) -2010-01-15 I hate warnings (antirez) -2010-01-15 fixed a minor memory leak in configuration file parsing (antirez) -2010-01-15 minor fix (antirez) -2010-01-15 support for named VM swap file. Fixed a few important interaction issues between the background saving processes and IO threads (antirez) -2010-01-15 fix for the just added new test (antirez) -2010-01-15 useless debugging messages removed (antirez) -2010-01-15 new test added (antirez) -2010-01-15 thread safe zmalloc used memory counter (antirez) -2010-01-15 A define to make Redis more helgrind friendly (antirez) -2010-01-15 removed a few races from threaded VM (antirez) -2010-01-14 Fixed a never experienced, theoretical bug that can actually happen in practice. Basically when a thread is working on a I/O Job we need to wait it to finish before to cancel the Job in vmCancelThreadedIOJob(), otherwise the thread may mess with an object that is being manipulated by the main thread as well. (antirez) -2010-01-14 Set the new threads stack size to a LZF friendly amount (antirez) -2010-01-13 access to already freed job structure fixed by statements reoredering (antirez) -2010-01-13 removed a useless debugging message (antirez) -2010-01-13 Wait zero active threads condition before to fork() for BGSAVE or BGREWRITEAOF (antirez) -2010-01-13 list API is now thread safe (antirez) -2010-01-13 minor TODO and debugging info changes (antirez) -2010-01-12 support for blocking VM in config file (antirez) -2010-01-12 more non blocking VM changes (antirez) -2010-01-12 fix for test #11 (antirez) -2010-01-12 a few more stuff in INFO about VM. Test #11 changed a bit in order to be less lame (antirez) -2010-01-12 Added a define to configure how many completed IO jobs the handler should process at every call. (antirez) -2010-01-11 Fixed a bug in the IO Job canceling funtion (antirez) -2010-01-11 more steps towards a working non blocking VM (antirez) -2010-01-11 converted random printfs in debug logs (antirez) -2010-01-11 removed a bug introduced with non blocking VM (antirez) -2010-01-11 a few non blocking VM bugs fixed (antirez) -2010-01-11 More work on non-blocking VM. Should work in a few days (antirez) -2010-01-11 More threaded I/O VM work + Redis init script (antirez) -2010-01-10 more work on VM threaded I/O. Still nothing of usable (antirez) -2010-01-09 non-blocking VM data structures, just a start (antirez) -2010-01-08 used_memory_human added to INFO output. Human readable amount of memory used. 
(antirez) -2010-01-07 Now DEBUG OBJECT plays well with swapped out objects (antirez) -2010-01-07 fflush VM swap file after object swapping (antirez) -2010-01-07 added the fmacros to enable support for fseeko() lseeko() with 64bit off_t (antirez) -2010-01-07 VM now swaps objects out while loading datasets not fitting into vm-max-memory bytes of RAM (antirez) -2010-01-07 added process id information in INFO (antirez) -2010-01-06 vm-enabled set to no by default in redis.conf (antirez) -2010-01-06 a new default redis.conf (antirez) -2010-01-06 VM stats in INFO command (antirez) -2010-01-06 Introduced a new log verbosity level, so now DEBUG is really for debugging. Refactored a bit maxmemory. When virtual memory is short in RAM free the objects freelist as well as swapping things out. (antirez) -2010-01-05 fixed a bug in bgsave when VM is off but still it was testing for obj->storage field (antirez) -2010-01-05 converted a few calls to assert() => redisAssert() to print stack trace (antirez) -2010-01-05 BGREWRITEAOF now works with swapping on (antirez) -2010-01-05 A first fix for SET key overwrite (antirez) -2010-01-05 SAVE now works with VM (antirez) -2010-01-05 swapping algorithm a bit more aggressive under low memory (antirez) -2010-01-05 basic VM mostly working! (antirez) -2010-01-05 New object field (one of the unused bytes) to hold the type of the swapped out value object in key objects (antirez) -2010-01-05 VM internals bugfixes, set 1 (antirez) -2010-01-05 load key from swap on key lookup (antirez) -2010-01-05 more object-level VM primitives (antirez) -2010-01-05 Redis objects swapping / loading (antirez) -2010-01-05 rdbLoadObject() as a separated function to load objects from disk. Dropped support for RDB version 0, I guess no longer has this legacy DBs around (antirez) -2010-01-04 VM low level pages handling (antirez) -2010-01-04 vm swap file creation, and some basic configuration (antirez) -2010-01-04 version marked 1.3.2 (antirez) -2010-01-04 saving code refactored a bit, added a function returning the number of bytes an object will use on disk (antirez) -2010-01-02 Now the PUSH side of RPOPLPUSH is able to unblock clients blocked on BLPOP (antirez) -2010-01-02 Version is now 1.3.1 (antirez) -2010-01-02 New vararg BLPOP able to block against multiple keys (antirez) -2009-12-29 fixed a problem with BLPOP timeout of zero, now it blocks forever (antirez) -2009-12-29 BLPOP timeouts implemented (antirez) -2009-12-29 first working implementation of BLPOP and BRPOP, still everything is to test well (antirez) -2009-12-29 a few more fixes, still broken (antirez) -2009-12-29 First fix, still broken (antirez) -2009-12-29 minor fix for Linux 64 bit (antirez) -2009-12-29 not yet working BLPOP implementation (antirez) -2009-12-27 AOFSYNC removed, got a better idea... (antirez) -2009-12-27 AOFSYNC command implemented (antirez) -2009-12-27 Version changed to 1.3.0, welcome to the new unstable (antirez) -2009-12-27 Now MULTI returns +OK as well (antirez) -2009-12-27 MULTI/EXEC first implementation (antirez) -2009-12-24 Fixed a minor bug in GETSET, now the SET part is not performed if the GET fails because the key does not contain a string value (antirez) -2009-12-23 html doc readded (antirez) -2009-12-23 ZRANGE WITHSCORES test added (antirez) -2009-12-23 version is now 1.1.94 (antirez) -2009-12-23 Add the command name in the unknown command error message. 
(antirez) -2009-12-22 ZRANGE, ZREVRANGE now support WITHSCORES options (antirez) -2009-12-22 html docs update (ZINCRBY added) (antirez) -2009-12-18 TODO list update (antirez) -2009-12-18 the pipelining test was ran against DB 1 for error, now it runs on DB 9 like all the other tests (antirez) -2009-12-18 still more tests (antirez) -2009-12-18 SORT STORE test added (antirez) -2009-12-18 Now SORT returns an empty bulk reply if the key does not exist (antirez) -2009-12-18 modified a bit the ZREVRANGE test to cover a few lines of code more (antirez) -2009-12-18 SHUTDOWN now does the right thing when append only is on, that is, fsync instead to save the snapshot. (antirez) -2009-12-18 Added a missing server.dirty increment in a non critical place, added more tests (antirez) -2009-12-18 LTRIM stress testing test added (antirez) -2009-12-18 LTRIM now returns +OK against non existing keys. More tests in test-redis.tcl (antirez) -2009-12-18 added sdstoupper() declaration in sds.h (antirez) -2009-12-18 Fixed sds.c bug #124 (antirez) -2009-12-16 LZF compression re-enabled by default, but with INIT_HTAB set to 0 to avoid the very costly memset initialization. Note that with this option set valgrind will output some false positive about lzf_c.c (antirez) -2009-12-16 lzf compression switched off by default now, with config file option to enable it in redis.conf (antirez) -2009-12-16 Regression for epoll bug in redis-test.tcl, version is now 1.1.93 (antirez) -2009-12-16 Fixed a lame epoll issue (antirez) -2009-12-15 html doc updated (antirez) -2009-12-15 version is now 1.1.92 (antirez) -2009-12-15 Two important fixes to append only file: zero length values and expires. A pretty neat new test to check consistency of randomly build datasets against snapshotting and AOF. (antirez) -2009-12-15 debug loadaof implemented in order to add more consistency tests in test-redis.tcl (antirez) -2009-12-15 Added a new test able to stress a lot the snapshotting engine (antirez) -2009-12-15 Unified handling of empty queries with normal queries. (antirez) -2009-12-15 Fixed some subtle bug in the command processing code almost impossible to spot in the real world, thanks to gcov (antirez) -2009-12-15 Regression test for SINTERSTORE added (antirez) -2009-12-15 Fixed issue #121 (antirez) -2009-12-14 a few more tests and ability to run a specific test in test-redis.tcl (antirez) -2009-12-13 Changed the reply of BGSAVE and BGREWRITEAOF from +OK to a more meaningful message that makes the user aware of an operation that just started and is not yet finished. (antirez) -2009-12-13 Set the master->slave logical client as authenticated on creation, so that if the slave requires a password replication works anyway (antirez) -2009-12-13 TODO update (antirez) -2009-12-12 bgrewriteaof_in_progress added to INFO (antirez) -2009-12-12 TODO list modified. What's planned for 1.4 is now written in the stone ;) (antirez) -2009-12-12 better handling of non blocking connect on redis-benchmark: EPIPE on read does not print an error message now (antirez) -2009-12-11 some change to redis-sha1.rb utility to make it more robust against non-meaningful changes in the dataset (antirez) -2009-12-10 redis-sha1.rb utility updated (antirez) -2009-12-10 a bit more verbose -ERR wrong number o arguments error, now gives info about the command name causing the error (antirez) -2009-12-10 TODO change and minor SETNX optimization (antirez) -2009-12-06 in rdbLoadDoubleValue now the buffer is nul terminated correctly. Thanks valgrind. 
(antirez) -2009-12-06 printf format warnings fixed by casting (antirez) -2009-12-06 Regression tests for SETNX and MSETNX bugs added (antirez) -2009-12-06 SETNX and MSETNX now respect the delete-on-write operation of EXPIREing keys (antirez) -2009-12-06 Fixed daemonization when using kqueue/kevent. Now the server initialization is performed *after* the daemonization (antirez) -2009-12-05 more HTML doc changes (antirez) -2009-12-05 HTML doc update (antirez) -2009-12-05 a few redis-cli format specified fixed (antirez) -2009-12-05 use __attribute__ format in sdscatprintf() when the compiler is GCC. Fixed format bugs resulting from the new warnings. (antirez) -2009-12-01 TODO update (antirez) -2009-12-01 compilation problem on 64bit mac os x 10.5 possibly fixed (antirez) -2009-12-01 virtual memory design doc typos (antirez) -2009-12-01 design documents added to the project (antirez) -2009-11-30 Fixed issued #85 (getDecodedObject: Assertion 1 != 1 failed. While sorting a set), added a smarter assert() function to dump the stacktrace, provided a macro to initalize Redis objects on the stack to avoid this kind of bugs. (antirez) -2009-11-30 fixed a subtle bug in redis-cli not having visible effects (antirez) -2009-11-29 TODO updated (antirez) -2009-11-29 Version chagned to 1.100, also known as the first first 2.0 beta version (antirez) -2009-11-29 more tests in test-redis.tcl, some minor fix (antirez) -2009-11-29 SORT support for sorted sets (antirez) -2009-11-28 Implemented LIMIT option in ZRANGEBYSCORE. We now enter feature-freeze (antirez) -2009-11-28 Changelog updated (antirez) -2009-11-28 html doc updated (antirez) -2009-11-28 enable kqueue/kevent only for Mac OS X 10.6.x as it seems that 10.5.x has a broken implementation of this syscalls. (antirez) -2009-11-28 TODO updated (antirez) -2009-11-28 ZRANGEBYSCORE fuzzy test (antirez) -2009-11-28 ZRANGEBYSCORE memory leak fixed, ZRANGEBYSCORE initial test added (antirez) -2009-11-28 INFO refactored. Stack trace on memory corruption now dumps the same information as the INFO command (antirez) -2009-11-28 ifdefs added to use kevent on Free Open and Net BSD as well. INFO and ae.c modified in order to report the multiplexing API in use (antirez) -2009-11-28 Enabled object encoding for multiple keys in MSET. Added a test for memory leaks in test-redis.tcl when running on Mac OS X (antirez) -2009-11-28 Merge branch 'kqueue' of git://github.com/mallipeddi/redis (antirez) -2009-11-28 Changes to TODO list, commented a function in redis.c (antirez) -2009-11-28 Added support for kqueue. (Harish Mallipeddi) -2009-11-27 TODO updated (antirez) -2009-11-26 zero length bulk data reading fixed in loadAppendOnlyFile() (antirez) -2009-11-26 append only file fixes (antirez) -2009-11-26 log rebuilding, random refactoring, work in progress please wait for an OK commit before to use this version (antirez) -2009-11-24 DEBUG RELOAD implemented, and test-redis.tcl modified to use it to check for persistence consistency. (antirez) -2009-11-24 Redis version set to 1.07 (antirez) -2009-11-24 sorted sets saving fixed (antirez) -2009-11-24 minor TODO change (antirez) -2009-11-24 minor fix to avoid a false valgrind warning. (antirez) -2009-11-23 epoll support enabled by default for Linux builds (antirez) -2009-11-23 epoll module for ae.c implemented. 
Some more testing needed (antirez) -2009-11-23 commented the HAVE_EPOLL test in config.h to allow compilation under Linux now that the epoll module is still missing (antirez) -2009-11-23 ae_select module added (antirez) -2009-11-23 ae.c now supports multiple polling API modules, even if only ae_select.c is implemented currently. Also adding and removing an event is now O(1). (antirez) -2009-11-23 ae.c initial refactoring for epoll implementation (antirez) -2009-11-21 version incremented up to 1.06 (antirez) -2009-11-21 TODO aesthetic changes (antirez) -2009-11-21 TODO updated with plans up to 1.5 (antirez) -2009-11-21 SRANDMEMBER test (antirez) -2009-11-21 Fixed a SORT memory leak that should never happen in practice (antirez) -2009-11-21 SORT GET # implemented, with a test (antirez) -2009-11-21 EXPIREAT test (antirez) -2009-11-20 EXPIRE tests (antirez) -2009-11-20 more RPOPLPUSH tests (antirez) -2009-11-20 RPOPLPUSH tests added (antirez) -2009-11-20 ZINCRBY return value fixed (antirez) -2009-11-20 ZINCRSCOREBY => ZINCRBY (antirez) -2009-11-19 ZINCRSCOREBY implemented (antirez) -2009-11-19 writev() finally uncommented again (antirez) -2009-11-19 redis-benchmark hopefully last bug with multi bulk reply fixed (antirez) -2009-11-19 debug mode in redis-bench (antirez) -2009-11-19 Use writev(2) if glue output buffers is disabled (antirez) -2009-11-19 benchmark.c fixes (antirez) -2009-11-18 more experiments with long replies, glue output buffer, and writev. (antirez) -2009-11-18 benchmarking with different number of LRANGE elements. Ability to change the glue output buffer limit by #define (antirez) -2009-11-18 more writev tests/work (antirez) -2009-11-18 redis-benchmark multi bulk reply support hopefully fixed (antirez) -2009-11-17 support for writev implemented but currently ifdef-ed in order to understan why I can't see the improvements expected. Btw code provided by Stefano Barbato (antirez) -2009-11-17 multi-bulk reply support for redis-bench, and as a result LRANGE is not tested, providing some number for the tuning of multi-bulk requests performances server-side (antirez) -2009-11-12 Solaris fix thanks to Alan Harder (antirez) -2009-11-12 Merge git://github.com/ianxm/redis (antirez) -2009-11-12 ZSCORE fixed, now returns NULL on missing key or missing element (antirez) -2009-11-12 Redis test will not fail the SAVE test even if a background save is in progress (antirez) -2009-11-12 LPOPPUSH renamed into RPOPLPUSH (antirez) -2009-11-11 can select db num (ian) -2009-11-11 Workaround for test-redis.tcl and Tcl 8.4.x about ZSCORE test (antirez) -2009-11-11 Removed a long time warning compiling with recent GCC on Linux (antirez) -2009-11-11 TODO updated (antirez) -2009-11-11 LPUSHPOP first implementation (antirez) -2009-11-10 Tcl script, make target, and redis.c changes to build the static symbol table automagically (antirez) -2009-11-10 Implemented a much better lazy expiring algorithm for EXPIRE (antirez) -2009-11-10 Fixed issue 92 in redis: redis-cli (nil) return value lacks CR/LF (antirez) -2009-11-10 Minor TODO change with new expiring algorithm description. New expiring algorithm moved since it'll go in 1.1 (antirez) -2009-11-04 redis-test is now a better Redis citizen, testing everything against DB 9 and 10 and only if this DBs are empty. (antirez) -2009-11-04 fixed a refcounting bug with SORT ... 
STORE leading to random crashes (root) -2009-11-04 masterauth option merged, thanks to Anthony Lauzon (antirez) -2009-11-03 ZSets double to string serialization fixed (antirez) -2009-11-03 client-libraries directory readded (antirez) -2009-11-03 redis.tcl put at toplevel since it's uesd for the test-redis.tcl script (antirez) -2009-11-03 client libs removed from Redis git (antirez) -2009-11-03 redis-cli now accepts a -r (repeat) switch. Still there is a memory leaks to fix (antirez) -2009-11-01 TODO updated again (antirez) -2009-11-01 TODO updated (antirez) -2009-11-01 redis-cli now makes clear when the returned string is an integer (antirez) -2009-11-01 SORT STORE option (antirez) -2009-11-01 now Redis prints DB stats just after the startup without to wait a second for the first report (antirez) -2009-11-01 another fix for append only mode, now read-only operations are not appended (antirez) -2009-11-01 appendfsync parsing in config file fixed. If you benchmarked Redis against different appendfsync options is time to try again ;) (antirez) -2009-11-01 append only file loading fixed (antirez) -2009-11-01 first version of append only file loading -- STILL BROKEN don't use it (antirez) -2009-10-31 Fixed Issue 83:Using TYPE on a zset results in a malformed response from the Redis server (antirez) -2009-10-31 Fixed compilation on Linux (antirez) -2009-10-30 append only mode is now able to translate EXPIRE into EXPIREAT transparently (antirez) -2009-10-30 appendfsync is now set to NO by default (antirez) -2009-10-30 support for appendonly mode no, always, everysec (antirez) -2009-10-30 first fix for append only mode (antirez) -2009-10-30 Initial implementation of append-only mode. Loading still not implemented. (antirez) -2009-10-30 EXPIRE behaviour changed a bit, a negative TTL or an EXPIREAT with unix time in the past will now delete the key. It seems saner to me than doing nothing. (antirez) -2009-10-30 EXPIREAT implemented, will be useful for the append-only mode (antirez) -2009-10-29 Fixed Issue 74 (ERR just returned on invalid password), now the error message is -ERR invalid password. (antirez) -2009-10-29 Fixed issue 72 (SLAVEOF shutdowns redis-server on malformed reply) (antirez) -2009-10-29 Fixed issue 77 (Incorrect time in log files) thanks to youwantalex (antirez) -2009-10-29 Fixed Issue 76 (redis-server crashes when it can't connect to MASTER and client connects to SLAVE) (antirez) -2009-10-29 ZREMRANGEBYSCORE implemented. Remove a range of elements with score between min and max (antirez) -2009-10-28 TODO changes and mostly theoretical minor skiplist change (antirez) -2009-10-28 ZLEN renamed ZCARD for consistency with SCARD (antirez) -2009-10-27 TODO reworked to reflect the real roadmap (antirez) -2009-10-27 Fix for 'make 32bit' (antirez) -2009-10-27 a fix for the solaris fix itself ;) (antirez) -2009-10-27 More Solaris fixes (antirez) -2009-10-27 A lot of ZSETs tests implemented, and a bug fixed thanks to this new tests (antirez) -2009-10-27 zmalloc Solaris fixes thanks to Alan Harder (antirez) -2009-10-27 ZSCORE implemented (antirez) -2009-10-26 fix for ZRANGEBYSCORE (antirez) -2009-10-26 ZRANGEBYSCORE implemented. Redis got range queries! (antirez) -2009-10-26 A trivial change makes the new implementation O(log(N)) instead of O(log(N))+O(M) when there are M repeated scores! (antirez) -2009-10-26 ZSET now saved on disk like any other type (antirez) -2009-10-26 double serialization routines implemented (antirez) -2009-10-26 ZSETs random fixes. 
Now the implementation appears to be pretty stable (antirez) -2009-10-26 another leak fixed. Can't find more for now, but still a bug in ZSETs to fix (antirez) -2009-10-26 ZSETs memory leak #1 solved, another one missing (antirez) -2009-10-26 Fix for skiplists backward link (antirez) -2009-10-26 Merged Solaris patches provided by Alan Harder (antirez) -2009-10-26 backward support to skiplists for ZREVRANGE, still broken, committing since I've to merge the Solaris patches (antirez) -2009-10-26 TODO updated (antirez) -2009-10-26 ZREM implemented (antirez) -2009-10-24 fix for ZADD in score update mode (antirez) -2009-10-24 some work on ZADD against existing element (score update), still broken... (antirez) -2009-10-23 zrange now starts to work. zadd still does not support update and will crash or leak or b000mmmmm (antirez) -2009-10-23 zrange initial hack (not working for now) (antirez) -2009-10-23 first skiplist fix, courtesy of valgrind (antirez) -2009-10-23 zset symbols added to stack trace code. ZSets will simply crash at the moment (antirez) -2009-10-23 more work on ZSETs and a new make target called 32bit to build i386 binaries on mac os x leopard (antirez) -2009-10-23 initial skiplist implementation. Most memory checks removed and zmalloc() modified to fail with an error message and abort. Anyway Redis is not designed to recover from out of memory conditions. (antirez) -2009-10-23 Fixed compilation in mac os x snow leopard when compiling a 32 bit binary. (antirez) -2009-10-22 version incremented to 1.050 to distinguish from 1.001 stable and next stable versions with minor fixes (antirez) -2009-10-21 TODO updated (antirez) -2009-10-21 SRANDMEMBER added (antirez) -2009-10-20 Imporant bug leading to data corruption fixed (NOT affecting stable distribution), Tcl client lib MSET/MSETNX implementation fixed, Added new tests for MSET and MSETNX in test-redis.tcl (antirez) -2009-10-17 added multi-bulk protocol support to redis-cli and support for MSET and MSETNX (antirez) -2009-10-17 MSET fixed, was not able to replace keys already set for a stupid bug (antirez) -2009-10-16 some dead code removed (antirez) -2009-10-16 multi bulk input protocol fixed (antirez) -2009-10-16 MSET and MSETNX commands implemented (antirez) -2009-10-07 undoed all the sds hacking that lead just to random bugs and no memory saving ;) (antirez) -2009-10-07 initial multi-bulk query protocol, this will allow MSET and other interesting features. (antirez) -2009-10-03 benchmark now outputs the right command line to shorten the TIME_WAIT interval on Mac OS X when keep alive is set (antirez) -2009-10-02 Issue 69 fixed. Object integer encoding now works with replication and MONITORing again. (antirez) -2009-09-18 LREM fixed, used to crash since the new object integer encoding is on the stage (antirez) -2009-09-17 maxmemory didn't worked in 64 systems for values > 4GB since it used to be an unsigned int. Fixed (antirez) -2009-09-10 incremented version number to 1.001, AKA Redis edge is no longer stable... 
(antirez) -2009-09-10 in-memory specialized object encoding (for now 32 signed integers only) (antirez) -2009-09-03 Latest doc changes for 1.0 (antirez) -2009-09-03 Redis 1.0.0 release (antirez) -2009-09-02 Redis version pushed to 1.0 (antirez) -2009-09-02 Ruby client lib updated to the latest git version (antirez) -2009-09-02 update-scala-client script added (antirez) -2009-09-02 Scala client added thanks to Alejanro Crosa (antirez) -2009-09-02 QuickStart added (antirez) -2009-09-01 Fixed crash with only space and newline as command (issue 61), thanks to a guy having as nick "fixxxerrr" (antirez) -2009-08-11 TODO list modified (antirez) -2009-07-24 more snow leopard related fixes (for 32bit systems) (antirez) -2009-07-24 fixed compilation with Snow Leopard, thanks to Lon Baker for providing SSH access to Snow Leopard box (antirez) -2009-07-22 Fixed NetBSD compile problems (antirez) -2009-07-17 now the size of the shared pool can be really modified via config, also the number of objects in the sharing pool is logged when the log level is set to debug. Thanks to Aman Gupta (antirez) -2009-07-05 added utils/redis-copy.rb, a script that is able to copy data from one Redis server to another one on the fly. (antirez) -2009-07-04 Applied three different patches thanks to Chris Lamb, one to fix compilation and get the IP register value on Linux IA64 and other systems. One in order to log the overcommit problem on the logs instead of the standard output when Redis is demonized. The latest in order to suggest a more consistent way in order to switch to 1 the memory overcommit Linux feature. (antirez) -2009-07-03 bugfix: EXPIRE now propagates to the Slave. (antirez) -2009-06-16 Redis version modified to 0.900 (antirez) -2009-06-16 update-ruby-client script already points to ezmobius repo (antirez) -2009-06-16 client libraries updated (antirez) -2009-06-16 Redis release candidate 1 (antirez) -2009-06-16 Better handling of background saving process killed or crashed (antirez) -2009-06-14 number of keys info in INFO command thanks to Diego Rosario Brogna (antirez) -2009-06-14 SPOP documented (antirez) -2009-06-14 Clojure library thanks to Ragnar Dahlén (antirez) -2009-06-10 It is now possible to specify - as config file name to read it from stdin (antirez) -2009-06-10 sync with jodosha redis-rb (antirez) -2009-06-10 Redis-rb sync (antirez) -2009-06-10 max inline request raised again to 1024*1024*256 bytes (antirez) -2009-06-10 max bytes in an inline command raised to 1024*1024 bytes, in order to allow for very large MGETs and still protect from client crashes (antirez) -2009-06-08 SPOP implemented. Hash table resizing for Sets and Expires too. Changed the resize policy to play better with RANDOMKEY and SPOP. 
(antirez) -2009-06-07 some minor changes to the backtrace code (antirez) -2009-06-07 enable backtrace capabilities only for Linux and MacOSX (antirez) -2009-06-07 Dump a backtrace on sigsegv/sigbus, original coded thanks to Diego Rosario Brogna, modified in order to work on different OSes and to enhance reliability (antirez) -2009-06-06 Merge git://github.com/dierbro/redis (antirez) -2009-06-06 add more output (hrothgar) -2009-06-06 store static function pointer for a useful stack trace (hrothgar) -2009-06-06 TODO updated (antirez) -2009-06-06 Makefile dependencies updated (antirez) -2009-06-05 Avoid a busy loop while sending very large replies against very fast links, this allows to be more responsive with other clients even under a KEY * against the loopback interface (antirez) -2009-06-05 Kill the background saving process before performing SHUTDOWN to avoid races (antirez) -2009-06-05 LREM now returns :0 for non existing keys (antirez) -2009-06-05 - put some order in code - better output (hrothgar) -2009-06-05 added config.h for #ifdef business isolation, added fstat64 for Mac OS X (antirez) -2009-06-04 remove die() :-) (hrothgar) -2009-06-04 add compile options to debug (hrothgar) -2009-06-04 initial commit print stack trace (hrothgar) -2009-06-04 initial commit print stack trace (hrothgar) -2009-06-04 macosx specific zmalloc.c, uses malloc_size function in order to avoid to waste memory and time to put an additional header (antirez) -2009-06-04 DEBUG OBJECT implemented (antirez) -2009-06-04 backtrace support removed: unreliable stack trace :( (antirez) -2009-06-04 initial backtrace dumping on sigsegv/sigbus + debug command (antirez) -2009-06-03 Python lib updated (antirez) -2009-06-03 shareobjectspoolsize implemented in reds.conf, in order to control the pool size when object sharing is on (antirez) -2009-05-30 Erlang client updated (antirez) -2009-05-30 Python client library updated (antirez) -2009-05-29 Redis-rb minor bool convertion fix (antirez) -2009-05-29 ruby library client is not Redis-rb merged with RubyRedis "engine" by Brian McKinney (antirez) -2009-05-28 __P completely removed from pqsort.c/h (antirez) -2009-05-28 another minor fix for Solaris boxes (antirez) -2009-05-28 minor fix for Solaris boxes (antirez) -2009-05-28 minor fix for Solaris boxes (antirez) -2009-05-27 maxmemory implemented (antirez) -2009-05-26 Redis git version modified to 0.101 in order to distinguish that from the latest tar.gz via INFO ;) (antirez) -2009-05-26 Redis 0.100 released (antirez) -2009-05-26 client libraries synched in git (antirez) -2009-05-26 ignore gcc warning about write() return code not checked. It is esplicitily this way since the "max number of clients reached" is a best-effort error (antirez) -2009-05-26 max bytes of a received command enlarged from 1k to 16k (antirez) -2009-05-26 RubyRedis: set TCP_NODELAY TCP socket option to to disable the neagle algorithm. 
Makes a huge difference under some OS, notably Linux (antirez) -2009-05-25 maxclients implemented, see redis.conf for details (antirez) -2009-05-25 INFO command now reports replication info (antirez) -2009-05-25 minor fix to RubyRedis about bulk commands sent without arguments (antirez) -2009-05-24 Warns if using the default config (antirez) -2009-05-24 Issue with redis-client used in scripts solved, now to check if the latest argument must come from standard input we do not check that stdin is or not a tty but the command arity (antirez) -2009-05-23 RubyRedis: now sets are returned as arrays again, and not as Set objects (antirez) -2009-05-23 SLAVEOF command documented (antirez) -2009-05-23 SLAVEOF command implemented for replication remote control (antirez) -2009-05-22 Fix: no connection timeout for the master! (antirez) -2009-05-22 replication slave timeout when receiving the initial bulk data set to 3600 seconds, now that replication is non-blocking the server must save the db before to start the async replication and this can take a lot of time with huge datasets (antirez) -2009-05-22 README tutorial now reflects the new proto (antirez) -2009-05-22 critical bug about glueoutputbuffers=yes fixed. Under load and with pipelining and clients disconnecting on the middle of the chat with the server, Redis could block. Now it's ok (antirez) -2009-05-22 TTL command doc added (antirez) -2009-05-22 TTL command implemented (antirez) -2009-05-22 S*STORE now return the cardinality of the resulting set (antirez) -2009-05-22 rubyredis more compatible with Redis-rb (antirez) -2009-05-21 minor indentation fix (antirez) -2009-05-21 timeout support and Redis-rb compatibility aliases implemented in RubyRedis (antirez) -2009-05-21 RubyRedis info postprocessor rewritten in a more functional way (antirez) -2009-05-21 dead code removed from RubyRedis (antirez) -2009-05-21 command postprocessing implemented into RubyRedis (antirez) -2009-05-20 Automagically reconnection of RubyRedis (antirez) -2009-05-20 RubyRedis: Array alike operators implemented (antirez) -2009-05-20 random testing code removed (antirez) -2009-05-20 RubyRedis DB selection forced at object creation (antirez) -2009-05-20 Initial version of an alternative Ruby client added (antirez) -2009-05-20 SDIFF / SDIFFSTORE added to doc (antirez) -2009-05-20 Aman Gupta changes merged (antirez) -2009-05-20 Merge git://github.com/tmm1/redis (antirez) -2009-05-19 Allow timeout=0 config to disable client timeouts (Aman Gupta) -2009-05-19 Partial qsort implemented in SORT command, only when both BY and LIMIT is used. minor fix for a warning compiling under Linux. (antirez) -2009-05-19 psort.c/h added. This is a partial qsort implementation that Redis will use when SORT+LIMIT is requested (antirez) -2009-05-17 Fix SINTER/UNIONSTORE to allow for &=/|= style operations (i.e. 
SINTERSTORE set1 set1 set2) (Aman Gupta) -2009-05-17 Optimize SDIFF to return as soon as the result set is empty (Aman Gupta) -2009-05-17 SDIFF/SDIFFSTORE implemnted unifying it with the implementation of SUNION/SUNIONSTORE (antirez) -2009-05-11 timestamp in log lines (antirez) -2009-05-11 Python client updated pushing from Ludo's repository (antirez) -2009-05-11 disconnect when we cannot read from the socket (Ludovico Magnocavallo) -2009-05-11 benchmark utility now supports random keys (antirez) -2009-05-10 minor doc changes (antirez) -2009-05-09 added tests for vararg DEL (antirez) -2009-05-09 DEL is now a vararg, IMPORTANT: memory leak fixed in loading DB code (antirez) -2009-05-09 doc changes (antirez) -2009-05-09 CPP client added thanks to Brian Hammond (antirez) -2009-05-06 Infinite number of arguments for MGET and all the other commands (antirez) -2009-05-04 Warns if /proc/sys/vm/overcommit_memory is set to 0 on Linux. Also make sure to don't resize the hash tables while the child process is saving in order to avoid copy-on-write of memory pages (antirez) -2009-04-30 zmalloc fix, return NULL or real malloc failure (antirez) -2009-04-30 more fixes for dict.c and the 150 million keys limit (antirez) -2009-04-30 dict.c modified to be able to handle more than 150,000,000 keys (antirez) -2009-04-29 fuzz stresser implemented in redis-test (antirez) -2009-04-29 fixed for HT resize check 32bits overflow (antirez) -2009-04-29 Check for fork() failure in background saving (antirez) -2009-04-29 fix for the LZF off-by-one bug added (antirez) -2009-04-28 print bytes used at exit on SHUTDOWN (antirez) -2009-04-28 SMOVE test added (antirez) -2009-04-28 SMOVE command implemented (antirez) -2009-04-28 less CPU usage in command parsing, case insensitive config directives (antirez) -2009-04-28 GETSET command doc added (antirez) -2009-04-28 GETSET tests (antirez) -2009-04-28 GETSET implemented (antirez) -2009-04-27 ability to specify a different file name for the DB (antirez) -2009-04-27 log file parsing code improved a bit (antirez) -2009-04-27 bgsave_in_progress field in INFO output (antirez) -2009-04-27 INCRBY/DECRBY now support 64bit increments, with tests (antirez) -2009-04-23 RANDOMKEY regression test added (antirez) -2009-04-23 dictGetRandomKey bug fixed, RANDOMKEY will not block the server anymore (antirez) -2009-04-22 FLUSHALL/FLUSHDB no longer sync on disk. Just increment the dirty counter by the number of elements removed, that will probably trigger a background saving operation (antirez) -2009-04-21 forgot to comment testing code in PHP lib. Now it is ok (antirez) -2009-04-21 PHP client ported to PHP5 and fixed (antirez) -2009-04-21 doc update (antirez) -2009-04-20 Non blocking replication (finally!). C-side linked lists API improved. 
(antirez) -2009-04-19 SUNION, SUNIONSTORE, Initial work on non blocking replication (antirez) -2009-04-10 Redis 0.091 released (antirez) -2009-04-10 SINTER/SINTERSTORE/SLEMENTS fix: misisng keys are now not errors, but just like empty sets (antirez) -2009-04-09 doc changes (antirez) -2009-04-08 TODO changes, minor change to default redis.conf (antirez) -2009-04-08 html doc updated (antirez) -2009-04-08 library clients update scripts (antirez) -2009-04-08 Ruby client updated (antirez) -2009-04-08 Lua client updated (antirez) -2009-04-08 Changelog updated (antirez) -2009-04-08 Merge git://github.com/ludoo/redis (antirez) -2009-04-08 add expire command to the php lib (Ludovico Magnocavallo) -2009-04-08 fix decode bug, add flush and info commands (Ludovico Magnocavallo) -2009-04-07 Rearrange redisObject struct to reduce memory usage in 64bit environments (as recommended http://groups.google.com/group/redis-db/msg/68f5a743f8f4e287) (Bob Potter) -2009-04-07 ruby19 compat: use each_line on string (Bob Potter) -2009-04-07 64bit fixes for usedmemory (Bob Potter) -2009-04-08 RANDOMKEY issue 26 fixed, generic test + regression added (antirez) -2009-04-06 Don't accept SAVE if BGSAVE is in progress (antirez) -2009-04-06 add expire command to the python lib (Ludovico Magnocavallo) -2009-04-03 persistent EXPIRE (antirez) -2009-04-03 dirty increment was missing in two points. TODO updated (antirez) -2009-04-02 LZF configured to initalize the HT in order to be determinsitic and play well with valgrind (antirez) -2009-04-02 fix select test (Ludovico Magnocavallo) -2009-04-02 fix trailing cr+nl in values (Ludovico Magnocavallo) -2009-04-02 compression/decompression of large values on disk now working (antirez) -2009-04-02 disable LZF compression since it's not able to load the DB for now, the load part is missing (antirez) -2009-04-02 new LZF files added (antirez) -2009-04-02 Fixed issue 23 about AUTH (antirez) -2009-04-02 Issue 22 fixed (antirez) -2009-04-01 non-lazy expired keys purging implemented (antirez) -2009-04-01 fastlz dependence removed (antirez) -2009-04-01 Initial implementation of EXPIRE (antirez) -2009-03-30 TODO updated (antirez) -2009-03-30 changelog added (antirez) -2009-03-28 redis-sha1 utility added (antirez) -2009-03-28 Integer encoding implemented in dump file. Doc updated (antirez) -2009-03-27 feature macros defined to play well with C99 (antirez) -2009-03-27 feature macros defined to play well with C99 (antirez) -2009-03-27 now Redis is C99-ok (antirez) -2009-03-27 IMPORTANT FIX: new dump format implementation was broken. Now it's ok but tests for the 32-bit case values are needed (antirez) -2009-03-27 ANSI-C compatibility changes (antirez) -2009-03-27 Ruby client library updated. Important changes in this new version! (antirez) -2009-03-26 Lua client added thanks to Daniele Alessandri (antirez) -2009-03-26 Lua client added thanks to Daniele Alessandri (antirez) -2009-03-26 AUTH merged from Brian Hammond fork, reworked a bit to fix minor problems (antirez) -2009-03-25 Adds AUTH command. 
(Brian Hammond) -2009-03-25 Nasty bug of the new DB format fixed, objects sharing implemented (antirez) -2009-03-25 doc update (antirez) -2009-03-25 Erlang client synched with Valentiono's repo (antirez) -2009-03-25 New file dump format, perl client library added (antirez) -2009-03-25 New protocol fix for LREM (antirez) -2009-03-24 two typos fixed (antirez) -2009-03-24 Now the Redis test uses the proper Tcl client library (antirez) -2009-03-24 Tcl client library (antirez) -2009-03-24 redis-benchmark sync with the new protocol (antirez) -2009-03-24 git mess :) (Ludovico Magnocavallo) -2009-03-24 sync python client to the new protocol (Ludovico Magnocavallo) -2009-03-24 protocol fix in SORT reply with null elements (antirez) -2009-03-24 protocol doc changed (antirez) -2009-03-24 Server replies now in the new format, test-redis.tcl and redis-cli modified accordingly (antirez) -2009-03-24 Python client library updated, thanks to Ludo! (antirez) -2009-03-24 random tested mode for test-redis.tcl, minor other stuff, version switched to 0.8 (antirez) -2009-03-23 Now MONITOR/SYNC cannot be issued multiple times (antirez) -2009-03-23 MONITOR command implemented. (antirez) -2009-03-23 lucsky changes imported. pid file path can now be configured, redis-cli fixes (antirez) -2009-03-23 Merge git://github.com/lucsky/redis (antirez) -2009-03-23 another missing free->zfree replacement fixed. Thanks to Ludo (antirez) -2009-03-23 Fixed redis-cli readLine loop to correctly handle EOF. (Luc Heinrich) -2009-03-23 Display the port on server startup. (Luc Heinrich) -2009-03-23 Allow to specify the pid file from the config file. (Luc Heinrich) -2009-03-23 Added gitignore file. (Luc Heinrich) -2009-03-22 MGET tests added (antirez) -2009-03-22 doc changes (antirez) -2009-03-22 added doc for MGET (antirez) -2009-03-22 redis-cli now checks the arity of vararg commnads (antirez) -2009-03-22 INFO fixed, MGET implemented, redis-cli implements INFO/MGET (antirez) -2009-03-22 first commit (antirez) \ No newline at end of file diff --git a/README b/README index 329eb1cb3..b7a12b828 100644 --- a/README +++ b/README @@ -130,7 +130,7 @@ it the proper way for a production system, we have a script doing this for Ubuntu and Debian systems: % cd utils - % ./install_server + % ./install_server.sh The script will ask you a few questions and will setup everything you need to run Redis properly as a background daemon that will start again on diff --git a/deps/linenoise/linenoise.c b/deps/linenoise/linenoise.c index 4632f7de8..aef5cdd24 100644 --- a/deps/linenoise/linenoise.c +++ b/deps/linenoise/linenoise.c @@ -10,8 +10,8 @@ * * ------------------------------------------------------------------------ * - * Copyright (c) 2010, Salvatore Sanfilippo - * Copyright (c) 2010, Pieter Noordhuis + * Copyright (c) 2010-2013, Salvatore Sanfilippo + * Copyright (c) 2010-2013, Pieter Noordhuis * * All rights reserved. * @@ -45,12 +45,10 @@ * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html * * Todo list: - * - Switch to gets() if $TERM is something we can't support. * - Filter bogus Ctrl+ combinations. * - Win32 support * * Bloat: - * - Completion? * - History search like Ctrl+r in readline? * * List of escape sequences used by this program, we do everything just @@ -72,6 +70,17 @@ * Sequence: ESC [ n C * Effect: moves cursor forward of n chars * + * When multi line mode is enabled, we also use an additional escape + * sequence. However multi line editing is disabled by default. 
+ * + * CUU (Cursor Up) + * Sequence: ESC [ n A + * Effect: moves cursor up of n chars. + * + * CUD (Cursor Down) + * Sequence: ESC [ n B + * Effect: moves cursor down of n chars. + * * The following are used to clear the screen: ESC [ H ESC [ 2 J * This is actually composed of two sequences: * @@ -92,6 +101,7 @@ #include #include #include +#include #include #include #include @@ -99,19 +109,89 @@ #define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100 #define LINENOISE_MAX_LINE 4096 -static char *unsupported_term[] = {"dumb","cons25",NULL}; +static char *unsupported_term[] = {"dumb","cons25","emacs",NULL}; static linenoiseCompletionCallback *completionCallback = NULL; -static struct termios orig_termios; /* in order to restore at exit */ -static int rawmode = 0; /* for atexit() function to check if restore is needed*/ -static int atexit_registered = 0; /* register atexit just 1 time */ +static struct termios orig_termios; /* In order to restore at exit.*/ +static int rawmode = 0; /* For atexit() function to check if restore is needed*/ +static int mlmode = 0; /* Multi line mode. Default is single line. */ +static int atexit_registered = 0; /* Register atexit just 1 time. */ static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN; static int history_len = 0; -char **history = NULL; +static char **history = NULL; + +/* The linenoiseState structure represents the state during line editing. + * We pass this state to functions implementing specific editing + * functionalities. */ +struct linenoiseState { + int ifd; /* Terminal stdin file descriptor. */ + int ofd; /* Terminal stdout file descriptor. */ + char *buf; /* Edited line buffer. */ + size_t buflen; /* Edited line buffer size. */ + const char *prompt; /* Prompt to display. */ + size_t plen; /* Prompt length. */ + size_t pos; /* Current cursor position. */ + size_t oldpos; /* Previous refresh cursor position. */ + size_t len; /* Current edited line length. */ + size_t cols; /* Number of columns in terminal. */ + size_t maxrows; /* Maximum num of rows used so far (multiline mode) */ + int history_index; /* The history index we are currently editing. */ +}; + +enum KEY_ACTION{ + KEY_NULL = 0, /* NULL */ + CTRL_A = 1, /* Ctrl+a */ + CTRL_B = 2, /* Ctrl-b */ + CTRL_C = 3, /* Ctrl-c */ + CTRL_D = 4, /* Ctrl-d */ + CTRL_E = 5, /* Ctrl-e */ + CTRL_F = 6, /* Ctrl-f */ + CTRL_H = 8, /* Ctrl-h */ + TAB = 9, /* Tab */ + CTRL_K = 11, /* Ctrl+k */ + CTRL_L = 12, /* Ctrl+l */ + ENTER = 13, /* Enter */ + CTRL_N = 14, /* Ctrl-n */ + CTRL_P = 16, /* Ctrl-p */ + CTRL_T = 20, /* Ctrl-t */ + CTRL_U = 21, /* Ctrl+u */ + CTRL_W = 23, /* Ctrl+w */ + ESC = 27, /* Escape */ + BACKSPACE = 127 /* Backspace */ +}; static void linenoiseAtExit(void); int linenoiseHistoryAdd(const char *line); +static void refreshLine(struct linenoiseState *l); + +/* Debugging macro. */ +#if 0 +FILE *lndebug_fp = NULL; +#define lndebug(...) \ + do { \ + if (lndebug_fp == NULL) { \ + lndebug_fp = fopen("/tmp/lndebug.txt","a"); \ + fprintf(lndebug_fp, \ + "[%d %d %d] p: %d, rows: %d, rpos: %d, max: %d, oldmax: %d\n", \ + (int)l->len,(int)l->pos,(int)l->oldpos,plen,rows,rpos, \ + (int)l->maxrows,old_rows); \ + } \ + fprintf(lndebug_fp, ", " __VA_ARGS__); \ + fflush(lndebug_fp); \ + } while (0) +#else +#define lndebug(fmt, ...) +#endif + +/* ======================= Low level terminal handling ====================== */ +/* Set if to use or not the multi line mode. 
*/ +void linenoiseSetMultiLine(int ml) { + mlmode = ml; +} + +/* Return true if the terminal name is in the list of terminals we know are + * not able to understand basic escape sequences. */ static int isUnsupportedTerm(void) { char *term = getenv("TERM"); int j; @@ -122,16 +202,7 @@ static int isUnsupportedTerm(void) { return 0; } -static void freeHistory(void) { - if (history) { - int j; - - for (j = 0; j < history_len; j++) - free(history[j]); - free(history); - } -} - +/* Raw mode: 1960 magic shit. */ static int enableRawMode(int fd) { struct termios raw; @@ -173,51 +244,83 @@ static void disableRawMode(int fd) { rawmode = 0; } -/* At exit we'll try to fix the terminal to the initial conditions. */ -static void linenoiseAtExit(void) { - disableRawMode(STDIN_FILENO); - freeHistory(); +/* Use the ESC [6n escape sequence to query the horizontal cursor position + * and return it. On error -1 is returned, on success the position of the + * cursor. */ +static int getCursorPosition(int ifd, int ofd) { + char buf[32]; + int cols, rows; + unsigned int i = 0; + + /* Report cursor location */ + if (write(ofd, "\x1b[6n", 4) != 4) return -1; + + /* Read the response: ESC [ rows ; cols R */ + while (i < sizeof(buf)-1) { + if (read(ifd,buf+i,1) != 1) break; + if (buf[i] == 'R') break; + i++; + } + buf[i] = '\0'; + + /* Parse it. */ + if (buf[0] != ESC || buf[1] != '[') return -1; + if (sscanf(buf+2,"%d;%d",&rows,&cols) != 2) return -1; + return cols; } -static int getColumns(void) { +/* Try to get the number of columns in the current terminal, or assume 80 + * if it fails. */ +static int getColumns(int ifd, int ofd) { struct winsize ws; - if (ioctl(1, TIOCGWINSZ, &ws) == -1) return 80; - return ws.ws_col; -} + if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) { + /* ioctl() failed. Try to query the terminal itself. */ + int start, cols; -static void refreshLine(int fd, const char *prompt, char *buf, size_t len, size_t pos, size_t cols) { - char seq[64]; - size_t plen = strlen(prompt); - - while((plen+pos) >= cols) { - buf++; - len--; - pos--; - } - while (plen+len > cols) { - len--; + /* Get the initial position so we can restore it later. */ + start = getCursorPosition(ifd,ofd); + if (start == -1) goto failed; + + /* Go to right margin and get position. */ + if (write(ofd,"\x1b[999C",6) != 6) goto failed; + cols = getCursorPosition(ifd,ofd); + if (cols == -1) goto failed; + + /* Restore position. */ + if (cols > start) { + char seq[32]; + snprintf(seq,32,"\x1b[%dD",cols-start); + if (write(ofd,seq,strlen(seq)) == -1) { + /* Can't recover... */ + } + } + return cols; + } else { + return ws.ws_col; } - /* Cursor to left edge */ - snprintf(seq,64,"\x1b[0G"); - if (write(fd,seq,strlen(seq)) == -1) return; - /* Write the prompt and the current buffer content */ - if (write(fd,prompt,strlen(prompt)) == -1) return; - if (write(fd,buf,len) == -1) return; - /* Erase to right */ - snprintf(seq,64,"\x1b[0K"); - if (write(fd,seq,strlen(seq)) == -1) return; - /* Move cursor to original position. */ - snprintf(seq,64,"\x1b[0G\x1b[%dC", (int)(pos+plen)); - if (write(fd,seq,strlen(seq)) == -1) return; +failed: + return 80; } -static void beep() { +/* Clear the screen. Used to handle ctrl+l */ +void linenoiseClearScreen(void) { + if (write(STDOUT_FILENO,"\x1b[H\x1b[2J",7) <= 0) { + /* nothing to do, just to avoid warning. */ + } +} + +/* Beep, used for completion when there is nothing to complete or when all + * the choices were already shown. 
*/ +static void linenoiseBeep(void) { fprintf(stderr, "\x7"); fflush(stderr); } +/* ============================== Completion ================================ */ + +/* Free a list of completion option populated by linenoiseAddCompletion(). */ static void freeCompletions(linenoiseCompletions *lc) { size_t i; for (i = 0; i < lc->len; i++) @@ -226,28 +329,39 @@ static void freeCompletions(linenoiseCompletions *lc) { free(lc->cvec); } -static int completeLine(int fd, const char *prompt, char *buf, size_t buflen, size_t *len, size_t *pos, size_t cols) { +/* This is an helper function for linenoiseEdit() and is called when the + * user types the key in order to complete the string currently in the + * input. + * + * The state of the editing is encapsulated into the pointed linenoiseState + * structure as described in the structure definition. */ +static int completeLine(struct linenoiseState *ls) { linenoiseCompletions lc = { 0, NULL }; int nread, nwritten; char c = 0; - completionCallback(buf,&lc); + completionCallback(ls->buf,&lc); if (lc.len == 0) { - beep(); + linenoiseBeep(); } else { size_t stop = 0, i = 0; - size_t clen; while(!stop) { /* Show completion or original buffer */ if (i < lc.len) { - clen = strlen(lc.cvec[i]); - refreshLine(fd,prompt,lc.cvec[i],clen,clen,cols); + struct linenoiseState saved = *ls; + + ls->len = ls->pos = strlen(lc.cvec[i]); + ls->buf = lc.cvec[i]; + refreshLine(ls); + ls->len = saved.len; + ls->pos = saved.pos; + ls->buf = saved.buf; } else { - refreshLine(fd,prompt,buf,*len,*pos,cols); + refreshLine(ls); } - nread = read(fd,&c,1); + nread = read(ls->ifd,&c,1); if (nread <= 0) { freeCompletions(&lc); return -1; @@ -256,20 +370,18 @@ static int completeLine(int fd, const char *prompt, char *buf, size_t buflen, si switch(c) { case 9: /* tab */ i = (i+1) % (lc.len+1); - if (i == lc.len) beep(); + if (i == lc.len) linenoiseBeep(); break; case 27: /* escape */ /* Re-show original buffer */ - if (i < lc.len) { - refreshLine(fd,prompt,buf,*len,*pos,cols); - } + if (i < lc.len) refreshLine(ls); stop = 1; break; default: /* Update buffer and return */ if (i < lc.len) { - nwritten = snprintf(buf,buflen,"%s",lc.cvec[i]); - *len = *pos = nwritten; + nwritten = snprintf(ls->buf,ls->buflen,"%s",lc.cvec[i]); + ls->len = ls->pos = nwritten; } stop = 1; break; @@ -281,214 +393,526 @@ static int completeLine(int fd, const char *prompt, char *buf, size_t buflen, si return c; /* Return last read character */ } -void linenoiseClearScreen(void) { - if (write(STDIN_FILENO,"\x1b[H\x1b[2J",7) <= 0) { - /* nothing to do, just to avoid warning. */ +/* Register a callback function to be called for tab-completion. */ +void linenoiseSetCompletionCallback(linenoiseCompletionCallback *fn) { + completionCallback = fn; +} + +/* This function is used by the callback function registered by the user + * in order to add completion options given the input string when the + * user typed . See the example.c source code for a very easy to + * understand example. 
*/ +void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) { + size_t len = strlen(str); + char *copy, **cvec; + + copy = malloc(len+1); + if (copy == NULL) return; + memcpy(copy,str,len+1); + cvec = realloc(lc->cvec,sizeof(char*)*(lc->len+1)); + if (cvec == NULL) { + free(copy); + return; + } + lc->cvec = cvec; + lc->cvec[lc->len++] = copy; +} + +/* =========================== Line editing ================================= */ + +/* We define a very simple "append buffer" structure, that is an heap + * allocated string where we can append to. This is useful in order to + * write all the escape sequences in a buffer and flush them to the standard + * output in a single call, to avoid flickering effects. */ +struct abuf { + char *b; + int len; +}; + +static void abInit(struct abuf *ab) { + ab->b = NULL; + ab->len = 0; +} + +static void abAppend(struct abuf *ab, const char *s, int len) { + char *new = realloc(ab->b,ab->len+len); + + if (new == NULL) return; + memcpy(new+ab->len,s,len); + ab->b = new; + ab->len += len; +} + +static void abFree(struct abuf *ab) { + free(ab->b); +} + +/* Single line low level line refresh. + * + * Rewrite the currently edited line accordingly to the buffer content, + * cursor position, and number of columns of the terminal. */ +static void refreshSingleLine(struct linenoiseState *l) { + char seq[64]; + size_t plen = strlen(l->prompt); + int fd = l->ofd; + char *buf = l->buf; + size_t len = l->len; + size_t pos = l->pos; + struct abuf ab; + + while((plen+pos) >= l->cols) { + buf++; + len--; + pos--; + } + while (plen+len > l->cols) { + len--; + } + + abInit(&ab); + /* Cursor to left edge */ + snprintf(seq,64,"\x1b[0G"); + abAppend(&ab,seq,strlen(seq)); + /* Write the prompt and the current buffer content */ + abAppend(&ab,l->prompt,strlen(l->prompt)); + abAppend(&ab,buf,len); + /* Erase to right */ + snprintf(seq,64,"\x1b[0K"); + abAppend(&ab,seq,strlen(seq)); + /* Move cursor to original position. */ + snprintf(seq,64,"\x1b[0G\x1b[%dC", (int)(pos+plen)); + abAppend(&ab,seq,strlen(seq)); + if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */ + abFree(&ab); +} + +/* Multi line low level line refresh. + * + * Rewrite the currently edited line accordingly to the buffer content, + * cursor position, and number of columns of the terminal. */ +static void refreshMultiLine(struct linenoiseState *l) { + char seq[64]; + int plen = strlen(l->prompt); + int rows = (plen+l->len+l->cols-1)/l->cols; /* rows used by current buf. */ + int rpos = (plen+l->oldpos+l->cols)/l->cols; /* cursor relative row. */ + int rpos2; /* rpos after refresh. */ + int old_rows = l->maxrows; + int fd = l->ofd, j; + struct abuf ab; + + /* Update maxrows if needed. */ + if (rows > (int)l->maxrows) l->maxrows = rows; + + /* First step: clear all the lines used before. To do so start by + * going to the last row. */ + abInit(&ab); + if (old_rows-rpos > 0) { + lndebug("go down %d", old_rows-rpos); + snprintf(seq,64,"\x1b[%dB", old_rows-rpos); + abAppend(&ab,seq,strlen(seq)); + } + + /* Now for every row clear it, go up. */ + for (j = 0; j < old_rows-1; j++) { + lndebug("clear+up"); + snprintf(seq,64,"\x1b[0G\x1b[0K\x1b[1A"); + abAppend(&ab,seq,strlen(seq)); + } + + /* Clean the top line. 
*/ + lndebug("clear"); + snprintf(seq,64,"\x1b[0G\x1b[0K"); + abAppend(&ab,seq,strlen(seq)); + + /* Write the prompt and the current buffer content */ + abAppend(&ab,l->prompt,strlen(l->prompt)); + abAppend(&ab,l->buf,l->len); + + /* If we are at the very end of the screen with our prompt, we need to + * emit a newline and move the prompt to the first column. */ + if (l->pos && + l->pos == l->len && + (l->pos+plen) % l->cols == 0) + { + lndebug(""); + abAppend(&ab,"\n",1); + snprintf(seq,64,"\x1b[0G"); + abAppend(&ab,seq,strlen(seq)); + rows++; + if (rows > (int)l->maxrows) l->maxrows = rows; + } + + /* Move cursor to right position. */ + rpos2 = (plen+l->pos+l->cols)/l->cols; /* current cursor relative row. */ + lndebug("rpos2 %d", rpos2); + + /* Go up till we reach the expected positon. */ + if (rows-rpos2 > 0) { + lndebug("go-up %d", rows-rpos2); + snprintf(seq,64,"\x1b[%dA", rows-rpos2); + abAppend(&ab,seq,strlen(seq)); + } + + /* Set column. */ + lndebug("set col %d", 1+((plen+(int)l->pos) % (int)l->cols)); + snprintf(seq,64,"\x1b[%dG", 1+((plen+(int)l->pos) % (int)l->cols)); + abAppend(&ab,seq,strlen(seq)); + + lndebug("\n"); + l->oldpos = l->pos; + + if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */ + abFree(&ab); +} + +/* Calls the two low level functions refreshSingleLine() or + * refreshMultiLine() according to the selected mode. */ +static void refreshLine(struct linenoiseState *l) { + if (mlmode) + refreshMultiLine(l); + else + refreshSingleLine(l); +} + +/* Insert the character 'c' at cursor current position. + * + * On error writing to the terminal -1 is returned, otherwise 0. */ +int linenoiseEditInsert(struct linenoiseState *l, char c) { + if (l->len < l->buflen) { + if (l->len == l->pos) { + l->buf[l->pos] = c; + l->pos++; + l->len++; + l->buf[l->len] = '\0'; + if ((!mlmode && l->plen+l->len < l->cols) /* || mlmode */) { + /* Avoid a full update of the line in the + * trivial case. */ + if (write(l->ofd,&c,1) == -1) return -1; + } else { + refreshLine(l); + } + } else { + memmove(l->buf+l->pos+1,l->buf+l->pos,l->len-l->pos); + l->buf[l->pos] = c; + l->len++; + l->pos++; + l->buf[l->len] = '\0'; + refreshLine(l); + } + } + return 0; +} + +/* Move cursor on the left. */ +void linenoiseEditMoveLeft(struct linenoiseState *l) { + if (l->pos > 0) { + l->pos--; + refreshLine(l); + } +} + +/* Move cursor on the right. */ +void linenoiseEditMoveRight(struct linenoiseState *l) { + if (l->pos != l->len) { + l->pos++; + refreshLine(l); + } +} + +/* Move cursor to the start of the line. */ +void linenoiseEditMoveHome(struct linenoiseState *l) { + if (l->pos != 0) { + l->pos = 0; + refreshLine(l); + } +} + +/* Move cursor to the end of the line. */ +void linenoiseEditMoveEnd(struct linenoiseState *l) { + if (l->pos != l->len) { + l->pos = l->len; + refreshLine(l); + } +} + +/* Substitute the currently edited line with the next or previous history + * entry as specified by 'dir'. */ +#define LINENOISE_HISTORY_NEXT 0 +#define LINENOISE_HISTORY_PREV 1 +void linenoiseEditHistoryNext(struct linenoiseState *l, int dir) { + if (history_len > 1) { + /* Update the current history entry before to + * overwrite it with the next one. */ + free(history[history_len - 1 - l->history_index]); + history[history_len - 1 - l->history_index] = strdup(l->buf); + /* Show the new entry */ + l->history_index += (dir == LINENOISE_HISTORY_PREV) ? 
1 : -1; + if (l->history_index < 0) { + l->history_index = 0; + return; + } else if (l->history_index >= history_len) { + l->history_index = history_len-1; + return; + } + strncpy(l->buf,history[history_len - 1 - l->history_index],l->buflen); + l->buf[l->buflen-1] = '\0'; + l->len = l->pos = strlen(l->buf); + refreshLine(l); + } +} + +/* Delete the character at the right of the cursor without altering the cursor + * position. Basically this is what happens with the "Delete" keyboard key. */ +void linenoiseEditDelete(struct linenoiseState *l) { + if (l->len > 0 && l->pos < l->len) { + memmove(l->buf+l->pos,l->buf+l->pos+1,l->len-l->pos-1); + l->len--; + l->buf[l->len] = '\0'; + refreshLine(l); + } +} + +/* Backspace implementation. */ +void linenoiseEditBackspace(struct linenoiseState *l) { + if (l->pos > 0 && l->len > 0) { + memmove(l->buf+l->pos-1,l->buf+l->pos,l->len-l->pos); + l->pos--; + l->len--; + l->buf[l->len] = '\0'; + refreshLine(l); } } -static int linenoisePrompt(int fd, char *buf, size_t buflen, const char *prompt) { - size_t plen = strlen(prompt); - size_t pos = 0; - size_t len = 0; - size_t cols = getColumns(); - int history_index = 0; - size_t old_pos; +/* Delete the previosu word, maintaining the cursor at the start of the + * current word. */ +void linenoiseEditDeletePrevWord(struct linenoiseState *l) { + size_t old_pos = l->pos; size_t diff; - buf[0] = '\0'; - buflen--; /* Make sure there is always space for the nulterm */ + while (l->pos > 0 && l->buf[l->pos-1] == ' ') + l->pos--; + while (l->pos > 0 && l->buf[l->pos-1] != ' ') + l->pos--; + diff = old_pos - l->pos; + memmove(l->buf+l->pos,l->buf+old_pos,l->len-old_pos+1); + l->len -= diff; + refreshLine(l); +} + +/* This function is the core of the line editing capability of linenoise. + * It expects 'fd' to be already in "raw mode" so that every key pressed + * will be returned ASAP to read(). + * + * The resulting string is put into 'buf' when the user type enter, or + * when ctrl+d is typed. + * + * The function returns the length of the current buffer. */ +static int linenoiseEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) +{ + struct linenoiseState l; + + /* Populate the linenoise state that we pass to functions implementing + * specific editing functionalities. */ + l.ifd = stdin_fd; + l.ofd = stdout_fd; + l.buf = buf; + l.buflen = buflen; + l.prompt = prompt; + l.plen = strlen(prompt); + l.oldpos = l.pos = 0; + l.len = 0; + l.cols = getColumns(stdin_fd, stdout_fd); + l.maxrows = 0; + l.history_index = 0; + + /* Buffer starts empty. */ + l.buf[0] = '\0'; + l.buflen--; /* Make sure there is always space for the nulterm */ /* The latest history entry is always our current buffer, that * initially is just an empty string. */ linenoiseHistoryAdd(""); - if (write(fd,prompt,plen) == -1) return -1; + if (write(l.ofd,prompt,l.plen) == -1) return -1; while(1) { char c; int nread; - char seq[2], seq2[2]; + char seq[3]; - nread = read(fd,&c,1); - if (nread <= 0) return len; + nread = read(l.ifd,&c,1); + if (nread <= 0) return l.len; /* Only autocomplete when the callback is set. It returns < 0 when * there was an error reading from fd. Otherwise it will return the * character that should be handled next. 
*/ if (c == 9 && completionCallback != NULL) { - c = completeLine(fd,prompt,buf,buflen,&len,&pos,cols); + c = completeLine(&l); /* Return on errors */ - if (c < 0) return len; + if (c < 0) return l.len; /* Read next character when 0 */ if (c == 0) continue; } switch(c) { - case 13: /* enter */ + case ENTER: /* enter */ history_len--; free(history[history_len]); - return (int)len; - case 3: /* ctrl-c */ + return (int)l.len; + case CTRL_C: /* ctrl-c */ errno = EAGAIN; return -1; - case 127: /* backspace */ + case BACKSPACE: /* backspace */ case 8: /* ctrl-h */ - if (pos > 0 && len > 0) { - memmove(buf+pos-1,buf+pos,len-pos); - pos--; - len--; - buf[len] = '\0'; - refreshLine(fd,prompt,buf,len,pos,cols); - } + linenoiseEditBackspace(&l); break; - case 4: /* ctrl-d, remove char at right of cursor */ - if (len > 1 && pos < (len-1)) { - memmove(buf+pos,buf+pos+1,len-pos); - len--; - buf[len] = '\0'; - refreshLine(fd,prompt,buf,len,pos,cols); - } else if (len == 0) { + case CTRL_D: /* ctrl-d, remove char at right of cursor, or of the + line is empty, act as end-of-file. */ + if (l.len > 0) { + linenoiseEditDelete(&l); + } else { history_len--; free(history[history_len]); return -1; } break; - case 20: /* ctrl-t */ - if (pos > 0 && pos < len) { - int aux = buf[pos-1]; - buf[pos-1] = buf[pos]; - buf[pos] = aux; - if (pos != len-1) pos++; - refreshLine(fd,prompt,buf,len,pos,cols); + case CTRL_T: /* ctrl-t, swaps current character with previous. */ + if (l.pos > 0 && l.pos < l.len) { + int aux = buf[l.pos-1]; + buf[l.pos-1] = buf[l.pos]; + buf[l.pos] = aux; + if (l.pos != l.len-1) l.pos++; + refreshLine(&l); } break; - case 2: /* ctrl-b */ - goto left_arrow; - case 6: /* ctrl-f */ - goto right_arrow; - case 16: /* ctrl-p */ - seq[1] = 65; - goto up_down_arrow; - case 14: /* ctrl-n */ - seq[1] = 66; - goto up_down_arrow; + case CTRL_B: /* ctrl-b */ + linenoiseEditMoveLeft(&l); break; - case 27: /* escape sequence */ - if (read(fd,seq,2) == -1) break; - if (seq[0] == 91 && seq[1] == 68) { -left_arrow: - /* left arrow */ - if (pos > 0) { - pos--; - refreshLine(fd,prompt,buf,len,pos,cols); - } - } else if (seq[0] == 91 && seq[1] == 67) { -right_arrow: - /* right arrow */ - if (pos != len) { - pos++; - refreshLine(fd,prompt,buf,len,pos,cols); - } - } else if (seq[0] == 91 && (seq[1] == 65 || seq[1] == 66)) { -up_down_arrow: - /* up and down arrow: history */ - if (history_len > 1) { - /* Update the current history entry before to - * overwrite it with tne next one. */ - free(history[history_len-1-history_index]); - history[history_len-1-history_index] = strdup(buf); - /* Show the new entry */ - history_index += (seq[1] == 65) ? 1 : -1; - if (history_index < 0) { - history_index = 0; + case CTRL_F: /* ctrl-f */ + linenoiseEditMoveRight(&l); + break; + case CTRL_P: /* ctrl-p */ + linenoiseEditHistoryNext(&l, LINENOISE_HISTORY_PREV); + break; + case CTRL_N: /* ctrl-n */ + linenoiseEditHistoryNext(&l, LINENOISE_HISTORY_NEXT); + break; + case ESC: /* escape sequence */ + /* Read the next two bytes representing the escape sequence. + * Use two calls to handle slow terminals returning the two + * chars at different times. */ + if (read(l.ifd,seq,1) == -1) break; + if (read(l.ifd,seq+1,1) == -1) break; + + /* ESC [ sequences. */ + if (seq[0] == '[') { + if (seq[1] >= '0' && seq[1] <= '9') { + /* Extended escape, read additional byte. */ + if (read(l.ifd,seq+2,1) == -1) break; + if (seq[2] == '~') { + switch(seq[1]) { + case '3': /* Delete key. 
*/ + linenoiseEditDelete(&l); + break; + } + } + } else { + switch(seq[1]) { + case 'A': /* Up */ + linenoiseEditHistoryNext(&l, LINENOISE_HISTORY_PREV); + break; + case 'B': /* Down */ + linenoiseEditHistoryNext(&l, LINENOISE_HISTORY_NEXT); + break; + case 'C': /* Right */ + linenoiseEditMoveRight(&l); + break; + case 'D': /* Left */ + linenoiseEditMoveLeft(&l); + break; + case 'H': /* Home */ + linenoiseEditMoveHome(&l); break; - } else if (history_index >= history_len) { - history_index = history_len-1; + case 'F': /* End*/ + linenoiseEditMoveEnd(&l); break; } - strncpy(buf,history[history_len-1-history_index],buflen); - buf[buflen] = '\0'; - len = pos = strlen(buf); - refreshLine(fd,prompt,buf,len,pos,cols); } - } else if (seq[0] == 91 && seq[1] > 48 && seq[1] < 55) { - /* extended escape */ - if (read(fd,seq2,2) == -1) break; - if (seq[1] == 51 && seq2[0] == 126) { - /* delete */ - if (len > 0 && pos < len) { - memmove(buf+pos,buf+pos+1,len-pos-1); - len--; - buf[len] = '\0'; - refreshLine(fd,prompt,buf,len,pos,cols); - } + } + + /* ESC O sequences. */ + else if (seq[0] == 'O') { + switch(seq[1]) { + case 'H': /* Home */ + linenoiseEditMoveHome(&l); + break; + case 'F': /* End*/ + linenoiseEditMoveEnd(&l); + break; } } break; default: - if (len < buflen) { - if (len == pos) { - buf[pos] = c; - pos++; - len++; - buf[len] = '\0'; - if (plen+len < cols) { - /* Avoid a full update of the line in the - * trivial case. */ - if (write(fd,&c,1) == -1) return -1; - } else { - refreshLine(fd,prompt,buf,len,pos,cols); - } - } else { - memmove(buf+pos+1,buf+pos,len-pos); - buf[pos] = c; - len++; - pos++; - buf[len] = '\0'; - refreshLine(fd,prompt,buf,len,pos,cols); - } - } + if (linenoiseEditInsert(&l,c)) return -1; break; - case 21: /* Ctrl+u, delete the whole line. */ + case CTRL_U: /* Ctrl+u, delete the whole line. */ buf[0] = '\0'; - pos = len = 0; - refreshLine(fd,prompt,buf,len,pos,cols); + l.pos = l.len = 0; + refreshLine(&l); break; - case 11: /* Ctrl+k, delete from current to end of line. */ - buf[pos] = '\0'; - len = pos; - refreshLine(fd,prompt,buf,len,pos,cols); + case CTRL_K: /* Ctrl+k, delete from current to end of line. */ + buf[l.pos] = '\0'; + l.len = l.pos; + refreshLine(&l); break; - case 1: /* Ctrl+a, go to the start of the line */ - pos = 0; - refreshLine(fd,prompt,buf,len,pos,cols); + case CTRL_A: /* Ctrl+a, go to the start of the line */ + linenoiseEditMoveHome(&l); break; - case 5: /* ctrl+e, go to the end of the line */ - pos = len; - refreshLine(fd,prompt,buf,len,pos,cols); + case CTRL_E: /* ctrl+e, go to the end of the line */ + linenoiseEditMoveEnd(&l); break; - case 12: /* ctrl+l, clear screen */ + case CTRL_L: /* ctrl+l, clear screen */ linenoiseClearScreen(); - refreshLine(fd,prompt,buf,len,pos,cols); + refreshLine(&l); break; - case 23: /* ctrl+w, delete previous word */ - old_pos = pos; - while (pos > 0 && buf[pos-1] == ' ') - pos--; - while (pos > 0 && buf[pos-1] != ' ') - pos--; - diff = old_pos - pos; - memmove(&buf[pos], &buf[old_pos], len-old_pos+1); - len -= diff; - refreshLine(fd,prompt,buf,len,pos,cols); + case CTRL_W: /* ctrl+w, delete previous word */ + linenoiseEditDeletePrevWord(&l); break; } } - return len; + return l.len; +} + +/* This special mode is used by linenoise in order to print scan codes + * on screen for debugging / development purposes. It is implemented + * by the linenoise_example program using the --keycodes option. 
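For reference, the two-byte tails handled by the switch above map to editing actions as in this standalone, illustrative decoder (not the library's code; the --keycodes mode described next prints the same raw bytes as they arrive):

    #include <stdio.h>

    static const char *csiTailName(char a, char b) {
        if (a != '[') return "not an ESC [ sequence";
        switch (b) {
        case 'A': return "up (history prev)";
        case 'B': return "down (history next)";
        case 'C': return "right";
        case 'D': return "left";
        case 'H': return "home";
        case 'F': return "end";
        default:  return "other/extended";
        }
    }

    int main(void) {
        printf("%s\n", csiTailName('[', 'A')); /* prints: up (history prev) */
        return 0;
    }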
*/ +void linenoisePrintKeyCodes(void) { + char quit[4]; + + printf("Linenoise key codes debugging mode.\n" + "Press keys to see scan codes. Type 'quit' at any time to exit.\n"); + if (enableRawMode(STDIN_FILENO) == -1) return; + memset(quit,' ',4); + while(1) { + char c; + int nread; + + nread = read(STDIN_FILENO,&c,1); + if (nread <= 0) continue; + memmove(quit,quit+1,sizeof(quit)-1); /* shift string to left. */ + quit[sizeof(quit)-1] = c; /* Insert current char on the right. */ + if (memcmp(quit,"quit",sizeof(quit)) == 0) break; + + printf("'%c' %02x (%d) (type quit to exit)\n", + isprint(c) ? c : '?', (int)c, (int)c); + printf("\x1b[0G"); /* Go left edge manually, we are in raw mode. */ + fflush(stdout); + } + disableRawMode(STDIN_FILENO); } +/* This function calls the line editing function linenoiseEdit() using + * the STDIN file descriptor set in raw mode. */ static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { - int fd = STDIN_FILENO; int count; if (buflen == 0) { @@ -496,6 +920,7 @@ static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { return -1; } if (!isatty(STDIN_FILENO)) { + /* Not a tty: read from file / pipe. */ if (fgets(buf, buflen, stdin) == NULL) return -1; count = strlen(buf); if (count && buf[count-1] == '\n') { @@ -503,14 +928,20 @@ static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) { buf[count] = '\0'; } } else { - if (enableRawMode(fd) == -1) return -1; - count = linenoisePrompt(fd, buf, buflen, prompt); - disableRawMode(fd); + /* Interactive editing. */ + if (enableRawMode(STDIN_FILENO) == -1) return -1; + count = linenoiseEdit(STDIN_FILENO, STDOUT_FILENO, buf, buflen, prompt); + disableRawMode(STDIN_FILENO); printf("\n"); } return count; } +/* The high level function that is the main API of the linenoise library. + * This function checks if the terminal has basic capabilities, just checking + * for a blacklist of stupid terminals, and later either calls the line + * editing function or uses dummy fgets() so that you will be able to type + * something even in the most desperate of the conditions. */ char *linenoise(const char *prompt) { char buf[LINENOISE_MAX_LINE]; int count; @@ -534,29 +965,50 @@ char *linenoise(const char *prompt) { } } -/* Register a callback function to be called for tab-completion. */ -void linenoiseSetCompletionCallback(linenoiseCompletionCallback *fn) { - completionCallback = fn; +/* ================================ History ================================= */ + +/* Free the history, but does not reset it. Only used when we have to + * exit() to avoid memory leaks are reported by valgrind & co. */ +static void freeHistory(void) { + if (history) { + int j; + + for (j = 0; j < history_len; j++) + free(history[j]); + free(history); + } } -void linenoiseAddCompletion(linenoiseCompletions *lc, char *str) { - size_t len = strlen(str); - char *copy = malloc(len+1); - memcpy(copy,str,len+1); - lc->cvec = realloc(lc->cvec,sizeof(char*)*(lc->len+1)); - lc->cvec[lc->len++] = copy; +/* At exit we'll try to fix the terminal to the initial conditions. */ +static void linenoiseAtExit(void) { + disableRawMode(STDIN_FILENO); + freeHistory(); } -/* Using a circular buffer is smarter, but a bit more complex to handle. */ +/* This is the API call to add a new entry in the linenoise history. 
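Putting the pieces together, a typical embedding is a plain read-eval loop. This is only a sketch (the prompt text and history file name are illustrative), but the calls are the ones exported by linenoise.h, and the returned line is heap allocated, so the caller frees it:

    #include <stdio.h>
    #include <stdlib.h>
    #include "linenoise.h"

    int main(void) {
        char *line;

        linenoiseHistoryLoad("history.txt");        /* Missing file on first run is fine. */
        while ((line = linenoise("repl> ")) != NULL) {
            if (line[0] != '\0') {
                printf("echo: %s\n", line);
                linenoiseHistoryAdd(line);          /* Duplicate of the last entry is skipped. */
                linenoiseHistorySave("history.txt");
            }
            free(line);
        }
        return 0;
    }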
+ * It uses a fixed array of char pointers that are shifted (memmoved) + * when the history max length is reached in order to remove the older + * entry and make room for the new one, so it is not exactly suitable for huge + * histories, but will work well for a few hundred of entries. + * + * Using a circular buffer is smarter, but a bit more complex to handle. */ int linenoiseHistoryAdd(const char *line) { char *linecopy; if (history_max_len == 0) return 0; + + /* Initialization on first call. */ if (history == NULL) { history = malloc(sizeof(char*)*history_max_len); if (history == NULL) return 0; memset(history,0,(sizeof(char*)*history_max_len)); } + + /* Don't add duplicated lines. */ + if (history_len && !strcmp(history[history_len-1], line)) return 0; + + /* Add an heap allocated copy of the line in the history. + * If we reached the max length, remove the older line. */ linecopy = strdup(line); if (!linecopy) return 0; if (history_len == history_max_len) { @@ -569,6 +1021,10 @@ int linenoiseHistoryAdd(const char *line) { return 1; } +/* Set the maximum length for the history. This function can be called even + * if there is already some history, the function will make sure to retain + * just the latest 'len' elements if the new history length value is smaller + * than the amount of items already inside the history. */ int linenoiseHistorySetMaxLen(int len) { char **new; @@ -578,8 +1034,16 @@ int linenoiseHistorySetMaxLen(int len) { new = malloc(sizeof(char*)*len); if (new == NULL) return 0; - if (len < tocopy) tocopy = len; - memcpy(new,history+(history_max_len-tocopy), sizeof(char*)*tocopy); + + /* If we can't copy everything, free the elements we'll not use. */ + if (len < tocopy) { + int j; + + for (j = 0; j < tocopy-len; j++) free(history[j]); + tocopy = len; + } + memset(new,0,sizeof(char*)*len); + memcpy(new,history+(history_len-tocopy), sizeof(char*)*tocopy); free(history); history = new; } @@ -591,7 +1055,7 @@ int linenoiseHistorySetMaxLen(int len) { /* Save the history in the specified file. On success 0 is returned * otherwise -1 is returned. */ -int linenoiseHistorySave(char *filename) { +int linenoiseHistorySave(const char *filename) { FILE *fp = fopen(filename,"w"); int j; @@ -607,7 +1071,7 @@ int linenoiseHistorySave(char *filename) { * * If the file exists and the operation succeeded 0 is returned, otherwise * on error -1 is returned. */ -int linenoiseHistoryLoad(char *filename) { +int linenoiseHistoryLoad(const char *filename) { FILE *fp = fopen(filename,"r"); char buf[LINENOISE_MAX_LINE]; diff --git a/deps/linenoise/linenoise.h b/deps/linenoise/linenoise.h index 76a703c28..e22ebd3fd 100644 --- a/deps/linenoise/linenoise.h +++ b/deps/linenoise/linenoise.h @@ -3,39 +3,44 @@ * * See linenoise.c for more information. * + * ------------------------------------------------------------------------ + * * Copyright (c) 2010, Salvatore Sanfilippo * Copyright (c) 2010, Pieter Noordhuis * * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright + * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * * Neither the name of Redis nor the names of its contributors may be used - * to endorse or promote products derived from this software without - * specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __LINENOISE_H #define __LINENOISE_H +#ifdef __cplusplus +extern "C" { +#endif + typedef struct linenoiseCompletions { size_t len; char **cvec; @@ -43,13 +48,19 @@ typedef struct linenoiseCompletions { typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *); void linenoiseSetCompletionCallback(linenoiseCompletionCallback *); -void linenoiseAddCompletion(linenoiseCompletions *, char *); +void linenoiseAddCompletion(linenoiseCompletions *, const char *); char *linenoise(const char *prompt); int linenoiseHistoryAdd(const char *line); int linenoiseHistorySetMaxLen(int len); -int linenoiseHistorySave(char *filename); -int linenoiseHistoryLoad(char *filename); +int linenoiseHistorySave(const char *filename); +int linenoiseHistoryLoad(const char *filename); void linenoiseClearScreen(void); +void linenoiseSetMultiLine(int ml); +void linenoisePrintKeyCodes(void); + +#ifdef __cplusplus +} +#endif #endif /* __LINENOISE_H */ diff --git a/redis.conf b/redis.conf index 7fb4e4953..00a2f9193 100644 --- a/redis.conf +++ b/redis.conf @@ -44,6 +44,15 @@ pidfile /var/run/redis.pid # If port 0 is specified Redis will not listen on a TCP socket. port 6379 +# TCP listen() backlog. +# +# In high requests-per-second environments you need an high backlog in order +# to avoid slow clients connections issues. 
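The value configured here is handed straight to listen(2), as the anetListen() change later in this diff shows. An illustrative, reduced version of that path:

    #include <sys/socket.h>

    /* Sketch only: bind + listen with a configurable backlog. The kernel may
     * cap the value (see the note on somaxconn just below). */
    static int startListening(int s, struct sockaddr *sa, socklen_t len, int backlog) {
        if (bind(s, sa, len) == -1) return -1;
        if (listen(s, backlog) == -1) return -1;   /* e.g. backlog = 511 */
        return 0;
    }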
Note that the Linux kernel +# will silently truncate it to the value of /proc/sys/net/core/somaxconn so +# make sure to raise both the value of somaxconn and tcp_max_syn_backlog +# in order to get the desired effect. +tcp-backlog 511 + # By default Redis listens for connections from all the network interfaces # available on the server. It is possible to listen to just one or multiple # interfaces using the "bind" configuration directive, followed by one or @@ -407,15 +416,18 @@ slave-priority 100 # # The default is: # -# maxmemory-policy volatile-lru +# maxmemory-policy noeviction # LRU and minimal TTL algorithms are not precise algorithms but approximated -# algorithms (in order to save memory), so you can select as well the sample -# size to check. For instance for default Redis will check three keys and -# pick the one that was used less recently, you can change the sample size -# using the following configuration directive. +# algorithms (in order to save memory), so you can tune it for speed or +# accuracy. For default Redis will check five keys and pick the one that was +# used less recently, you can change the sample size using the following +# configuration directive. +# +# The default of 5 produces good enough results. 10 Approximates very closely +# true LRU but costs a bit more CPU. 3 is very fast but not very accurate. # -# maxmemory-samples 3 +# maxmemory-samples 5 ############################## APPEND ONLY MODE ############################### @@ -551,6 +563,25 @@ lua-time-limit 5000 # # cluster-node-timeout 15000 +# Cluster slaves are able to migrate to orphaned masters, that are masters +# that are left without working slaves. This improves the cluster ability +# to resist to failures as otherwise an orphaned master can't be failed over +# in case of failure if it has no working slaves. +# +# Slaves migrate to orphaned masters only if there are still at least a +# given number of other working slaves for their old master. This number +# is the "migration barrier". A migration barrier of 1 means that a slave +# will migrate only if there is at least 1 other working slave for its master +# and so forth. It usually reflects the number of slaves you want for every +# master in your cluster. +# +# Default is 1 (slaves migrate only if their masters remain with at least +# one slave). To disable migration just set it to a very large value. +# A value of 0 can be set but is useful only for debugging and dangerous +# in production. +# +# cluster-migration-barrier 1 + # In order to setup your cluster make sure to read the documentation # available at http://redis.io web site. @@ -651,6 +682,20 @@ set-max-intset-entries 512 zset-max-ziplist-entries 128 zset-max-ziplist-value 64 +# HyperLogLog sparse representation bytes limit. The limit includes the +# 16 bytes header. When an HyperLogLog using the sparse representation crosses +# this limit, it is converted into the dense representation. +# +# A value greater than 16000 is totally useless, since at that point the +# dense representation is more memory efficient. +# +# The suggested value is ~ 3000 in order to have the benefits of +# the space efficient encoding without slowing down too much PFADD, +# which is O(N) with the sparse encoding. The value can be raised to +# ~ 10000 when CPU is not a concern, but space is, and the data set is +# composed of many HyperLogLogs with cardinality in the 0 - 15000 range. 
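For context on the 16000 byte ceiling mentioned above: the dense encoding stores 2^14 = 16384 six-bit registers, i.e. 16384 * 6 / 8 = 12288 bytes, plus the 16 byte header, roughly 12304 bytes in total. A sparse representation larger than that is already bigger than the dense one, so limits beyond ~12 KB (and certainly beyond 16000) cannot save memory.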
+hll-sparse-max-bytes 3000 + # Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in # order to help rehashing the main Redis hash table (the one mapping top-level # keys to values). The hash table implementation Redis uses (see dict.c) diff --git a/runtest-cluster b/runtest-cluster new file mode 100755 index 000000000..27829a5fe --- /dev/null +++ b/runtest-cluster @@ -0,0 +1,14 @@ +#!/bin/sh +TCL_VERSIONS="8.5 8.6" +TCLSH="" + +for VERSION in $TCL_VERSIONS; do + TCL=`which tclsh$VERSION 2>/dev/null` && TCLSH=$TCL +done + +if [ -z $TCLSH ] +then + echo "You need tcl 8.5 or newer in order to run the Redis Sentinel test" + exit 1 +fi +$TCLSH tests/cluster/run.tcl $* diff --git a/runtest-sentinel b/runtest-sentinel new file mode 100755 index 000000000..3fb1ef615 --- /dev/null +++ b/runtest-sentinel @@ -0,0 +1,14 @@ +#!/bin/sh +TCL_VERSIONS="8.5 8.6" +TCLSH="" + +for VERSION in $TCL_VERSIONS; do + TCL=`which tclsh$VERSION 2>/dev/null` && TCLSH=$TCL +done + +if [ -z $TCLSH ] +then + echo "You need tcl 8.5 or newer in order to run the Redis Sentinel test" + exit 1 +fi +$TCLSH tests/sentinel/run.tcl $* diff --git a/sentinel.conf b/sentinel.conf index e44342221..114b8474f 100644 --- a/sentinel.conf +++ b/sentinel.conf @@ -4,6 +4,13 @@ # The port that this sentinel instance will run on port 26379 +# dir +# Every long running process should have a well-defined working directory. +# For Redis Sentinel to chdir to /tmp at startup is the simplest thing +# for the process to don't interferer with administrative tasks such as +# unmounting filesystems. +dir /tmp + # sentinel monitor # # Tells Sentinel to monitor this master, and to consider it in O_DOWN @@ -86,10 +93,10 @@ sentinel failover-timeout mymaster 180000 # or to reconfigure clients after a failover. The scripts are executed # with the following rules for error handling: # -# If script exists with "1" the execution is retried later (up to a maximum +# If script exits with "1" the execution is retried later (up to a maximum # number of times currently set to 10). # -# If script exists with "2" (or an higher value) the script execution is +# If script exits with "2" (or an higher value) the script execution is # not retried. 
# # If script terminates because it receives a signal the behavior is the same diff --git a/src/Makefile b/src/Makefile index 0b4cff7a1..289371666 100644 --- a/src/Makefile +++ b/src/Makefile @@ -107,7 +107,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o +REDIS_SERVER_OBJ=adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o sds.o adlist.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o REDIS_BENCHMARK_NAME=redis-benchmark @@ -204,6 +204,9 @@ distclean: clean test: $(REDIS_SERVER_NAME) $(REDIS_CHECK_AOF_NAME) @(cd ..; ./runtest) +test-sentinel: $(REDIS_SENTINEL_NAME) + @(cd ..; ./runtest-sentinel) + check: test lcov: diff --git a/src/Makefile.dep b/src/Makefile.dep index b66e00df4..d118050fd 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -14,6 +14,9 @@ bio.o: bio.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ bitops.o: bitops.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ ziplist.h intset.h version.h util.h rdb.h rio.h +blocked.o: blocked.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.o: cluster.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h endianconv.h @@ -32,6 +35,10 @@ debug.o: debug.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h crc64.h bio.h dict.o: dict.c fmacros.h dict.h zmalloc.h redisassert.h endianconv.o: endianconv.c +hyperloglog.o: hyperloglog.c redis.h fmacros.h config.h \ + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ + adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ + rio.h intset.o: intset.c intset.h zmalloc.h endianconv.h config.h lzf_c.o: lzf_c.c lzfP.h lzf_d.o: lzf_d.c lzfP.h @@ -117,6 +124,3 @@ ziplist.o: ziplist.c zmalloc.h util.h sds.h ziplist.h endianconv.h \ config.h redisassert.h zipmap.o: zipmap.c zmalloc.h endianconv.h config.h zmalloc.o: zmalloc.c config.h zmalloc.h -blocked.o: blocked.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h diff --git a/src/anet.c b/src/anet.c index a42fde304..cc850a1f8 100644 --- a/src/anet.c +++ b/src/anet.c @@ -261,11 +261,12 @@ static int anetCreateSocket(char *err, int domain) { #define ANET_CONNECT_NONE 0 #define ANET_CONNECT_NONBLOCK 1 -static int 
anetTcpGenericConnect(char *err, char *addr, int port, int flags) +static int anetTcpGenericConnect(char *err, char *addr, int port, + char *source_addr, int flags) { int s = ANET_ERR, rv; char portstr[6]; /* strlen("65535") + 1; */ - struct addrinfo hints, *servinfo, *p; + struct addrinfo hints, *servinfo, *bservinfo, *p, *b; snprintf(portstr,sizeof(portstr),"%d",port); memset(&hints,0,sizeof(hints)); @@ -285,6 +286,24 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; if (flags & ANET_CONNECT_NONBLOCK && anetNonBlock(err,s) != ANET_OK) goto error; + if (source_addr) { + int bound = 0; + /* Using getaddrinfo saves us from self-determining IPv4 vs IPv6 */ + if ((rv = getaddrinfo(source_addr, NULL, &hints, &bservinfo)) != 0) { + anetSetError(err, "%s", gai_strerror(rv)); + goto end; + } + for (b = bservinfo; b != NULL; b = b->ai_next) { + if (bind(s,b->ai_addr,b->ai_addrlen) != -1) { + bound = 1; + break; + } + } + if (!bound) { + anetSetError(err, "bind: %s", strerror(errno)); + goto end; + } + } if (connect(s,p->ai_addr,p->ai_addrlen) == -1) { /* If the socket is non-blocking, it is ok for connect() to * return an EINPROGRESS error here. */ @@ -317,7 +336,7 @@ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags) */ int anetTcpConnect(char *err, char *addr, int port) { - return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONE); + return anetTcpGenericConnect(err,addr,port,NULL,ANET_CONNECT_NONE); } /* @@ -325,7 +344,12 @@ int anetTcpConnect(char *err, char *addr, int port) */ int anetTcpNonBlockConnect(char *err, char *addr, int port) { - return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONBLOCK); + return anetTcpGenericConnect(err,addr,port,NULL,ANET_CONNECT_NONBLOCK); +} + +int anetTcpNonBlockBindConnect(char *err, char *addr, int port, char *source_addr) +{ + return anetTcpGenericConnect(err,addr,port,source_addr,ANET_CONNECT_NONBLOCK); } int anetUnixGenericConnect(char *err, char *path, int flags) @@ -409,17 +433,14 @@ int anetWrite(int fd, char *buf, int count) /* * 绑定并创建监听套接字 */ -static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len) { +static int anetListen(char *err, int s, struct sockaddr *sa, socklen_t len, int backlog) { if (bind(s,sa,len) == -1) { anetSetError(err, "bind: %s", strerror(errno)); close(s); return ANET_ERR; } - /* Use a backlog of 512 entries. 
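The new anetTcpNonBlockBindConnect() above performs a non-blocking connect while first bind()-ing the socket to a given local source address. A hypothetical call (the wrapper function and the addresses are made up):

    #include "anet.h"

    int dialFromSource(void) {
        char err[256];                 /* anet error strings are short. */
        int fd = anetTcpNonBlockBindConnect(err, "10.0.0.42", 6379, "10.0.0.7");
        if (fd == ANET_ERR) {
            /* 'err' holds the getaddrinfo()/bind()/connect() failure reason. */
            return -1;
        }
        return fd;
    }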
We pass 511 to the listen() call because - * the kernel does: backlogsize = roundup_pow_of_two(backlogsize + 1); - * which will thus give us a backlog of 512 entries */ - if (listen(s, 511) == -1) { + if (listen(s, backlog) == -1) { anetSetError(err, "listen: %s", strerror(errno)); close(s); return ANET_ERR; @@ -437,7 +458,7 @@ static int anetV6Only(char *err, int s) { return ANET_OK; } -static int _anetTcpServer(char *err, int port, char *bindaddr, int af) +static int _anetTcpServer(char *err, int port, char *bindaddr, int af, int backlog) { int s, rv; char _port[6]; /* strlen("65535") */ @@ -459,7 +480,7 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af) if (af == AF_INET6 && anetV6Only(err,s) == ANET_ERR) goto error; if (anetSetReuseAddr(err,s) == ANET_ERR) goto error; - if (anetListen(err,s,p->ai_addr,p->ai_addrlen) == ANET_ERR) goto error; + if (anetListen(err,s,p->ai_addr,p->ai_addrlen,backlog) == ANET_ERR) goto error; goto end; } if (p == NULL) { @@ -474,20 +495,20 @@ static int _anetTcpServer(char *err, int port, char *bindaddr, int af) return s; } -int anetTcpServer(char *err, int port, char *bindaddr) +int anetTcpServer(char *err, int port, char *bindaddr, int backlog) { - return _anetTcpServer(err, port, bindaddr, AF_INET); + return _anetTcpServer(err, port, bindaddr, AF_INET, backlog); } -int anetTcp6Server(char *err, int port, char *bindaddr) +int anetTcp6Server(char *err, int port, char *bindaddr, int backlog) { - return _anetTcpServer(err, port, bindaddr, AF_INET6); + return _anetTcpServer(err, port, bindaddr, AF_INET6, backlog); } /* * 创建一个本地连接用的服务器监听套接字 */ -int anetUnixServer(char *err, char *path, mode_t perm) +int anetUnixServer(char *err, char *path, mode_t perm, int backlog) { int s; struct sockaddr_un sa; @@ -498,7 +519,7 @@ int anetUnixServer(char *err, char *path, mode_t perm) memset(&sa,0,sizeof(sa)); sa.sun_family = AF_LOCAL; strncpy(sa.sun_path,path,sizeof(sa.sun_path)-1); - if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa)) == ANET_ERR) + if (anetListen(err,s,(struct sockaddr*)&sa,sizeof(sa),backlog) == ANET_ERR) return ANET_ERR; if (perm) chmod(sa.sun_path, perm); @@ -529,7 +550,7 @@ int anetTcpAccept(char *err, int s, char *ip, size_t ip_len, int *port) { int fd; struct sockaddr_storage sa; socklen_t salen = sizeof(sa); - if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == ANET_ERR) + if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == -1) return ANET_ERR; if (sa.ss_family == AF_INET) { @@ -551,7 +572,7 @@ int anetUnixAccept(char *err, int s) { int fd; struct sockaddr_un sa; socklen_t salen = sizeof(sa); - if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == ANET_ERR) + if ((fd = anetGenericAccept(err,s,(struct sockaddr*)&sa,&salen)) == -1) return ANET_ERR; return fd; diff --git a/src/anet.h b/src/anet.h index 2ab9398ad..c4659cd35 100644 --- a/src/anet.h +++ b/src/anet.h @@ -45,14 +45,15 @@ int anetTcpConnect(char *err, char *addr, int port); int anetTcpNonBlockConnect(char *err, char *addr, int port); +int anetTcpNonBlockBindConnect(char *err, char *addr, int port, char *source_addr); int anetUnixConnect(char *err, char *path); int anetUnixNonBlockConnect(char *err, char *path); int anetRead(int fd, char *buf, int count); int anetResolve(char *err, char *host, char *ipbuf, size_t ipbuf_len); int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len); -int anetTcpServer(char *err, int port, char *bindaddr); -int anetTcp6Server(char *err, int port, char *bindaddr); -int 
anetUnixServer(char *err, char *path, mode_t perm); +int anetTcpServer(char *err, int port, char *bindaddr, int backlog); +int anetTcp6Server(char *err, int port, char *bindaddr, int backlog); +int anetUnixServer(char *err, char *path, mode_t perm, int backlog); int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port); int anetUnixAccept(char *err, int serversock); int anetWrite(int fd, char *buf, int count); diff --git a/src/aof.c b/src/aof.c index 787533fe8..a9a1fba6a 100644 --- a/src/aof.c +++ b/src/aof.c @@ -354,6 +354,7 @@ int startAppendOnly(void) { * 不过,如果 force 为 1 的话,那么不管后台是否正在 fsync , * 程序都直接进行写入。 */ +#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */ void flushAppendOnlyFile(int force) { ssize_t nwritten; int sync_in_progress = 0; @@ -439,39 +440,80 @@ void flushAppendOnlyFile(int force) { */ nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf)); if (nwritten != (signed)sdslen(server.aof_buf)) { - /* Ooops, we are in troubles. The best thing to do for now is - * aborting instead of giving the illusion that everything is - * working as expected. - * - * 糟糕了,成功写入的字节数不等于缓存的字节数 - * 可能是磁盘满了 0 <= nwritten < sdslen(server.aof_buf) , - * 也可能是写入失败 nwritten == -1 - * - * 立即停机,向用户报告错误 - */ + + static time_t last_write_error_log = 0; + int can_log = 0; + + /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */ + // 将日志的记录频率限制在每行 AOF_WRITE_LOG_ERROR_RATE 秒 + if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) { + can_log = 1; + last_write_error_log = server.unixtime; + } + + /* Lof the AOF write error and record the error code. */ + // 如果写入出错,那么尝试将该情况写入到日志里面 if (nwritten == -1) { - // 写入出错 - redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); + if (can_log) { + redisLog(REDIS_WARNING,"Error writing to the AOF file: %s", + strerror(errno)); + server.aof_last_write_errno = errno; + } } else { - // 写入不完整 - redisLog(REDIS_WARNING,"Exiting on short write while writing to " - "the append-only file: %s (nwritten=%ld, " - "expected=%ld)", - strerror(errno), - (long)nwritten, - (long)sdslen(server.aof_buf)); + if (can_log) { + redisLog(REDIS_WARNING,"Short write while writing to " + "the AOF file: (nwritten=%lld, " + "expected=%lld)", + (long long)nwritten, + (long long)sdslen(server.aof_buf)); + } // 尝试移除新追加的不完整内容 if (ftruncate(server.aof_fd, server.aof_current_size) == -1) { - redisLog(REDIS_WARNING, "Could not remove short write " - "from the append-only file. Redis may refuse " - "to load the AOF the next time it starts. " - "ftruncate: %s", strerror(errno)); + if (can_log) { + redisLog(REDIS_WARNING, "Could not remove short write " + "from the append-only file. Redis may refuse " + "to load the AOF the next time it starts. " + "ftruncate: %s", strerror(errno)); + } + } else { + /* If the ftrunacate() succeeded we can set nwritten to + * -1 since there is no longer partial data into the AOF. */ + nwritten = -1; } + server.aof_last_write_errno = ENOSPC; } - // 服务器退出 - exit(1); + /* Handle the AOF write error. */ + if (server.aof_fsync == AOF_FSYNC_ALWAYS) { + /* We can't recover when the fsync policy is ALWAYS since the + * reply for the client is already in the output buffers, and we + * have the contract with the user that on acknowledged write data + * is synched on disk. */ + redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. 
Exiting..."); + exit(1); + } else { + /* Recover from failed write leaving data into the buffer. However + * set an error to stop accepting writes as long as the error + * condition is not cleared. */ + server.aof_last_write_status = REDIS_ERR; + + /* Trim the sds buffer if there was a partial write, and there + * was no way to undo it with ftruncate(2). */ + if (nwritten > 0) { + server.aof_current_size += nwritten; + sdsrange(server.aof_buf,nwritten,-1); + } + return; /* We'll try again on the next call... */ + } + } else { + /* Successful write(2). If AOF was in error state, restore the + * OK state and log the event. */ + if (server.aof_last_write_status == REDIS_ERR) { + redisLog(REDIS_WARNING, + "AOF write error looks solved, Redis can write again."); + server.aof_last_write_status = REDIS_OK; + } } // 更新写入后的 AOF 文件大小 @@ -748,6 +790,7 @@ struct redisClient *createFakeClient(void) { c->reply_bytes = 0; c->obuf_soft_limit_reached_time = 0; c->watched_keys = listCreate(); + c->peerid = NULL; listSetFreeMethod(c->reply,decrRefCountVoid); listSetDupMethod(c->reply,dupClientReplyValue); initClientMultiState(c); @@ -841,7 +884,7 @@ int loadAppendOnlyFile(char *filename) { */ if (!(loops++ % 1000)) { loadingProgress(ftello(fp)); - aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); + processEventsWhileBlocked(); } // 读入文件内容到缓存 @@ -1405,9 +1448,9 @@ int rewriteAppendOnlyFile(char *filename) { /* Make sure data will not remain on the OS's output buffers */ // 冲洗并关闭新 AOF 文件 - fflush(fp); - aof_fsync(fileno(fp)); - fclose(fp); + if (fflush(fp) == EOF) goto werr; + if (aof_fsync(fileno(fp)) == -1) goto werr; + if (fclose(fp) == EOF) goto werr; /* Use RENAME to make sure the DB file is changed atomically only * if the generate DB file is ok. diff --git a/src/bitops.c b/src/bitops.c index 3d24ef626..6d2e86566 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -64,19 +64,25 @@ static int getBitOffsetFromArgument(redisClient *c, robj *o, size_t *offset) { // 这个函数只能在最大为 512 MB 的字符串上使用 size_t redisPopcount(void *s, long count) { size_t bits = 0; - unsigned char *p; - uint32_t *p4 = s; - + unsigned char *p = s; + uint32_t *p4; // 通过查表来计算,对于 1 字节所能表示的值来说 // 这些值的二进制表示所带有的 1 的数量 // 比如整数 3 的二进制表示 0011 ,带有两个 1 // 正好是查表 bitsinbyte[3] == 2 static const unsigned char bitsinbyte[256] = {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8}; + /* Count initial bytes not aligned to 32 bit. */ + while((unsigned long)p & 3 && count) { + bits += bitsinbyte[*p++]; + count--; + } + /* Count bits 16 bytes at a time */ // 每次统计 16 字节 // 关于这里所使用的优化算法,可以参考: // http://yesteapea.wordpress.com/2013/03/03/counting-the-number-of-set-bits-in-an-integer/ + p4 = (uint32_t*)p; while(count>=16) { uint32_t aux1, aux2, aux3, aux4; @@ -100,13 +106,100 @@ size_t redisPopcount(void *s, long count) { ((((aux4 + (aux4 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24); } - /* Count the remaining bytes */ + /* Count the remaining bytes. 
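A slow but obviously correct reference for the counting done above is handy when testing the table/SWAR fast path; an illustrative version (not part of the patch):

    #include <stddef.h>

    static size_t popcountRef(const unsigned char *p, size_t n) {
        size_t bits = 0;
        while (n--) {
            unsigned char b = *p++;
            while (b) {              /* Kernighan's trick: clear the lowest set bit. */
                b &= (unsigned char)(b - 1);
                bits++;
            }
        }
        return bits;
    }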
*/ // 不足 16 字节的,剩下的每个字节通过查表来完成 p = (unsigned char*)p4; while(count--) bits += bitsinbyte[*p++]; return bits; } +/* Return the position of the first bit set to one (if 'bit' is 1) or + * zero (if 'bit' is 0) in the bitmap starting at 's' and long 'count' bytes. + * + * The function is guaranteed to return a value >= 0 if 'bit' is 0 since if + * no zero bit is found, it returns count*8 assuming the string is zero + * padded on the right. However if 'bit' is 1 it is possible that there is + * not a single set bit in the bitmap. In this special case -1 is returned. */ +long redisBitpos(void *s, long count, int bit) { + unsigned long *l; + unsigned char *c; + unsigned long skipval, word = 0, one; + long pos = 0; /* Position of bit, to return to the caller. */ + int j; + + /* Process whole words first, seeking for first word that is not + * all ones or all zeros respectively if we are lookig for zeros + * or ones. This is much faster with large strings having contiguous + * blocks of 1 or 0 bits compared to the vanilla bit per bit processing. + * + * Note that if we start from an address that is not aligned + * to sizeof(unsigned long) we consume it byte by byte until it is + * aligned. */ + + /* Skip initial bits not aligned to sizeof(unsigned long) byte by byte. */ + skipval = bit ? 0 : UCHAR_MAX; + c = (unsigned char*) s; + while((unsigned long)c & (sizeof(*l)-1) && count) { + if (*c != skipval) break; + c++; + count--; + pos += 8; + } + + /* Skip bits with full word step. */ + skipval = bit ? 0 : ULONG_MAX; + l = (unsigned long*) c; + while (count >= sizeof(*l)) { + if (*l != skipval) break; + l++; + count -= sizeof(*l); + pos += sizeof(*l)*8; + } + + /* Load bytes into "word" considering the first byte as the most significant + * (we basically consider it as written in big endian, since we consider the + * string as a set of bits from left to right, with the first bit at position + * zero. + * + * Note that the loading is designed to work even when the bytes left + * (count) are less than a full word. We pad it with zero on the right. */ + c = (unsigned char*)l; + for (j = 0; j < sizeof(*l); j++) { + word <<= 8; + if (count) { + word |= *c; + c++; + count--; + } + } + + /* Special case: + * If bits in the string are all zero and we are looking for one, + * return -1 to signal that there is not a single "1" in the whole + * string. This can't happen when we are looking for "0" as we assume + * that the right of the string is zero padded. */ + if (bit == 1 && word == 0) return -1; + + /* Last word left, scan bit by bit. The first thing we need is to + * have a single "1" set in the most significant position in an + * unsigned long. We don't know the size of the long so we use a + * simple trick. */ + one = ULONG_MAX; /* All bits set to 1.*/ + one >>= 1; /* All bits set to 1 but the MSB. */ + one = ~one; /* All bits set to 0 but the MSB. */ + + while(one) { + if (((one & word) != 0) == bit) return pos; + pos++; + one >>= 1; + } + + /* If we reached this point, there is a bug in the algorithm, since + * the case of no match is handled as a special case before. */ + redisPanic("End of redisBitpos() reached."); + return 0; /* Just to avoid warnings. */ +} + /* ----------------------------------------------------------------------------- * Bits related string commands: GETBIT, SETBIT, BITCOUNT, BITOP. 
* -------------------------------------------------------------------------- */ @@ -155,14 +248,7 @@ void setbitCommand(redisClient *c) { // 对象存在,检查类型是否字符串 if (checkType(c,o,REDIS_STRING)) return; - /* Create a copy when the object is shared or encoded. */ - // 如果对象被共享或者编码,那么创建一个复制对象 - if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { - robj *decoded = getDecodedObject(o); - o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); - decrRefCount(decoded); - dbOverwrite(c->db,c->argv[1],o); - } + o = dbUnshareStringValue(c->db,c->argv[1],o); } /* Grow sds value to the right length if necessary */ @@ -498,3 +584,90 @@ void bitcountCommand(redisClient *c) { addReplyLongLong(c,redisPopcount(p+start,bytes)); } } + +/* BITPOS key bit [start [end]] */ +void bitposCommand(redisClient *c) { + robj *o; + long bit, start, end, strlen; + unsigned char *p; + char llbuf[32]; + int end_given = 0; + + /* Parse the bit argument to understand what we are looking for, set + * or clear bits. */ + if (getLongFromObjectOrReply(c,c->argv[2],&bit,NULL) != REDIS_OK) + return; + if (bit != 0 && bit != 1) { + addReplyError(c, "The bit argument must be 1 or 0."); + return; + } + + /* If the key does not exist, from our point of view it is an infinite + * array of 0 bits. If the user is looking for the fist clear bit return 0, + * If the user is looking for the first set bit, return -1. */ + if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) { + addReplyLongLong(c, bit ? -1 : 0); + return; + } + if (checkType(c,o,REDIS_STRING)) return; + + /* Set the 'p' pointer to the string, that can be just a stack allocated + * array if our string was integer encoded. */ + if (o->encoding == REDIS_ENCODING_INT) { + p = (unsigned char*) llbuf; + strlen = ll2string(llbuf,sizeof(llbuf),(long)o->ptr); + } else { + p = (unsigned char*) o->ptr; + strlen = sdslen(o->ptr); + } + + /* Parse start/end range if any. */ + if (c->argc == 4 || c->argc == 5) { + if (getLongFromObjectOrReply(c,c->argv[3],&start,NULL) != REDIS_OK) + return; + if (c->argc == 5) { + if (getLongFromObjectOrReply(c,c->argv[4],&end,NULL) != REDIS_OK) + return; + end_given = 1; + } else { + end = strlen-1; + } + /* Convert negative indexes */ + if (start < 0) start = strlen+start; + if (end < 0) end = strlen+end; + if (start < 0) start = 0; + if (end < 0) end = 0; + if (end >= strlen) end = strlen-1; + } else if (c->argc == 3) { + /* The whole string. */ + start = 0; + end = strlen-1; + } else { + /* Syntax error. */ + addReply(c,shared.syntaxerr); + return; + } + + /* For empty ranges (start > end) we return -1 as an empty range does + * not contain a 0 nor a 1. */ + if (start > end) { + addReplyLongLong(c, -1); + } else { + long bytes = end-start+1; + long pos = redisBitpos(p+start,bytes,bit); + + /* If we are looking for clear bits, and the user specified an exact + * range with start-end, we can't consider the right of the range as + * zero padded (as we do when no explicit end is given). + * + * So if redisBitpos() returns the first bit outside the range, + * we return -1 to the caller, to mean, in the specified range there + * is not a single "0" bit. */ + if (end_given && bit == 0 && pos == bytes*8) { + addReplyLongLong(c,-1); + return; + } + if (pos != -1) pos += start*8; /* Adjust for the bytes we skipped. 
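To make the semantics implemented above concrete (bits are numbered from 0 starting at the most significant bit of the first byte): for a key holding "\xff\xf0\x00", BITPOS key 0 returns 12 and BITPOS key 1 returns 0; for a key holding only zero bytes, BITPOS key 1 returns -1 while BITPOS key 0 returns 0; and for a key holding only "\xff" bytes with no explicit range, BITPOS key 0 returns the first bit past the end of the string, because the value is treated as zero padded on the right.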
*/ + addReplyLongLong(c,pos); + } +} diff --git a/src/cluster.c b/src/cluster.c index 383fe3704..f53bc3652 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -39,6 +39,12 @@ #include #include #include +#include + +/* A global reference to myself is handy to make code more clear. + * Myself always points to server.cluster->myself, that is, the clusterNode + * that represents this node. */ +clusterNode *myself = NULL; clusterNode *createClusterNode(char *nodename, int flags); int clusterAddNode(clusterNode *node); @@ -58,20 +64,22 @@ int clusterDelNodeSlots(clusterNode *node); int clusterNodeSetSlotBit(clusterNode *n, int slot); void clusterSetMaster(clusterNode *n); void clusterHandleSlaveFailover(void); +void clusterHandleSlaveMigration(int max_slaves); int bitmapTestBit(unsigned char *bitmap, int pos); void clusterDoBeforeSleep(int flags); void clusterSendUpdate(clusterLink *link, clusterNode *node); +void resetManualFailover(void); +void clusterCloseAllSlots(void); +void clusterSetNodeAsMaster(clusterNode *n); +void clusterDelNode(clusterNode *delnode); /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ -/* This function is called at startup in order to set the currentEpoch - * (which is not saved on permanent storage) to the greatest configEpoch found - * in the loaded nodes (configEpoch is stored on permanent storage as soon as - * it changes for some node). */ -// 设置配置纪元 -void clusterSetStartupEpoch() { +/* Return the greatest configEpoch found in the cluster. */ +uint64_t clusterGetMaxEpoch(void) { + uint64_t max = 0; dictIterator *di; dictEntry *de; @@ -79,19 +87,43 @@ void clusterSetStartupEpoch() { di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); - if (node->configEpoch > server.cluster->currentEpoch) - server.cluster->currentEpoch = node->configEpoch; + if (node->configEpoch > max) max = node->configEpoch; } dictReleaseIterator(di); + if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch; + return max; } // 载入集群配置 +/* Load the cluster config from 'filename'. + * + * If the file does not exist or is zero-length (this may happen because + * when we lock the nodes.conf file, we create a zero-length one for the + * sake of locking if it does not already exist), REDIS_ERR is returned. + * If the configuration was loaded from the file, REDIS_OK is returned. */ int clusterLoadConfig(char *filename) { FILE *fp = fopen(filename,"r"); + struct stat sb; char *line; int maxline, j; - - if (fp == NULL) return REDIS_ERR; + + if (fp == NULL) { + if (errno == ENOENT) { + return REDIS_ERR; + } else { + redisLog(REDIS_WARNING, + "Loading the cluster node config from %s: %s", + filename, strerror(errno)); + exit(1); + } + } + + /* Check if the file is zero-length: if so return REDIS_ERR to signal + * we have to write the config. */ + if (fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) { + fclose(fp); + return REDIS_ERR; + } /* Parse the file. Note that single liens of the cluster config file can * be really long as they include all the hash slots of the node. @@ -126,6 +158,25 @@ int clusterLoadConfig(char *filename) { argv = sdssplitargs(line,&argc); if (argv == NULL) goto fmterr; + /* Handle the special "vars" line. Don't pretend it is the last + * line even if it actually is when generated by Redis. 
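As persisted by clusterSaveConfig() further down, this special line looks like the following (the epoch values are illustrative):

    vars currentEpoch 5 lastVoteEpoch 0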
*/ + if (strcasecmp(argv[0],"vars") == 0) { + for (j = 1; j < argc; j += 2) { + if (strcasecmp(argv[j],"currentEpoch") == 0) { + server.cluster->currentEpoch = + strtoull(argv[j+1],NULL,10); + } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) { + server.cluster->lastVoteEpoch = + strtoull(argv[j+1],NULL,10); + } else { + redisLog(REDIS_WARNING, + "Skipping unknown cluster config variable '%s'", + argv[j]); + } + } + continue; + } + /* Create this node if it does not exist */ // 检查节点是否已经存在 n = clusterLookupNode(argv[0]); @@ -150,7 +201,7 @@ int clusterLoadConfig(char *filename) { // 这是节点本身 if (!strcasecmp(s,"myself")) { redisAssert(server.cluster->myself == NULL); - server.cluster->myself = n; + myself = server.cluster->myself = n; n->flags |= REDIS_NODE_MYSELF; // 这是一个主节点 } else if (!strcasecmp(s,"master")) { @@ -264,15 +315,19 @@ int clusterLoadConfig(char *filename) { /* Config sanity check */ redisAssert(server.cluster->myself != NULL); - redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s", - server.cluster->myself->name); - // 设置配置纪元 - clusterSetStartupEpoch(); + redisLog(REDIS_NOTICE,"Node configuration loaded, I'm %.40s", myself->name); + /* Something that should never happen: currentEpoch smaller than + * the max epoch found in the nodes configuration. However we handle this + * as some form of protection against manual editing of critical files. */ + if (clusterGetMaxEpoch() > server.cluster->currentEpoch) { + server.cluster->currentEpoch = clusterGetMaxEpoch(); + } return REDIS_OK; fmterr: - redisLog(REDIS_WARNING,"Unrecoverable error: corrupted cluster config file."); + redisLog(REDIS_WARNING, + "Unrecoverable error: corrupted cluster config file."); fclose(fp); exit(1); } @@ -291,11 +346,21 @@ int clusterLoadConfig(char *filename) { * the file afterward. */ // 写入 nodes.conf 文件 int clusterSaveConfig(int do_fsync) { - sds ci = clusterGenNodesDescription(REDIS_NODE_HANDSHAKE); - size_t content_size = sdslen(ci); + sds ci; + size_t content_size; struct stat sb; int fd; - + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; + + /* Get the nodes description and concatenate our "vars" directive to + * save currentEpoch and lastVoteEpoch. */ + ci = clusterGenNodesDescription(REDIS_NODE_HANDSHAKE); + ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n", + (unsigned long long) server.cluster->currentEpoch, + (unsigned long long) server.cluster->lastVoteEpoch); + content_size = sdslen(ci); + if ((fd = open(server.cluster_configfile,O_WRONLY|O_CREAT,0644)) == -1) goto err; @@ -307,7 +372,10 @@ int clusterSaveConfig(int do_fsync) { } } if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err; - if (do_fsync) fsync(fd); + if (do_fsync) { + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; + fsync(fd); + } /* Truncate the file if needed to remove the final \n padding that * is just garbage. */ @@ -332,6 +400,46 @@ void clusterSaveConfigOrDie(int do_fsync) { } } +/* Lock the cluster config using flock(), and leaks the file descritor used to + * acquire the lock so that the file will be locked forever. + * + * This works because we always update nodes.conf with a new version + * in-place, reopening the file, and writing to it in place (later adjusting + * the length with ftruncate()). + * + * On success REDIS_OK is returned, otherwise an error is logged and + * the function returns REDIS_ERR to signal a lock was not acquired. 
*/ +int clusterLockConfig(char *filename) { + /* To lock it, we need to open the file in a way it is created if + * it does not exist, otherwise there is a race condition with other + * processes. */ + int fd = open(filename,O_WRONLY|O_CREAT,0644); + if (fd == -1) { + redisLog(REDIS_WARNING, + "Can't open %s in order to acquire a lock: %s", + filename, strerror(errno)); + return REDIS_ERR; + } + + if (flock(fd,LOCK_EX|LOCK_NB) == -1) { + if (errno == EWOULDBLOCK) { + redisLog(REDIS_WARNING, + "Sorry, the cluster configuration file %s is already used " + "by a different Redis Cluster node. Please make sure that " + "different nodes use different cluster configuration " + "files.", filename); + } else { + redisLog(REDIS_WARNING, + "Impossible to lock %s: %s", filename, strerror(errno)); + } + close(fd); + return REDIS_ERR; + } + /* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the + * lock to the file as long as the process exists. */ + return REDIS_OK; +} + // 初始化集群 void clusterInit(void) { int saveconf = 0; @@ -348,30 +456,28 @@ void clusterInit(void) { dictCreate(&clusterNodesBlackListDictType,NULL); server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_rank = 0; server.cluster->failover_auth_epoch = 0; - server.cluster->last_vote_epoch = 0; + server.cluster->lastVoteEpoch = 0; server.cluster->stats_bus_messages_sent = 0; server.cluster->stats_bus_messages_received = 0; - memset(server.cluster->migrating_slots_to,0, - sizeof(server.cluster->migrating_slots_to)); - memset(server.cluster->importing_slots_from,0, - sizeof(server.cluster->importing_slots_from)); - memset(server.cluster->slots,0, - sizeof(server.cluster->slots)); + memset(server.cluster->slots,0, sizeof(server.cluster->slots)); + clusterCloseAllSlots(); + + /* Lock the cluster config file to make sure every node uses + * its own nodes.conf. */ + if (clusterLockConfig(server.cluster_configfile) == REDIS_ERR) + exit(1); - // 载入 nodes.conf 配置文件 + /* Load or create a new nodes configuration. */ if (clusterLoadConfig(server.cluster_configfile) == REDIS_ERR) { /* No configuration found. We will just use the random name provided * by the createClusterNode() function. */ - // 未载入到配置文件,为节点创建一个随机名字 - server.cluster->myself = + myself = server.cluster->myself = createClusterNode(NULL,REDIS_NODE_MYSELF|REDIS_NODE_MASTER); redisLog(REDIS_NOTICE,"No cluster configuration found, I'm %.40s", - server.cluster->myself->name); - - // 将节点添加到集群中 - clusterAddNode(server.cluster->myself); - + myself->name); + clusterAddNode(myself); saveconf = 1; } @@ -381,6 +487,19 @@ void clusterInit(void) { /* We need a listening TCP port for our cluster messaging needs. */ // 监听 TCP 端口 server.cfd_count = 0; + + /* Port sanity check II + * The other handshake port check is triggered too late to stop + * us from trying to use a too-high cluster port number. */ + if (server.port > (65535-REDIS_CLUSTER_PORT_INCR)) { + redisLog(REDIS_WARNING, "Redis port number too high. " + "Cluster communication port is 10,000 port " + "numbers higher than your Redis port. " + "Your Redis port number must be " + "lower than 55535."); + exit(1); + } + if (listenToPort(server.port+REDIS_CLUSTER_PORT_INCR, server.cfd,&server.cfd_count) == REDIS_ERR) { @@ -400,6 +519,66 @@ void clusterInit(void) { /* The slots -> keys map is a sorted set. Init it. 
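Regarding the port sanity check above: the cluster bus always uses the base port plus 10000 (REDIS_CLUSTER_PORT_INCR), so a node listening on 6379 talks cluster gossip on 16379, and since 65535 - 10000 = 55535, any base port above 55535 is rejected at startup.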
*/ // slots -> keys 映射是一个有序集合 server.cluster->slots_to_keys = zslCreate(); + resetManualFailover(); +} + +/* Reset a node performing a soft or hard reset: + * + * 1) All other nodes are forget. + * 2) All the assigned / open slots are released. + * 3) If the node is a slave, it turns into a master. + * 5) Only for hard reset: a new Node ID is generated. + * 6) Only for hard reset: currentEpoch and configEpoch are set to 0. + * 7) The new configuration is saved and the cluster state updated. */ +void clusterReset(int hard) { + dictIterator *di; + dictEntry *de; + int j; + + /* Turn into master. */ + if (nodeIsSlave(myself)) { + clusterSetNodeAsMaster(myself); + replicationUnsetMaster(); + } + + /* Close slots, reset manual failover state. */ + clusterCloseAllSlots(); + resetManualFailover(); + + /* Unassign all the slots. */ + for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) clusterDelSlot(j); + + /* Forget all the nodes, but myself. */ + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + if (node == myself) continue; + clusterDelNode(node); + } + dictReleaseIterator(di); + + /* Hard reset only: set epochs to 0, change node ID. */ + if (hard) { + sds oldname; + + server.cluster->currentEpoch = 0; + server.cluster->lastVoteEpoch = 0; + myself->configEpoch = 0; + + /* To change the Node ID we need to remove the old name from the + * nodes table, change the ID, and re-add back with new name. */ + oldname = sdsnewlen(myself->name, REDIS_CLUSTER_NAMELEN); + dictDelete(server.cluster->nodes,oldname); + sdsfree(oldname); + getRandomHexChars(myself->name, REDIS_CLUSTER_NAMELEN); + clusterAddNode(myself); + } + + /* Make sure to persist the new config and update the state. */ + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_FSYNC_CONFIG); } /* ----------------------------------------------------------------------------- @@ -446,35 +625,42 @@ void freeClusterLink(clusterLink *link) { } // 监听事件处理器 +#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000 void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { int cport, cfd; + int max = MAX_CLUSTER_ACCEPTS_PER_CALL; char cip[REDIS_IP_STR_LEN]; clusterLink *link; REDIS_NOTUSED(el); REDIS_NOTUSED(mask); REDIS_NOTUSED(privdata); - // accept 连接 - cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); - if (cfd == AE_ERR) { - redisLog(REDIS_VERBOSE,"Accepting cluster node: %s", server.neterr); - return; + /* If the server is starting up, don't accept cluster connections: + * UPDATE messages may interact with the database content. */ + if (server.masterhost == NULL && server.loading) return; + + while(max--) { + cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport); + if (cfd == ANET_ERR) { + if (errno != EWOULDBLOCK) + redisLog(REDIS_VERBOSE, + "Accepting cluster node: %s", server.neterr); + return; + } + anetNonBlock(NULL,cfd); + anetEnableTcpNoDelay(NULL,cfd); + + /* Use non-blocking I/O for cluster messages. */ + redisLog(REDIS_VERBOSE,"Accepted cluster node %s:%d", cip, cport); + /* Create a link object we use to handle the connection. + * It gets passed to the readable handler when data is available. + * Initiallly the link->node pointer is set to NULL as we don't know + * which node is, but the right node is references once we know the + * node identity. 
*/ + link = createClusterLink(NULL); + link->fd = cfd; + aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link); } - anetNonBlock(NULL,cfd); - anetEnableTcpNoDelay(NULL,cfd); - - /* Use non-blocking I/O for cluster messages. */ - /* IPV6: might want to wrap a v6 address in [] */ - redisLog(REDIS_VERBOSE,"Accepted cluster node %s:%d", cip, cport); - /* We need to create a temporary node in order to read the incoming - * packet in a valid contest. This node will be released once we - * read the packet and reply. */ - // 创建一个临时节点,并将其用于测试连接是否正常 - // 一旦连接测试完成,这个临时节点就会被释放 - link = createClusterLink(NULL); - link->fd = cfd; - // 关联读事件 - aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link); } /* ----------------------------------------------------------------------------- @@ -482,10 +668,31 @@ void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) { * -------------------------------------------------------------------------- */ /* We have 16384 hash slots. The hash slot of a given key is obtained - * as the least significant 14 bits of the crc16 of the key. */ + * as the least significant 14 bits of the crc16 of the key. + * + * However if the key contains the {...} pattern, only the part between + * { and } is hashed. This may be useful in the future to force certain + * keys to be in the same node (assuming no resharding is in progress). */ // 计算给定键应该被分配到那个槽 unsigned int keyHashSlot(char *key, int keylen) { - return crc16(key,keylen) & 0x3FFF; + int s, e; /* start-end indexes of { and } */ + + for (s = 0; s < keylen; s++) + if (key[s] == '{') break; + + /* No '{' ? Hash the whole key. This is the base case. */ + if (s == keylen) return crc16(key,keylen) & 0x3FFF; + + /* '{' found? Check if we have the corresponding '}'. */ + for (e = s+1; e < keylen; e++) + if (key[e] == '}') break; + + /* No '}' or nothing betweeen {} ? Hash the whole key. */ + if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF; + + /* If we are here there is both a { and a } on its right. Hash + * what is in the middle between { and }. */ + return crc16(key+s+1,e-s-1) & 0x3FFF; } /* ----------------------------------------------------------------------------- @@ -745,6 +952,14 @@ void clusterNodeResetSlaves(clusterNode *n) { n->slaves = NULL; } +int clusterCountNonFailingSlaves(clusterNode *n) { + int j, okslaves = 0; + + for (j = 0; j < n->numslaves; j++) + if (!nodeFailed(n->slaves[j])) okslaves++; + return okslaves; +} + // 释放节点 void freeClusterNode(clusterNode *n) { sds nodename; @@ -772,7 +987,6 @@ void freeClusterNode(clusterNode *n) { // 将给定 node 添加到节点表里面 int clusterAddNode(clusterNode *node) { int retval; - // 将 node 添加到当前节点的 nodes 表中 // 这样接下来当前节点就会创建连向 node 的节点 retval = dictAdd(server.cluster->nodes, @@ -824,7 +1038,12 @@ void clusterDelNode(clusterNode *delnode) { } dictReleaseIterator(di); - /* 3) Free the node, unlinking it from the cluster. */ + /* 3) Remove this node from its master's slaves if needed. */ + // 将节点从它的主节点的从节点列表中移除 + if (nodeIsSlave(delnode) && delnode->slaveof) + clusterNodeRemoveSlave(delnode->slaveof,delnode); + + /* 4) Free the node, unlinking it from the cluster. 
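The hash tag rule in keyHashSlot() above is easiest to see with an example; a sketch using the function's prototype as defined in cluster.c (the key names are illustrative):

    #include <string.h>

    unsigned int keyHashSlot(char *key, int keylen);   /* from cluster.c */

    void hashTagExample(void) {
        unsigned int a = keyHashSlot((char*)"{user1000}.following",
                                     (int)strlen("{user1000}.following"));
        unsigned int b = keyHashSlot((char*)"{user1000}.followers",
                                     (int)strlen("{user1000}.followers"));
        /* a == b: only "user1000" is hashed for both keys, so they always
         * map to the same slot. A key like "foo{}bar" has nothing between
         * the braces, so the whole key is hashed instead. */
    }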
*/ // 释放节点 freeClusterNode(delnode); } @@ -833,7 +1052,7 @@ void clusterDelNode(clusterNode *delnode) { // 根据名字,查找给定的节点 clusterNode *clusterLookupNode(char *name) { sds s = sdsnewlen(name, REDIS_CLUSTER_NAMELEN); - struct dictEntry *de; + dictEntry *de; de = dictFind(server.cluster->nodes,s); sdsfree(s); @@ -855,7 +1074,7 @@ clusterNode *clusterLookupNode(char *name) { void clusterRenameNode(clusterNode *node, char *newname) { int retval; sds s = sdsnewlen(node->name, REDIS_CLUSTER_NAMELEN); - + redisLog(REDIS_DEBUG,"Renaming node %.40s into %.40s", node->name, newname); retval = dictDelete(server.cluster->nodes, s); @@ -1035,20 +1254,15 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { // 标记为 FAIL 所需的节点数量,需要超过集群节点数量的一半 int needed_quorum = (server.cluster->size / 2) + 1; - // 不能对未进入 PFAIL 状态的节点标记 FAIL 状态 - if (!(node->flags & REDIS_NODE_PFAIL)) return; /* We can reach it. */ - - // 节点已经是 FAIL 状态的了 - if (node->flags & REDIS_NODE_FAIL) return; /* Already FAILing. */ + if (!nodeTimedOut(node)) return; /* We can reach it. */ + if (nodeFailed(node)) return; /* Already FAILing. */ // 统计将 node 标记为 PFAIL 或者 FAIL 的节点数量(不包括当前节点) failures = clusterNodeFailureReportsCount(node); /* Also count myself as a voter if I'm a master. */ // 如果当前节点是主节点,那么将当前节点也算在 failures 之内 - if (server.cluster->myself->flags & REDIS_NODE_MASTER) - failures += 1; - + if (nodeIsMaster(myself)) failures++; // 报告下线节点的数量不足节点总数的一半,不能将节点判断为 FAIL ,返回 if (failures < needed_quorum) return; /* No weak agreement from masters. */ @@ -1065,9 +1279,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { * reachable nodes to flag the node as FAIL. */ // 如果当前节点是主节点的话,那么向其他节点发送报告 node 的 FAIL 信息 // 让其他节点也将 node 标记为 FAIL - if (server.cluster->myself->flags & REDIS_NODE_MASTER) - clusterSendFail(node->name); - + if (nodeIsMaster(myself)) clusterSendFail(node->name); clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); } @@ -1081,16 +1293,16 @@ void markNodeAsFailingIfNeeded(clusterNode *node) { void clearNodeFailureIfNeeded(clusterNode *node) { mstime_t now = mstime(); - redisAssert(node->flags & REDIS_NODE_FAIL); + redisAssert(nodeFailed(node)); /* For slaves we always clear the FAIL flag if we can contact the * node again. */ // 如果 FAIL 的是从节点,那么当前节点会直接移除该节点的 FAIL - if (node->flags & REDIS_NODE_SLAVE) { + if (nodeIsSlave(node) || node->numslots == 0) { redisLog(REDIS_NOTICE, - "Clear FAIL state for node %.40s: slave is reachable again.", - node->name); - + "Clear FAIL state for node %.40s: %s is reachable again.", + node->name, + nodeIsSlave(node) ? 
"slave" : "master without slots"); // 移除 node->flags &= ~REDIS_NODE_FAIL; @@ -1111,8 +1323,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { * * 那么说明 FAIL 节点仍然有槽没有迁移完,那么当前节点移除该节点的 FAIL 标识。 */ - if (node->flags & REDIS_NODE_MASTER && - node->numslots > 0 && + if (nodeIsMaster(node) && node->numslots > 0 && (now - node->fail_time) > (server.cluster_node_timeout * REDIS_CLUSTER_FAIL_UNDO_TIME_MULT)) { @@ -1146,7 +1357,7 @@ int clusterHandshakeInProgress(char *ip, int port) { clusterNode *node = dictGetVal(de); // 跳过非握手状态的节点,之后剩下的都是正在握手的节点 - if (!(node->flags & REDIS_NODE_HANDSHAKE)) continue; + if (!nodeInHandshake(node)) continue; // 给定 ip 和 port 的节点正在进行握手 if (!strcasecmp(node->ip,ip) && node->port == port) break; @@ -1203,11 +1414,11 @@ int clusterStartHandshake(char *ip, int port) { if (sa.ss_family == AF_INET) inet_ntop(AF_INET, (void*)&(((struct sockaddr_in *)&sa)->sin_addr), - norm_ip,REDIS_CLUSTER_IPLEN); + norm_ip,REDIS_IP_STR_LEN); else inet_ntop(AF_INET6, (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr), - norm_ip,REDIS_CLUSTER_IPLEN); + norm_ip,REDIS_IP_STR_LEN); // 检查节点是否已经发送握手请求,如果是的话,那么直接返回,防止出现重复握手 if (clusterHandshakeInProgress(norm_ip,port)) { @@ -1287,12 +1498,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { if (node) { /* We already know this node. Handle failure reports, only when the sender is a master. */ - // 如果 sender 是一个主节点,那么我们需要处理下线报告 - if (sender && sender->flags & REDIS_NODE_MASTER && - node != server.cluster->myself) - { - + if (sender && nodeIsMaster(sender) && node != myself) { // 节点处于 FAIL 或者 PFAIL 状态 if (flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) { @@ -1365,22 +1572,9 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { /* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. */ // 将 ip 转换为字符串 void nodeIp2String(char *buf, clusterLink *link) { - struct sockaddr_storage sa; - socklen_t salen = sizeof(sa); - - if (getpeername(link->fd, (struct sockaddr*) &sa, &salen) == -1) - redisPanic("getpeername() failed."); - - if (sa.ss_family == AF_INET) { - struct sockaddr_in *s = (struct sockaddr_in *)&sa; - inet_ntop(AF_INET,(void*)&(s->sin_addr),buf,REDIS_CLUSTER_IPLEN); - } else { - struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa; - inet_ntop(AF_INET6,(void*)&(s->sin6_addr),buf,REDIS_CLUSTER_IPLEN); - } + anetPeerToString(link->fd, buf, REDIS_IP_STR_LEN, NULL); } - /* Update the node address to the IP address that can be extracted * from link->fd, and at the specified port. * @@ -1431,11 +1625,8 @@ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { /* Check if this is our master and we have to change the * replication target as well. 
*/ // 如果连接来自当前节点(从节点)的主节点,那么根据新地址设置复制对象 - if (server.cluster->myself->flags & REDIS_NODE_SLAVE && - server.cluster->myself->slaveof == node) - { + if (nodeIsSlave(myself) && myself->slaveof == node) replicationSetMaster(node->ip, node->port); - } return 1; } @@ -1448,7 +1639,7 @@ int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) { void clusterSetNodeAsMaster(clusterNode *n) { // 已经是主节点了。 - if (n->flags & REDIS_NODE_MASTER) return; + if (nodeIsMaster(n)) return; // 移除 slaveof if (n->slaveof) clusterNodeRemoveSlave(n->slaveof,n); @@ -1490,11 +1681,18 @@ void clusterSetNodeAsMaster(clusterNode *n) { * * 根据情况, sender 参数可以是消息的发送者,也可以是消息发送者的主节点。 */ -void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, - unsigned char *slots) -{ +void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) { int j; clusterNode *curmaster, *newmaster = NULL; + /* The dirty slots list is a list of slots for which we lose the ownership + * while having still keys inside. This usually happens after a failover + * or after a manual cluster reconfiguration operated by the admin. + * + * If the update message is not able to demote a master to slave (in this + * case we'll resync with the master updating the whole key space), we + * need to delete all the keys in the slots we lost ownership. */ + uint16_t dirty_slots[REDIS_CLUSTER_SLOTS]; + int dirty_slots_count = 0; /* Here we set curmaster to this node or the node this node * replicates to if it's a slave. In the for loop we are @@ -1502,30 +1700,44 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc // 1)如果当前节点是主节点,那么将 curmaster 设置为当前节点 // 2)如果当前节点是从节点,那么将 curmaster 设置为当前节点正在复制的主节点 // 稍后在 for 循环中我们将使用 curmaster 检查与当前节点有关的槽是否发生了变动 - if (server.cluster->myself->flags & REDIS_NODE_MASTER) - curmaster = server.cluster->myself; - else - curmaster = server.cluster->myself->slaveof; + curmaster = nodeIsMaster(myself) ? myself : myself->slaveof; + + if (sender == myself) { + redisLog(REDIS_WARNING,"Discarding UPDATE message about myself."); + return; + } // 更新槽布局 for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) { // 如果 slots 中的槽 j 已经被指派,那么执行以下代码 if (bitmapTestBit(slots,j)) { - /* We rebind the slot to the new node claiming it if: - * 1) The slot was unassigned. - * 2) The new node claims it with a greater configEpoch. */ - - // 槽 j 已经指派给 sender 了,略过 + /* The slot is already bound to the sender of this message. */ if (server.cluster->slots[j] == sender) continue; - // 槽 j 未指派 - // 或者当前槽 j 指派的节点的配置纪元比 sender 的配置纪元要低(可能发生了自动故障转移) - // 那么更新槽 j 的指派节点 + /* The slot is in importing state, it should be modified only + * manually via redis-trib (example: a resharding is in progress + * and the migrating side slot was already closed and is advertising + * a new config. We still want the slot to be closed manually). */ + if (server.cluster->importing_slots_from[j]) continue; + + /* We rebind the slot to the new node claiming it if: + * 1) The slot was unassigned or the new node claims it with a + * greater configEpoch. + * 2) We are not currently importing the slot. */ if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->configEpoch < - senderConfigEpoch) + server.cluster->slots[j]->configEpoch < senderConfigEpoch) { + /* Was this slot mine, and still contains keys? Mark it as + * a dirty slot. 
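+     *
+     * A rough, hypothetical illustration (slot number invented): suppose
+     * we served slot 1234, it still holds keys, and this UPDATE rebinds
+     * it to a node advertising a greater configEpoch. The slot index is
+     * then queued in dirty_slots; unless the same message ends up
+     * demoting us to a slave (in which case the full resync rebuilds the
+     * keyspace anyway), delKeysInSlot() is called for it at the end of
+     * this function.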
*/ + if (server.cluster->slots[j] == myself && + countKeysInSlot(j) && + sender != myself) + { + dirty_slots[dirty_slots_count] = j; + dirty_slots_count++; + } + // 负责槽 j 的原节点是当前节点的主节点? // 如果是的话,说明故障转移发生了,将当前节点的复制对象设置为新的主节点 if (server.cluster->slots[j] == curmaster) @@ -1563,17 +1775,91 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * 这时应该将当前节点设置为新主节点的从节点。 */ if (newmaster && curmaster->numslots == 0) { - redisLog(REDIS_WARNING,"Configuration change detected. Reconfiguring myself as a replica of %.40s", sender->name); - + redisLog(REDIS_WARNING, + "Configuration change detected. Reconfiguring myself " + "as a replica of %.40s", sender->name); // 将 sender 设置为当前节点的主节点 clusterSetMaster(sender); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE| CLUSTER_TODO_FSYNC_CONFIG); + } else if (dirty_slots_count) { + /* If we are here, we received an update message which removed + * ownership for certain slots we still have keys about, but still + * we are serving some slots, so this master node was not demoted to + * a slave. + * + * In order to maintain a consistent state between keys and slots + * we need to remove all the keys from the slots we lost. */ + for (j = 0; j < dirty_slots_count; j++) + delKeysInSlot(dirty_slots[j]); } } +/* This function is called when this node is a master, and we receive from + * another master a configuration epoch that is equal to our configuration + * epoch. + * + * BACKGROUND + * + * It is not possible that different slaves get the same config + * epoch during a failover election, because the slaves need to get voted + * by a majority. However when we perform a manual resharding of the cluster + * the node will assign a configuration epoch to itself without to ask + * for agreement. Usually resharding happens when the cluster is working well + * and is supervised by the sysadmin, however it is possible for a failover + * to happen exactly while the node we are resharding a slot to assigns itself + * a new configuration epoch, but before it is able to propagate it. + * + * So technically it is possible in this condition that two nodes end with + * the same configuration epoch. + * + * Another possibility is that there are bugs in the implementation causing + * this to happen. + * + * Moreover when a new cluster is created, all the nodes start with the same + * configEpoch. This collision resolution code allows nodes to automatically + * end with a different configEpoch at startup automatically. + * + * In all the cases, we want a mechanism that resolves this issue automatically + * as a safeguard. The same configuration epoch for masters serving different + * set of slots is not harmful, but it is if the nodes end serving the same + * slots for some reason (manual errors or software bugs) without a proper + * failover procedure. + * + * In general we want a system that eventually always ends with different + * masters having different configuration epochs whatever happened, since + * nothign is worse than a split-brain condition in a distributed system. + * + * BEHAVIOR + * + * When this function gets called, what happens is that if this node + * has the lexicographically smaller Node ID compared to the other node + * with the conflicting epoch (the 'sender' node), it will assign itself + * the greatest configuration epoch currently detected among nodes plus 1. 
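+ *
+ * A small invented example: masters A and B both claim configEpoch 5
+ * while the cluster currentEpoch is also 5. When A, whose Node ID sorts
+ * before B's, processes a packet from B, it bumps currentEpoch to 6 and
+ * adopts 6 as its own configEpoch; B's configEpoch stays at 5 and B does
+ * nothing.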
+ * + * This means that even if there are multiple nodes colliding, the node + * with the greatest Node ID never moves forward, so eventually all the nodes + * end with a different configuration epoch. + */ +void clusterHandleConfigEpochCollision(clusterNode *sender) { + /* Prerequisites: nodes have the same configEpoch and are both masters. */ + if (sender->configEpoch != myself->configEpoch || + !nodeIsMaster(sender) || !nodeIsMaster(myself)) return; + /* Don't act if the colliding node has a smaller Node ID. */ + if (memcmp(sender->name,myself->name,REDIS_CLUSTER_NAMELEN) <= 0) return; + /* Get the next ID available at the best of this node knowledge. */ + server.cluster->currentEpoch++; + myself->configEpoch = server.cluster->currentEpoch; + clusterSaveConfigOrDie(1); + redisLog(REDIS_VERBOSE, + "WARNING: configEpoch collision with node %.40s." + " Updating my configEpoch to %llu", + sender->name, + (unsigned long long) myself->configEpoch); +} + /* When this function is called, there is a packet to process starting * at node->rcvbuf. Releasing the buffer is up to the caller, so this * function should just handle the higher level stuff of processing the @@ -1617,7 +1903,8 @@ int clusterProcessPacket(clusterLink *link) { /* Perform sanity checks */ // 合法性检查 - if (totlen < 8) return 1; + if (totlen < 16) return 1; /* At least signature, version, totlen, count. */ + if (ntohs(hdr->ver) != 0) return 1; /* Can't handle versions other than 0.*/ if (totlen > sdslen(link->rcvbuf)) return 1; if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) @@ -1641,7 +1928,9 @@ int clusterProcessPacket(clusterLink *link) { ntohl(hdr->data.publish.msg.message_len); if (totlen != explen) return 1; } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST || - type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) { + type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK || + type == CLUSTERMSG_TYPE_MFSTART) + { uint32_t explen = sizeof(clusterMsg)-sizeof(union clusterMsgData); if (totlen != explen) return 1; @@ -1655,10 +1944,9 @@ int clusterProcessPacket(clusterLink *link) { /* Check if the sender is a known node. */ // 查找发送者节点 sender = clusterLookupNode(hdr->sender); - // 节点存在,并且不是 HANDSHAKE 节点 // 那么个更新节点的配置纪元信息 - if (sender && !(sender->flags & REDIS_NODE_HANDSHAKE)) { + if (sender && !nodeInHandshake(sender)) { /* Update our curretEpoch if we see a newer epoch in the cluster. */ senderCurrentEpoch = ntohu64(hdr->currentEpoch); senderConfigEpoch = ntohu64(hdr->configEpoch); @@ -1667,7 +1955,25 @@ int clusterProcessPacket(clusterLink *link) { /* Update the sender configEpoch if it is publishing a newer one. */ if (senderConfigEpoch > sender->configEpoch) { sender->configEpoch = senderConfigEpoch; - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); + } + /* Update the replication offset info for this node. */ + sender->repl_offset = ntohu64(hdr->offset); + sender->repl_offset_time = mstime(); + /* If we are a slave performing a manual failover and our master + * sent its offset while already paused, populate the MF state. 
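+             *
+             * This is the slave-side half of the manual failover
+             * handshake: the paused master keeps pinging us with
+             * CLUSTERMSG_FLAG0_PAUSED set, we record its offset in
+             * mf_master_offset, and clusterHandleManualFailover() later
+             * sets mf_can_start once replicationGetSlaveOffset() reaches
+             * that offset.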
*/ + if (server.cluster->mf_end && + nodeIsSlave(myself) && + myself->slaveof == sender && + hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED && + server.cluster->mf_master_offset == 0) + { + server.cluster->mf_master_offset = sender->repl_offset; + redisLog(REDIS_WARNING, + "Received replication offset for paused " + "master manual failover: %lld", + server.cluster->mf_master_offset); } } @@ -1727,15 +2033,14 @@ int clusterProcessPacket(clusterLink *link) { // 连接的 clusterNode 结构存在 if (link->node) { - // 节点处于 HANDSHAKE 状态 - if (link->node->flags & REDIS_NODE_HANDSHAKE) { - + if (nodeInHandshake(link->node)) { /* If we already have this node, try to change the * IP/port of the node with the new one. */ if (sender) { - redisLog(REDIS_WARNING, - "Handshake error: we already know node %.40s, updating the address if needed.", sender->name); + redisLog(REDIS_VERBOSE, + "Handshake: we already know node %.40s, " + "updating the address if needed.", sender->name); // 如果有需要的话,更新节点的地址 if (nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port))) { @@ -1791,10 +2096,11 @@ int clusterProcessPacket(clusterLink *link) { // 并且发送者不在 HANDSHAKE 状态 // 那么更新发送者的信息 if (sender && type == CLUSTERMSG_TYPE_PING && - !(sender->flags & REDIS_NODE_HANDSHAKE) && + !nodeInHandshake(sender) && nodeUpdateAddressIfNeeded(sender,link,ntohs(hdr->port))) { - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); } /* Update our info about the node */ @@ -1819,14 +2125,13 @@ int clusterProcessPacket(clusterLink *link) { * 如果节点的状态为 FAIL , * 那么是否撤销该状态要根据 clearNodeFailureIfNeeded() 函数来决定。 */ - if (link->node->flags & REDIS_NODE_PFAIL) { - + if (nodeTimedOut(link->node)) { // 撤销 PFAIL link->node->flags &= ~REDIS_NODE_PFAIL; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| CLUSTER_TODO_UPDATE_STATE); - } else if (link->node->flags & REDIS_NODE_FAIL) { + } else if (nodeFailed(link->node)) { // 看是否可以撤销 FAIL clearNodeFailureIfNeeded(link->node); } @@ -1853,8 +2158,7 @@ int clusterProcessPacket(clusterLink *link) { clusterNode *master = clusterLookupNode(hdr->slaveof); // sender 由主节点变成了从节点,重新配置 sender - if (sender->flags & REDIS_NODE_MASTER) { - + if (nodeIsMaster(sender)) { /* Master turned into a slave! Reconfigure the node. */ // 删除所有由该节点负责的槽 @@ -1874,9 +2178,9 @@ int clusterProcessPacket(clusterLink *link) { } /* Master node changed for this slave? */ - // 检查 sender 的主节点是否变更 - if (sender->slaveof != master) { + // 检查 sender 的主节点是否变更 + if (master && sender->slaveof != master) { // 如果 sender 之前的主节点不是现在的主节点 // 那么在旧主节点的从节点列表中移除 sender if (sender->slaveof) @@ -1913,8 +2217,7 @@ int clusterProcessPacket(clusterLink *link) { int dirty_slots = 0; /* Sender claimed slots don't match my view? */ if (sender) { - sender_master = (sender->flags & REDIS_NODE_MASTER) ? - sender : sender->slaveof; + sender_master = nodeIsMaster(sender) ? sender : sender->slaveof; if (sender_master) { dirty_slots = memcmp(sender_master->slots, hdr->myslots,sizeof(hdr->myslots)) != 0; @@ -1926,9 +2229,8 @@ int clusterProcessPacket(clusterLink *link) { * need to update our configuration. 
*/ // 如果 sender 是主节点,并且 sender 的槽布局出现了变动 // 那么检查当前节点对 sender 的槽布局设置,看是否需要进行更新 - if (sender && sender->flags & REDIS_NODE_MASTER && dirty_slots) { + if (sender && nodeIsMaster(sender) && dirty_slots) clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots); - } /* 2) We also check for the reverse condition, that is, the sender * claims to serve slots we know are served by a master with a @@ -1987,13 +2289,14 @@ int clusterProcessPacket(clusterLink *link) { if (server.cluster->slots[j]->configEpoch > senderConfigEpoch) { - redisLog(REDIS_WARNING, + redisLog(REDIS_VERBOSE, "Node %.40s has old slots configuration, sending " "an UPDATE message about %.40s", sender->name, server.cluster->slots[j]->name); // 向 sender 发送关于槽 j 的更新信息 - clusterSendUpdate(sender->link,server.cluster->slots[j]); + clusterSendUpdate(sender->link, + server.cluster->slots[j]); /* TODO: instead of exiting the loop send every other * UPDATE packet for other nodes that are the new owner @@ -2004,6 +2307,15 @@ int clusterProcessPacket(clusterLink *link) { } } + /* If our config epoch collides with the sender's try to fix + * the problem. */ + if (sender && + nodeIsMaster(myself) && nodeIsMaster(sender) && + senderConfigEpoch == myself->configEpoch) + { + clusterHandleConfigEpochCollision(sender); + } + /* Get info from the gossip section */ // 分析并提取出消息 gossip 协议部分的信息 clusterProcessGossipSection(hdr,link); @@ -2029,8 +2341,8 @@ int clusterProcessPacket(clusterLink *link) { failing->fail_time = mstime(); // 关闭 PFAIL 状态 failing->flags &= ~REDIS_NODE_PFAIL; - - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_UPDATE_STATE); } } else { redisLog(REDIS_NOTICE, @@ -2088,8 +2400,7 @@ int clusterProcessPacket(clusterLink *link) { // 1) sender 是主节点 // 2) sender 正在处理至少一个槽 // 3) sender 的配置纪元大于等于当前节点的配置纪元 - if (sender->flags & REDIS_NODE_MASTER && - sender->numslots > 0 && + if (nodeIsMaster(sender) && sender->numslots > 0 && senderCurrentEpoch >= server.cluster->failover_auth_epoch) { // 增加支持票数 @@ -2100,12 +2411,22 @@ int clusterProcessPacket(clusterLink *link) { clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } - // 这是一条更新消息: sender 告知当前节点,当前节点需要更新某个节点的槽布局 + } else if (type == CLUSTERMSG_TYPE_MFSTART) { + /* This message is acceptable only if I'm a master and the sender + * is one of my slaves. */ + if (!sender || sender->slaveof != myself) return 1; + /* Manual failover requested from slaves. Initialize the state + * accordingly. */ + resetManualFailover(); + server.cluster->mf_end = mstime() + REDIS_CLUSTER_MF_TIMEOUT; + server.cluster->mf_slave = sender; + pauseClients(mstime()+(REDIS_CLUSTER_MF_TIMEOUT*2)); + redisLog(REDIS_WARNING,"Manual failover requested by slave %.40s.", + sender->name); } else if (type == CLUSTERMSG_TYPE_UPDATE) { clusterNode *n; /* The node the update is about. */ - - // 消息中的配置纪元 - uint64_t reportedConfigEpoch = ntohu64(hdr->data.update.nodecfg.configEpoch); + uint64_t reportedConfigEpoch = + ntohu64(hdr->data.update.nodecfg.configEpoch); if (!sender) return 1; /* We don't know the sender. */ @@ -2120,7 +2441,12 @@ int clusterProcessPacket(clusterLink *link) { /* If in our current config the node is a slave, set it as a master. */ // 如果节点 n 为从节点,但它的槽配置更新了 // 那么说明这个节点已经变为主节点,将它设置为主节点 - if (n->flags & REDIS_NODE_SLAVE) clusterSetNodeAsMaster(n); + if (nodeIsSlave(n)) clusterSetNodeAsMaster(n); + + /* Update the node's configEpoch. 
*/ + n->configEpoch = reportedConfigEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG| + CLUSTER_TODO_FSYNC_CONFIG); /* Check the bitmap of served slots and udpate our * config accordingly. */ @@ -2142,7 +2468,7 @@ int clusterProcessPacket(clusterLink *link) { this connection and will try to get it connected again. 我们将节点的状态设置为断开状态,Cluster Cron 会根据该状态尝试重新连接节点。 - + Instead if the node is a temporary node used to accept a query, we completely free the node on error. @@ -2206,23 +2532,24 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { // 检查输入缓冲区的长度 rcvbuflen = sdslen(link->rcvbuf); - - // 头信息(4字节)未读入完 - if (rcvbuflen < 4) { - /* First, obtain the first four bytes to get the full message + // 头信息(8 字节)未读入完 + if (rcvbuflen < 8) { + /* First, obtain the first 8 bytes to get the full message * length. */ - readlen = 4 - rcvbuflen; - - // 已读入完整的头信息 + readlen = 8 - rcvbuflen; + // 已读入完整的信息 } else { /* Finally read the full message. */ hdr = (clusterMsg*) link->rcvbuf; - if (rcvbuflen == 4) { - /* Perform some sanity check on the message length. */ - // 检查信息长度是否在合理范围 - if (ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) { + if (rcvbuflen == 8) { + /* Perform some sanity check on the message signature + * and length. */ + if (memcmp(hdr->sig,"RCmb",4) != 0 || + ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN) + { redisLog(REDIS_WARNING, - "Bad message length received from Cluster bus."); + "Bad message length or signature received " + "from Cluster bus."); handleLinkIOError(link); return; } @@ -2255,8 +2582,7 @@ void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) { /* Total length obtained? Process this packet. */ // 检查已读入内容的长度,看是否整条信息已经被读入了 - if (rcvbuflen >= 4 && rcvbuflen == ntohl(hdr->totlen)) { - + if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) { // 如果是的话,执行处理信息的函数 if (clusterProcessPacket(link)) { sdsfree(link->rcvbuf); @@ -2297,7 +2623,7 @@ void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) { * a connected link. * * 向节点连接的所有其他节点发送信息。 - * + * * It is guaranteed that this function will never have as a side effect * some node->link to be invalidated, so it is safe to call this function * from event handlers that will do stuff with node links later. */ @@ -2328,7 +2654,7 @@ void clusterBroadcastMessage(void *buf, size_t len) { void clusterBuildMessageHdr(clusterMsg *hdr, int type) { int totlen = 0; uint64_t offset; - clusterNode *master, *myself = server.cluster->myself; + clusterNode *master; /* If this node is a master, we send its slots bitmap and configEpoch. * @@ -2343,12 +2669,17 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { * 因为接收信息的节点通过标识可以知道这个节点是一个从节点, * 所以接收信息的节点不会将从节点错认作是主节点。 */ - master = (myself->flags & REDIS_NODE_SLAVE && myself->slaveof) ? + master = (nodeIsSlave(myself) && myself->slaveof) ? myself->slaveof : myself; // 清零信息头 memset(hdr,0,sizeof(*hdr)); + hdr->sig[0] = 'R'; + hdr->sig[1] = 'C'; + hdr->sig[2] = 'm'; + hdr->sig[3] = 'b'; + // 设置信息类型 hdr->type = htons(type); @@ -2382,18 +2713,16 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { /* Set the replication offset. */ // 设置复制偏移量 - if (myself->flags & REDIS_NODE_SLAVE) { - if (server.master) - offset = server.master->reploff; - else if (server.cached_master) - offset = server.cached_master->reploff; - else - offset = 0; - } else { + if (nodeIsSlave(myself)) + offset = replicationGetSlaveOffset(); + else offset = server.master_repl_offset; - } hdr->offset = htonu64(offset); + /* Set the message flags. 
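+     *
+     * mflags[0] is used for the manual failover bits seen in this file:
+     * a master with a manual failover in progress advertises
+     * CLUSTERMSG_FLAG0_PAUSED here, while clusterRequestFailoverAuth()
+     * adds CLUSTERMSG_FLAG0_FORCEACK to its vote request so masters
+     * grant the vote even if the old master is still reachable.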
*/ + if (nodeIsMaster(myself) && server.cluster->mf_end) + hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED; + /* Compute the message length for certain messages. For other messages * this is up to the caller. */ // 计算信息的长度 @@ -2437,7 +2766,7 @@ void clusterSendPing(clusterLink *link, int type) { // 将当前节点的信息(比如名字、地址、端口号、负责处理的槽)记录到消息里面 clusterBuildMessageHdr(hdr,type); - + /* Populate the gossip fields */ // 从当前节点已知的节点中随机选出两个节点 // 并通过这条消息捎带给目标节点,从而实现 gossip 协议 @@ -2445,9 +2774,8 @@ void clusterSendPing(clusterLink *link, int type) { // 每个节点有 freshnodes 次发送 gossip 信息的机会 // 每次向目标节点发送 2 个被选中节点的 gossip 信息(gossipcount 计数) while(freshnodes > 0 && gossipcount < 3) { - // 从 nodes 字典中随机选出一个节点(被选中节点) - struct dictEntry *de = dictGetRandomKey(server.cluster->nodes); + dictEntry *de = dictGetRandomKey(server.cluster->nodes); clusterNode *this = dictGetVal(de); clusterMsgDataGossip *gossip; @@ -2464,7 +2792,7 @@ void clusterSendPing(clusterLink *link, int type) { * 4) Disconnected nodes if they don't have configured slots. * 因为不处理任何槽而被断开连接的节点 */ - if (this == server.cluster->myself || + if (this == myself || this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) || (this->link == NULL && this->numslots == 0)) { @@ -2535,8 +2863,16 @@ void clusterSendPing(clusterLink *link, int type) { * 因此广播 PONG 回复在配置发生变化(比如从节点转变为主节点), * 并且当前节点想让其他节点尽快知悉这一变化的时候, * 就会广播 PONG 回复。 + * + * The 'target' argument specifies the receiving instances using the + * defines below: + * + * CLUSTER_BROADCAST_ALL -> All known instances. + * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring. */ -void clusterBroadcastPong(void) { +#define CLUSTER_BROADCAST_ALL 0 +#define CLUSTER_BROADCAST_LOCAL_SLAVES 1 +void clusterBroadcastPong(int target) { dictIterator *di; dictEntry *de; @@ -2547,9 +2883,13 @@ void clusterBroadcastPong(void) { // 不向未建立连接的节点发送 if (!node->link) continue; - // 不向 HANDSHAKE 以及自己发送 - if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; - + if (node == myself || nodeInHandshake(node)) continue; + if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) { + int local_slave = + nodeIsSlave(node) && node->slaveof && + (node->slaveof == myself || node->slaveof == myself->slaveof); + if (!local_slave) continue; + } // 发送 PONG 信息 clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG); } @@ -2708,6 +3048,10 @@ void clusterRequestFailoverAuth(void) { // 设置信息头(包含当前节点的信息) clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST); + /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit + * in the header to communicate the nodes receiving the message that + * they should authorized the failover even if the master is working. */ + if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); hdr->totlen = htonl(totlen); @@ -2729,6 +3073,19 @@ void clusterSendFailoverAuth(clusterNode *node) { clusterSendMessage(node->link,buf,totlen); } +/* Send a MFSTART message to the specified node. */ +void clusterSendMFStart(clusterNode *node) { + unsigned char buf[sizeof(clusterMsg)]; + clusterMsg *hdr = (clusterMsg*) buf; + uint32_t totlen; + + if (!node->link) return; + clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_MFSTART); + totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + hdr->totlen = htonl(totlen); + clusterSendMessage(node->link,buf,totlen); +} + /* Vote for the node asking for our vote if there are the conditions. 
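 *
 * In short, the checks below are: we must be a master serving at least
 * one slot; the request's currentEpoch must not be older than ours; we
 * vote at most once per epoch (lastVoteEpoch); the requester must be a
 * slave whose master is failing, unless the request carries
 * CLUSTERMSG_FLAG0_FORCEACK (manual failover); we do not vote again for
 * a slave of the same master within two node timeouts; and none of the
 * claimed slots may have a configEpoch in our own table newer than the
 * one carried by the request.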
*/ // 在条件满足的情况下,为请求进行故障转移的节点 node 进行投票,支持它进行故障转移 void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { @@ -2744,18 +3101,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { // 请求节点的槽布局 unsigned char *claimed_slots = request->myslots; - + int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK; int j; /* IF we are not a master serving at least 1 slot, we don't have the * right to vote, as the cluster size in Redis Cluster is the number * of masters serving at least one slot, and quorum is the cluster * size + 1 */ - // 非主节点无权投票 - if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return; - // 没有处理任何槽的节点无权投票 - if (server.cluster->myself->numslots == 0) return; + // 如果节点为从节点,或者是一个没有处理任何槽的主节点, + // 那么它没有投票权 + if (nodeIsSlave(myself) || myself->numslots == 0) return; /* Request epoch must be >= our currentEpoch. */ // 请求的配置纪元必须大于等于当前节点的配置纪元 @@ -2763,13 +3119,13 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* I already voted for this epoch? Return ASAP. */ // 已经投过票了 - if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return; + if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) return; - /* Node must be a slave and its master down. */ - // 请求节点必须是从服务器,并且它的主节点必须已经 FAIL - if (!(node->flags & REDIS_NODE_SLAVE) || - master == NULL || - !(master->flags & REDIS_NODE_FAIL)) return; + /* Node must be a slave and its master down. + * The master can be non failing if the request is flagged + * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */ + if (nodeIsMaster(node) || master == NULL || + (!nodeFailed(master) && !force_ack)) return; /* We did not voted for a slave about this master for two * times the node timeout. This is not strictly needed for correctness @@ -2788,7 +3144,10 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { // 查找是否有某个槽的配置纪元大于节点请求的纪元 if (server.cluster->slots[j] == NULL || - server.cluster->slots[j]->configEpoch <= requestConfigEpoch) continue; + server.cluster->slots[j]->configEpoch <= requestConfigEpoch) + { + continue; + } // 如果有的话,说明节点请求的纪元已经过期,没有必要进行投票 /* If we reached this point we found a slot that in our current slots @@ -2800,12 +3159,39 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this slave. */ // 为节点投票 clusterSendFailoverAuth(node); - // 更新时间值 - server.cluster->last_vote_epoch = server.cluster->currentEpoch; + server.cluster->lastVoteEpoch = server.cluster->currentEpoch; node->slaveof->voted_time = mstime(); } +/* This function returns the "rank" of this instance, a slave, in the context + * of its master-slaves ring. The rank of the slave is given by the number of + * other slaves for the same master that have a better replication offset + * compared to the local one (better means, greater, so they claim more data). + * + * A slave with rank 0 is the one with the greatest (most up to date) + * replication offset, and so forth. Note that because how the rank is computed + * multiple slaves may have the same rank, in case they have the same offset. + * + * The slave rank is used to add a delay to start an election in order to + * get voted and replace a failing master. Slaves with better replication + * offsets are more likely to win. 
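+ *
+ * A worked example with invented offsets: a master has three slaves at
+ * replication offsets 200, 150 and 150; their ranks are 0, 1 and 1.
+ * Since the election delay below is a fixed 500 ms, plus a random
+ * 0-500 ms, plus rank * 1000 ms, the most up to date slave normally
+ * starts (and wins) the election first.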
*/ +int clusterGetSlaveRank(void) { + long long myoffset; + int j, rank = 0; + clusterNode *master; + + redisAssert(nodeIsSlave(myself)); + master = myself->slaveof; + if (master == NULL) return 0; /* Never called by slaves without master. */ + + myoffset = replicationGetSlaveOffset(); + for (j = 0; j < master->numslaves; j++) + if (master->slaves[j] != myself && + master->slaves[j]->repl_offset > myoffset) rank++; + return rank; +} + /* This function is called if we are a slave node and our master serving * a non-zero amount of hash slots is in FAIL state. * @@ -2827,31 +3213,44 @@ void clusterHandleSlaveFailover(void) { mstime_t data_age; mstime_t auth_age = mstime() - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; + int manual_failover = server.cluster->mf_end != 0 && + server.cluster->mf_can_start; int j; + mstime_t auth_timeout, auth_retry_time; + + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER; + + /* Compute the failover timeout (the max time we have to send votes + * and wait for replies), and the failover retry time (the time to wait + * before waiting again. + * + * Timeout is MIN(NODE_TIMEOUT*2,2000) milliseconds. + * Retry is two times the Timeout. + */ + auth_timeout = server.cluster_node_timeout*2; + if (auth_timeout < 2000) auth_timeout = 2000; + auth_retry_time = auth_timeout*2; + + /* Pre conditions to run the function, that must be met both in case + * of an automatic or manual failover: + * 1) We are a slave. + * 2) Our master is flagged as FAIL, or this is a manual failover. + * 3) It is serving slots. */ + if (nodeIsMaster(myself) || + myself->slaveof == NULL || + (!nodeFailed(myself->slaveof) && !manual_failover) || + myself->slaveof->numslots == 0) return; /* Set data_age to the number of seconds we are disconnected from * the master. */ // 将 data_age 设置为从节点与主节点的断开秒数 if (server.repl_state == REDIS_REPL_CONNECTED) { - data_age = (server.unixtime - server.master->lastinteraction) * 1000; + data_age = (mstime_t)(server.unixtime - server.master->lastinteraction) + * 1000; } else { - data_age = (server.unixtime - server.repl_down_since) * 1000; + data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000; } - /* Pre conditions to run the function: - * 执行函数的条件: - * 1) We are a slave. - * 当前节点是从节点 - * 2) Our master is flagged as FAIL. - * 这个从节点的主节点状态为 FAIL - * 3) It is serving slots. - * FAIL 的主节点正在处理某个(或某些)槽 - */ - if (!(server.cluster->myself->flags & REDIS_NODE_SLAVE) || - server.cluster->myself->slaveof == NULL || - !(server.cluster->myself->slaveof->flags & REDIS_NODE_FAIL) || - server.cluster->myself->slaveof->numslots == 0) return; - /* Remove the node timeout from the data age as it is fine that we are * disconnected from our master at least for the time it was down to be * flagged as FAIL, that's the baseline. */ @@ -2861,31 +3260,70 @@ void clusterHandleSlaveFailover(void) { /* Check if our data is recent enough. For now we just use a fixed * constant of ten times the node timeout since the cluster should - * react much faster to a master down. */ + * react much faster to a master down. + * + * Check bypassed for manual failovers. */ // 检查这个从节点的数据是否较新: // 目前的检测办法是断线时间不能超过 node timeout 的十倍 if (data_age > - (server.repl_ping_slave_period * 1000) + + ((mstime_t)server.repl_ping_slave_period * 1000) + (server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT)) - return; - - /* Compute the time at which we can start an election. 
*/ - // 在开始故障转移之前,先等待一段时间 - if (auth_age > - server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) { + if (!manual_failover) return; + } + + /* If the previous failover attempt timedout and the retry time has + * elapsed, we can setup a new one. */ + if (auth_age > auth_retry_time) { server.cluster->failover_auth_time = mstime() + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ - data_age / 10 + /* Add 100 milliseconds for every second of age. */ random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; server.cluster->failover_auth_sent = 0; + server.cluster->failover_auth_rank = clusterGetSlaveRank(); + /* We add another delay that is proportional to the slave rank. + * Specifically 1 second * rank. This way slaves that have a probably + * less updated replication offset, are penalized. */ + server.cluster->failover_auth_time += + server.cluster->failover_auth_rank * 1000; + /* However if this is a manual failover, no delay is needed. */ + if (server.cluster->mf_end) { + server.cluster->failover_auth_time = mstime(); + server.cluster->failover_auth_rank = 0; + } redisLog(REDIS_WARNING, - "Start of election delayed for %lld milliseconds.", - server.cluster->failover_auth_time - mstime()); + "Start of election delayed for %lld milliseconds " + "(rank #%d, offset %lld).", + server.cluster->failover_auth_time - mstime(), + server.cluster->failover_auth_rank, + replicationGetSlaveOffset()); + /* Now that we have a scheduled election, broadcast our offset + * to all the other slaves so that they'll updated their offsets + * if our offset is better. */ + clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES); return; } + /* It is possible that we received more updated offsets from other + * slaves for the same master since we computed our election delay. + * Update the delay if our rank changed. + * + * Not performed if this is a manual failover. */ + if (server.cluster->failover_auth_sent == 0 && + server.cluster->mf_end == 0) + { + int newrank = clusterGetSlaveRank(); + if (newrank > server.cluster->failover_auth_rank) { + long long added_delay = + (newrank - server.cluster->failover_auth_rank) * 1000; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_auth_rank = newrank; + redisLog(REDIS_WARNING, + "Slave rank updated to #%d, added %lld milliseconds of delay.", + newrank, added_delay); + } + } + /* Return ASAP if we can't still start the election. */ // 如果执行故障转移的时间未到,先返回 if (mstime() < server.cluster->failover_auth_time) return; @@ -2894,7 +3332,7 @@ void clusterHandleSlaveFailover(void) { // 如果距离应该执行故障转移的时间已经过了很久 // 那么不应该再执行故障转移了(因为可能已经没有需要了) // 直接返回 - if (auth_age > server.cluster_node_timeout) return; + if (auth_age > auth_timeout) return; /* Ask for votes if needed. */ // 向其他节点发送故障转移请求 @@ -2929,9 +3367,8 @@ void clusterHandleSlaveFailover(void) { /* Check if we reached the quorum. */ // 如果当前节点获得了足够多的投票,那么对下线主节点进行故障转移 if (server.cluster->failover_auth_count >= needed_quorum) { - // 旧主节点 - clusterNode *oldmaster = server.cluster->myself->slaveof; + clusterNode *oldmaster = myself->slaveof; redisLog(REDIS_WARNING, "Failover election won: I'm the new master."); @@ -2942,15 +3379,7 @@ void clusterHandleSlaveFailover(void) { * 1) Turn this node into a master. 
* 将当前节点的身份由从节点改为主节点 */ - // 在 slaves 字典中移除当前节点 - clusterNodeRemoveSlave(server.cluster->myself->slaveof, - server.cluster->myself); - // 关闭从节点标记 - server.cluster->myself->flags &= ~REDIS_NODE_SLAVE; - // 打开主节点标记 - server.cluster->myself->flags |= REDIS_NODE_MASTER; - // 清空 slaveof 对象 - server.cluster->myself->slaveof = NULL; + clusterSetNodeAsMaster(myself); // 让从节点取消复制,成为新的主节点 replicationUnsetMaster(); @@ -2961,14 +3390,13 @@ void clusterHandleSlaveFailover(void) { // 将槽设置为未分配的 clusterDelSlot(j); // 将槽的负责人设置为当前节点 - clusterAddSlot(server.cluster->myself,j); + clusterAddSlot(myself,j); } } /* 3) Update my configEpoch to the epoch of the election. */ // 更新集群配置纪元 - server.cluster->myself->configEpoch = - server.cluster->failover_auth_epoch; + myself->configEpoch = server.cluster->failover_auth_epoch; /* 4) Update state and save config. */ // 更新节点状态 @@ -2980,7 +3408,172 @@ void clusterHandleSlaveFailover(void) { * accordingly and detect that we switched to master role. */ // 向所有节点发送 PONG 信息 // 让它们可以知道当前节点已经升级为主节点了 - clusterBroadcastPong(); + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + + /* 6) If there was a manual failover in progress, clear the state. */ + resetManualFailover(); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER slave migration + * + * Slave migration is the process that allows a slave of a master that is + * already covered by at least another slave, to "migrate" to a master that + * is orpaned, that is, left with no working slaves. + * -------------------------------------------------------------------------- */ + +/* This function is responsible to decide if this replica should be migrated + * to a different (orphaned) master. It is called by the clusterCron() function + * only if: + * + * 1) We are a slave node. + * 2) It was detected that there is at least one orphaned master in + * the cluster. + * 3) We are a slave of one of the masters with the greatest number of + * slaves. + * + * This checks are performed by the caller since it requires to iterate + * the nodes anyway, so we spend time into clusterHandleSlaveMigration() + * if definitely needed. + * + * The fuction is called with a pre-computed max_slaves, that is the max + * number of working (not in FAIL state) slaves for a single master. + * + * Additional conditions for migration are examined inside the function. + */ +void clusterHandleSlaveMigration(int max_slaves) { + int j, okslaves = 0; + clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL; + dictIterator *di; + dictEntry *de; + + /* Step 1: Don't migrate if the cluster state is not ok. */ + if (server.cluster->state != REDIS_CLUSTER_OK) return; + + /* Step 2: Don't migrate if my master will not be left with at least + * 'migration-barrier' slaves after my migration. */ + if (mymaster == NULL) return; + for (j = 0; j < mymaster->numslaves; j++) + if (!nodeFailed(mymaster->slaves[j]) && + !nodeTimedOut(mymaster->slaves[j])) okslaves++; + if (okslaves <= server.cluster_migration_barrier) return; + + /* Step 3: Idenitfy a candidate for migration, and check if among the + * masters with the greatest number of ok slaves, I'm the one with the + * smaller node ID. + * + * Note that this means that eventually a replica migration will occurr + * since slaves that are reachable again always have their FAIL flag + * cleared. 
At the same time this does not mean that there are no + * race conditions possible (two slaves migrating at the same time), but + * this is extremely unlikely to happen, and harmless. */ + candidate = myself; + di = dictGetSafeIterator(server.cluster->nodes); + while((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + int okslaves; + + /* Only iterate over working masters. */ + if (nodeIsSlave(node) || nodeFailed(node)) continue; + okslaves = clusterCountNonFailingSlaves(node); + + if (okslaves == 0 && target == NULL && node->numslots > 0) + target = node; + + if (okslaves == max_slaves) { + for (j = 0; j < node->numslaves; j++) { + if (memcmp(node->slaves[j]->name, + candidate->name, + REDIS_CLUSTER_NAMELEN) < 0) + { + candidate = node->slaves[j]; + } + } + } + } + + /* Step 4: perform the migration if there is a target, and if I'm the + * candidate. */ + if (target && candidate == myself) { + redisLog(REDIS_WARNING,"Migrating to orphaned master %.40s", + target->name); + clusterSetMaster(target); + } +} + +/* ----------------------------------------------------------------------------- + * CLUSTER manual failover + * + * This are the important steps performed by slaves during a manual failover: + * 1) User send CLUSTER FAILOVER command. The failover state is initialized + * setting mf_end to the millisecond unix time at which we'll abort the + * attempt. + * 2) Slave sends a MFSTART message to the master requesting to pause clients + * for two times the manual failover timeout REDIS_CLUSTER_MF_TIMEOUT. + * When master is paused for manual failover, it also starts to flag + * packets with CLUSTERMSG_FLAG0_PAUSED. + * 3) Slave waits for master to send its replication offset flagged as PAUSED. + * 4) If slave received the offset from the master, and its offset matches, + * mf_can_start is set to 1, and clusterHandleSlaveFailover() will perform + * the failover as usually, with the difference that the vote request + * will be modified to force masters to vote for a slave that has a + * working master. + * + * From the point of view of the master things are simpler: when a + * PAUSE_CLIENTS packet is received the master sets mf_end as well and + * the sender in mf_slave. During the time limit for the manual failover + * the master will just send PINGs more often to this slave, flagged with + * the PAUSED flag, so that the slave will set mf_master_offset when receiving + * a packet from the master with this flag set. + * + * The gaol of the manual failover is to perform a fast failover without + * data loss due to the asynchronous master-slave replication. + * -------------------------------------------------------------------------- */ + +/* Reset the manual failover state. This works for both masters and slavesa + * as all the state about manual failover is cleared. + * + * The function can be used both to initialize the manual failover state at + * startup or to abort a manual failover in progress. */ +void resetManualFailover(void) { + if (server.cluster->mf_end && clientsArePaused()) { + server.clients_pause_end_time = 0; + clientsArePaused(); /* Just use the side effect of the function. */ + } + server.cluster->mf_end = 0; /* No manual failover in progress. */ + server.cluster->mf_can_start = 0; + server.cluster->mf_slave = NULL; + server.cluster->mf_master_offset = 0; +} + +/* If a manual failover timed out, abort it. 
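+ *
+ * mf_end is the deadline armed when the failover started: the master
+ * sets it to mstime() + REDIS_CLUSTER_MF_TIMEOUT in the MFSTART handler
+ * above, and the slave sets its own deadline when CLUSTER FAILOVER is
+ * issued (step 1 in the comment above). Once the deadline passes, the
+ * whole manual failover state is simply wiped.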
*/ +void manualFailoverCheckTimeout(void) { + if (server.cluster->mf_end && server.cluster->mf_end < mstime()) { + redisLog(REDIS_WARNING,"Manual failover timed out."); + resetManualFailover(); + } +} + +/* This function is called from the cluster cron function in order to go + * forward with a manual failover state machine. */ +void clusterHandleManualFailover(void) { + /* Return ASAP if no manual failover is in progress. */ + if (server.cluster->mf_end == 0) return; + + /* If mf_can_start is non-zero, the failover was alrady triggered so the + * next steps are performed by clusterHandleSlaveFailover(). */ + if (server.cluster->mf_can_start) return; + + if (server.cluster->mf_master_offset == 0) return; /* Wait for offset... */ + + if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) { + /* Our replication offset matches the master replication offset + * announced after clients were paused. We can start the failover. */ + server.cluster->mf_can_start = 1; + redisLog(REDIS_WARNING, + "All master replication stream processed, " + "manual failover can start."); } } @@ -2993,7 +3586,10 @@ void clusterHandleSlaveFailover(void) { void clusterCron(void) { dictIterator *di; dictEntry *de; - int j, update_state = 0; + int update_state = 0; + int orphaned_masters; /* How many masters there are without ok slaves. */ + int max_slaves; /* Max number of ok slaves for a single master. */ + int this_slaves; /* Number of ok slaves for our master (if we are slave). */ mstime_t min_pong = 0, now = mstime(); clusterNode *min_pong_node = NULL; // 迭代计数器,一个静态变量 @@ -3027,9 +3623,7 @@ void clusterCron(void) { /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. */ // 如果 handshake 节点已超时,释放它 - if (node->flags & REDIS_NODE_HANDSHAKE && - now - node->ctime > handshake_timeout) - { + if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { freeClusterNode(node); continue; } @@ -3040,15 +3634,21 @@ void clusterCron(void) { mstime_t old_ping_sent; clusterLink *link; - // 创建连接 - fd = anetTcpNonBlockConnect(server.neterr, node->ip, - node->port+REDIS_CLUSTER_PORT_INCR); - if (fd == -1) continue; + fd = anetTcpNonBlockBindConnect(server.neterr, node->ip, + node->port+REDIS_CLUSTER_PORT_INCR, + server.bindaddr_count ? server.bindaddr[0] : NULL); + if (fd == -1) { + redisLog(REDIS_DEBUG, "Unable to connect to " + "Cluster Node [%s]:%d -> %s", node->ip, + node->port+REDIS_CLUSTER_PORT_INCR, + server.neterr); + continue; + } link = createClusterLink(node); link->fd = fd; node->link = link; - // 关联读事件处理器 - aeCreateFileEvent(server.el,link->fd,AE_READABLE,clusterReadHandler,link); + aeCreateFileEvent(server.el,link->fd,AE_READABLE, + clusterReadHandler,link); /* Queue a PING in the new connection ASAP: this is crucial * to avoid false positives in failure detection. * @@ -3087,7 +3687,8 @@ void clusterCron(void) { */ node->flags &= ~REDIS_NODE_MEET; - redisLog(REDIS_DEBUG,"Connecting with Node %.40s at %s:%d", node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR); + redisLog(REDIS_DEBUG,"Connecting with Node %.40s at %s:%d", + node->name, node->ip, node->port+REDIS_CLUSTER_PORT_INCR); } } dictReleaseIterator(di); @@ -3096,6 +3697,8 @@ void clusterCron(void) { * one random node every second. */ // clusterCron() 每执行 10 次(至少间隔一秒钟),就向一个随机节点发送 gossip 信息 if (!(iteration % 10)) { + int j; + /* Check a few random nodes and ping the one with the oldest * pong_received time. 
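         *
         * Concretely: a handful of random nodes are sampled, nodes with
         * no link, an outstanding PING, or the MYSELF/HANDSHAKE flags are
         * skipped, and the survivor with the oldest pong_received is
         * remembered in min_pong_node; that is the node pinged by this
         * (roughly once per second) pass.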
*/ // 随机 5 个节点,选出其中一个 @@ -3108,7 +3711,9 @@ void clusterCron(void) { /* Don't ping nodes disconnected or with a ping currently active. */ // 不要 PING 连接断开的节点,也不要 PING 最近已经 PING 过的节点 if (this->link == NULL || this->ping_sent != 0) continue; - if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) continue; + + if (this->flags & (REDIS_NODE_MYSELF|REDIS_NODE_HANDSHAKE)) + continue; // 选出 5 个随机节点中最近一次接收 PONG 回复距离现在最旧的节点 if (min_pong_node == NULL || min_pong > this->pong_received) { @@ -3124,8 +3729,16 @@ void clusterCron(void) { } } - /* Iterate nodes to check if we need to flag something as failing */ // 遍历所有节点,检查是否需要将某个节点标记为下线 + /* Iterate nodes to check if we need to flag something as failing. + * This loop is also responsible to: + * 1) Check if there are orphaned masters (masters without non failing + * slaves). + * 2) Count the max number of non failing slaves for a single master. + * 3) Count the number of slaves for our master, if we are a slave. */ + orphaned_masters = 0; + max_slaves = 0; + this_slaves = 0; di = dictGetSafeIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); @@ -3137,6 +3750,17 @@ void clusterCron(void) { (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE)) continue; + /* Orphaned master check, useful only if the current instance + * is a slave that may migrate to another master. */ + if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) { + int okslaves = clusterCountNonFailingSlaves(node); + + if (okslaves == 0 && node->numslots > 0) orphaned_masters++; + if (okslaves > max_slaves) max_slaves = okslaves; + if (nodeIsSlave(myself) && myself->slaveof == node) + this_slaves = okslaves; + } + /* If we are waiting for the PONG more than half the cluster * timeout, reconnect the link: maybe there is a connection * issue even if the node is alive. */ @@ -3171,6 +3795,17 @@ void clusterCron(void) { continue; } + /* If we are a master and one of the slaves requested a manual + * failover, ping it continuously. */ + if (server.cluster->mf_end && + nodeIsMaster(myself) && + server.cluster->mf_slave == node && + node->link) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + /* Check only if we have an active ping for this instance. */ // 以下代码只在节点发送了 PING 命令的情况下执行 if (node->ping_sent == 0) continue; @@ -3200,18 +3835,30 @@ void clusterCron(void) { * enable it if we know the address of our master and it appears to * be up. */ // 如果从节点没有在复制主节点,那么对从节点进行设置 - if (server.cluster->myself->flags & REDIS_NODE_SLAVE && + if (nodeIsSlave(myself) && server.masterhost == NULL && - server.cluster->myself->slaveof && - !(server.cluster->myself->slaveof->flags & REDIS_NODE_NOADDR)) + myself->slaveof && + nodeHasAddr(myself->slaveof)) { - replicationSetMaster(server.cluster->myself->slaveof->ip, - server.cluster->myself->slaveof->port); + replicationSetMaster(myself->slaveof->ip, myself->slaveof->port); + } + + /* Abourt a manual failover if the timeout is reached. */ + manualFailoverCheckTimeout(); + + if (nodeIsSlave(myself)) { + clusterHandleManualFailover(); + clusterHandleSlaveFailover(); + /* If there are orphaned slaves, and we are a slave among the masters + * with the max number of non-failing slaves, consider migrating to + * the orphaned masters. Note that it does not make sense to try + * a migration if there is no master with at least *two* working + * slaves. 
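+         *
+         * An invented example: master M1 has three working slaves and
+         * master M2 (serving slots) has lost all of its own, so
+         * orphaned_masters is 1 and max_slaves is 3. If we are one of
+         * M1's slaves, this_slaves == max_slaves and we call
+         * clusterHandleSlaveMigration(3), which moves the slave with the
+         * smallest Node ID over to M2, provided M1 still counts more
+         * than cluster_migration_barrier working slaves.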
*/ + if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves) + clusterHandleSlaveMigration(max_slaves); } - // 如果条件满足的话,执行故障转移 - clusterHandleSlaveFailover(); - // 更新节点状态 + // 更新集群状态 if (update_state || server.cluster->state == REDIS_CLUSTER_FAIL) clusterUpdateState(); } @@ -3241,12 +3888,13 @@ void clusterBeforeSleep(void) { /* Save the config, possibly using fsync. */ // 保存 nodes.conf 配置文件 if (server.cluster->todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG) { - int fsync = server.cluster->todo_before_sleep & CLUSTER_TODO_FSYNC_CONFIG; + int fsync = server.cluster->todo_before_sleep & + CLUSTER_TODO_FSYNC_CONFIG; clusterSaveConfigOrDie(fsync); } - /* Reset our flags. */ - // 重置 flag + /* Reset our flags (not strictly needed since every single function + * called for flags set should be able to clear its flag). */ server.cluster->todo_before_sleep = 0; } @@ -3371,6 +4019,15 @@ int clusterDelNodeSlots(clusterNode *node) { return deleted; } +/* Clear the migrating / importing state for all the slots. + * This is useful at initialization and when turning a master into slave. */ +void clusterCloseAllSlots(void) { + memset(server.cluster->migrating_slots_to,0, + sizeof(server.cluster->migrating_slots_to)); + memset(server.cluster->importing_slots_from,0, + sizeof(server.cluster->importing_slots_from)); +} + /* ----------------------------------------------------------------------------- * Cluster state evaluation function * -------------------------------------------------------------------------- */ @@ -3390,6 +4047,8 @@ void clusterUpdateState(void) { static mstime_t among_minority_time; static mstime_t first_call_time = 0; + server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE; + /* If this is a master node, wait some time before turning the state * into OK, since it is not a good idea to rejoin the cluster as a writable * master, after a reboot, without giving the cluster a chance to @@ -3397,7 +4056,7 @@ void clusterUpdateState(void) { * the first call to this function and not since the server start, in order * to don't count the DB loading time. */ if (first_call_time == 0) first_call_time = mstime(); - if (server.cluster->myself->flags & REDIS_NODE_MASTER && + if (nodeIsMaster(myself) && mstime() - first_call_time < REDIS_CLUSTER_WRITABLE_DELAY) return; /* Start assuming the state is OK. We'll turn it into FAIL if there @@ -3433,7 +4092,7 @@ void clusterUpdateState(void) { while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); - if (node->flags & REDIS_NODE_MASTER && node->numslots) { + if (nodeIsMaster(node) && node->numslots) { server.cluster->size++; if (node->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) unreachable_masters++; @@ -3451,7 +4110,7 @@ void clusterUpdateState(void) { */ { int needed_quorum = (server.cluster->size / 2) + 1; - + if (unreachable_masters >= needed_quorum) { new_state = REDIS_CLUSTER_FAIL; among_minority_time = mstime(); @@ -3473,7 +4132,7 @@ void clusterUpdateState(void) { rejoin_delay = REDIS_CLUSTER_MIN_REJOIN_DELAY; if (new_state == REDIS_CLUSTER_OK && - server.cluster->myself->flags & REDIS_NODE_MASTER && + nodeIsMaster(myself) && mstime() - among_minority_time < rejoin_delay) { return; @@ -3519,7 +4178,7 @@ int verifyClusterConfigWithData(void) { /* If this node is a slave, don't perform the check at all as we * completely depend on the replication stream. 
*/ // 不对从节点进行检查 - if (server.cluster->myself->flags & REDIS_NODE_SLAVE) return REDIS_OK; + if (nodeIsSlave(myself)) return REDIS_OK; /* Make sure we only have keys in DB0. */ // 确保只有 0 号数据库有数据 @@ -3536,7 +4195,7 @@ int verifyClusterConfigWithData(void) { * In both cases check the next slot as the configuration makes * sense. */ // 跳过正在导入的槽 - if (server.cluster->slots[j] == server.cluster->myself || + if (server.cluster->slots[j] == myself || server.cluster->importing_slots_from[j] != NULL) continue; /* If we are here data and cluster config don't agree, and we have @@ -3550,7 +4209,7 @@ int verifyClusterConfigWithData(void) { redisLog(REDIS_WARNING, "I've keys about slot %d that is " "unassigned. Taking responsability " "for it.",j); - clusterAddSlot(server.cluster->myself,j); + clusterAddSlot(myself,j); } else { // 如果一个槽已经被其他节点接管 // 那么将槽中的资料发送给对方 @@ -3571,21 +4230,16 @@ int verifyClusterConfigWithData(void) { * SLAVE nodes handling * -------------------------------------------------------------------------- */ -/* Set the specified node 'n' as master. Setup the node as a slave if - * needed. */ -// 将节点 n 设置为当前节点的主节点 +/* Set the specified node 'n' as master for this node. + * If this node is currently a master, it is turned into a slave. */ void clusterSetMaster(clusterNode *n) { - - // 指向当前节点 - clusterNode *myself = server.cluster->myself; - redisAssert(n != myself); redisAssert(myself->numslots == 0); - // 设置当前节点的标识值 - if (myself->flags & REDIS_NODE_MASTER) { + if (nodeIsMaster(myself)) { myself->flags &= ~REDIS_NODE_MASTER; myself->flags |= REDIS_NODE_SLAVE; + clusterCloseAllSlots(); } else { if (myself->slaveof) clusterNodeRemoveSlave(myself->slaveof,myself); @@ -3597,6 +4251,7 @@ void clusterSetMaster(clusterNode *n) { // 设置主节点的 IP 和地址,开始对它进行复制 clusterNodeAddSlave(n,myself); replicationSetMaster(n->ip, n->port); + resetManualFailover(); } /* ----------------------------------------------------------------------------- @@ -3651,7 +4306,7 @@ sds clusterGenNodeDescription(clusterNode *node) { if (start == -1) start = j; } if (start != -1 && (!bit || j == REDIS_CLUSTER_SLOTS-1)) { - if (j == REDIS_CLUSTER_SLOTS-1) j++; + if (bit && j == REDIS_CLUSTER_SLOTS-1) j++; if (start == j-1) { ci = sdscatprintf(ci," %d",start); @@ -3751,11 +4406,12 @@ void clusterCommand(redisClient *c) { /* CLUSTER MEET */ // 将给定地址的节点添加到当前节点所处的集群里面 - long port; + long long port; // 检查 port 参数的合法性 - if (getLongFromObjectOrReply(c, c->argv[3], &port, NULL) != REDIS_OK) { - addReplyError(c,"Invalid TCP port specified"); + if (getLongLongFromObject(c->argv[3], &port) != REDIS_OK) { + addReplyErrorFormat(c,"Invalid TCP port specified: %s", + (char*)c->argv[3]->ptr); return; } @@ -3764,7 +4420,8 @@ void clusterCommand(redisClient *c) { errno == EINVAL) { // 连接失败 - addReplyError(c,"Invalid node address specified"); + addReplyErrorFormat(c,"Invalid node address specified: %s:%s", + (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr); } else { // 连接成功 addReply(c,shared.ok); @@ -3790,7 +4447,7 @@ void clusterCommand(redisClient *c) { return; } // 删除所有由该节点处理的槽 - clusterDelNodeSlots(server.cluster->myself); + clusterDelNodeSlots(myself); clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); @@ -3860,7 +4517,7 @@ void clusterCommand(redisClient *c) { // 添加或者删除指定 slot retval = del ? 
clusterDelSlot(j) : - clusterAddSlot(server.cluster->myself,j); + clusterAddSlot(myself,j); redisAssertWithInfo(c,NULL,retval == REDIS_OK); } } @@ -3882,9 +4539,8 @@ void clusterCommand(redisClient *c) { // CLUSTER SETSLOT MIGRATING // 将本节点的槽 slot 迁移至 node id 所指定的节点 if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) { - // 被迁移的槽必须属于本节点 - if (server.cluster->slots[slot] != server.cluster->myself) { + if (server.cluster->slots[slot] != myself) { addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot); return; } @@ -3904,7 +4560,7 @@ void clusterCommand(redisClient *c) { } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) { // 如果 slot 槽本身已经由本节点处理,那么无须进行导入 - if (server.cluster->slots[slot] == server.cluster->myself) { + if (server.cluster->slots[slot] == myself) { addReplyErrorFormat(c, "I'm already the owner of hash slot %u",slot); return; @@ -3944,29 +4600,50 @@ void clusterCommand(redisClient *c) { /* If this hash slot was served by 'myself' before to switch * make sure there are no longer local keys for this hash slot. */ // 如果这个槽之前由当前节点负责处理,那么必须保证槽里面没有键存在 - if (server.cluster->slots[slot] == server.cluster->myself && - n != server.cluster->myself) - { + if (server.cluster->slots[slot] == myself && n != myself) { if (countKeysInSlot(slot) != 0) { - addReplyErrorFormat(c, "Can't assign hashslot %d to a different node while I still hold keys for this hash slot.", slot); + addReplyErrorFormat(c, + "Can't assign hashslot %d to a different node " + "while I still hold keys for this hash slot.", slot); return; } } - - /* If this node was the slot owner and the slot was marked as - * migrating, assigning the slot to another node will clear + /* If this slot is in migrating status but we have no keys + * for it assigning the slot to another node will clear * the migratig status. */ - // 撤销本节点对 slot 的迁移计划 - if (server.cluster->slots[slot] == server.cluster->myself && + if (countKeysInSlot(slot) == 0 && server.cluster->migrating_slots_to[slot]) server.cluster->migrating_slots_to[slot] = NULL; /* If this node was importing this slot, assigning the slot to * itself also clears the importing status. */ // 撤销本节点对 slot 的导入计划 - if (n == server.cluster->myself && + if (n == myself && server.cluster->importing_slots_from[slot]) + { + /* This slot was manually migrated, set this node configEpoch + * to a new epoch so that the new version can be propagated + * by the cluster. + * + * Note that if this ever results in a collision with another + * node getting the same configEpoch, for example because a + * failover happens at the same time we close the slot, the + * configEpoch collision resolution will fix it assigning + * a different epoch to each node. 
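The epoch bump described by this comment (and performed just below via clusterGetMaxEpoch()) can be illustrated in isolation. Assuming clusterGetMaxEpoch() simply returns the largest configEpoch/currentEpoch known to the node, a toy version of the logic is:

    #include <stdint.h>
    #include <stdio.h>

    /* Standalone sketch of "bump my configEpoch after importing a slot".
     * Values are invented; the real code walks server.cluster->nodes. */
    int main(void) {
        uint64_t current_epoch = 6;          /* cluster currentEpoch */
        uint64_t node_epochs[] = {3, 6, 5};  /* configEpoch of known nodes */
        uint64_t my_epoch = 5;               /* myself->configEpoch */
        uint64_t max_epoch = current_epoch;
        size_t i;

        for (i = 0; i < sizeof(node_epochs)/sizeof(node_epochs[0]); i++)
            if (node_epochs[i] > max_epoch) max_epoch = node_epochs[i];

        /* Claim a brand new, cluster-wide unique epoch unless we already
         * hold the greatest one. */
        if (my_epoch == 0 || my_epoch != max_epoch) {
            current_epoch++;
            my_epoch = current_epoch;
        }
        printf("configEpoch after importing the slot: %llu\n",
               (unsigned long long) my_epoch);
        return 0;
    }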
*/ + uint64_t maxEpoch = clusterGetMaxEpoch(); + + if (myself->configEpoch == 0 || + myself->configEpoch != maxEpoch) + { + server.cluster->currentEpoch++; + myself->configEpoch = server.cluster->currentEpoch; + clusterDoBeforeSleep(CLUSTER_TODO_FSYNC_CONFIG); + redisLog(REDIS_WARNING, + "configEpoch set to %llu after importing slot %d", + (unsigned long long) myself->configEpoch, slot); + } server.cluster->importing_slots_from[slot] = NULL; + } // 将槽设置为未指派 clusterDelSlot(slot); @@ -3975,10 +4652,11 @@ void clusterCommand(redisClient *c) { clusterAddSlot(n,slot); } else { - addReplyError(c,"Invalid CLUSTER SETSLOT action or number of arguments"); + addReplyError(c, + "Invalid CLUSTER SETSLOT action or number of arguments"); return; } - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) { @@ -3998,13 +4676,9 @@ void clusterCommand(redisClient *c) { // 统计已指派节点的数量 slots_assigned++; - - // 统计各个不同状态下的节点的数量 - if (n->flags & REDIS_NODE_FAIL) { - // 已下线节点 + if (nodeFailed(n)) { slots_fail++; - } else if (n->flags & REDIS_NODE_PFAIL) { - // 疑似下线节点 + } else if (nodeTimedOut(n)) { slots_pfail++; } else { // 正常节点 @@ -4090,7 +4764,8 @@ void clusterCommand(redisClient *c) { if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK) return; // 取出 count 参数 - if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) != REDIS_OK) + if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) + != REDIS_OK) return; // 检查参数的合法性 if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS || maxkeys < 0) { @@ -4119,11 +4794,10 @@ void clusterCommand(redisClient *c) { if (!n) { addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr); return; - } else if (n == server.cluster->myself) { + } else if (n == myself) { addReplyError(c,"I tried hard but I can't forget myself..."); return; - } else if (server.cluster->myself->flags & REDIS_NODE_SLAVE && - server.cluster->myself->slaveof == n) { + } else if (nodeIsSlave(myself) && myself->slaveof == n) { addReplyError(c,"Can't forget my master!"); return; } @@ -4132,8 +4806,8 @@ void clusterCommand(redisClient *c) { clusterBlacklistAddNode(n); // 从集群中删除该节点 clusterDelNode(n); - - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) { @@ -4151,7 +4825,7 @@ void clusterCommand(redisClient *c) { /* I can't replicate myself. */ // 指定节点是自己,不能进行复制 - if (n == server.cluster->myself) { + if (n == myself) { addReplyError(c,"Can't replicate myself"); return; } @@ -4167,11 +4841,11 @@ void clusterCommand(redisClient *c) { * slots nor keys to accept to replicate some other node. * Slaves can switch to another master without issues. 
*/ // 节点必须没有被指派任何槽,并且数据库必须为空 - if (server.cluster->myself->flags & REDIS_NODE_MASTER && - (server.cluster->myself->numslots != 0 || - dictSize(server.db[0].dict) != 0)) - { - addReplyError(c,"To set a master the node must be empty and without assigned slots."); + if (nodeIsMaster(myself) && + (myself->numslots != 0 || dictSize(server.db[0].dict) != 0)) { + addReplyError(c, + "To set a master the node must be empty and " + "without assigned slots."); return; } @@ -4191,7 +4865,7 @@ void clusterCommand(redisClient *c) { return; } - if (n->flags & REDIS_NODE_SLAVE) { + if (nodeIsSlave(n)) { addReplyError(c,"The specified node is not a master"); return; } @@ -4202,6 +4876,102 @@ void clusterCommand(redisClient *c) { addReplyBulkCString(c,ni); sdsfree(ni); } + } else if (!strcasecmp(c->argv[1]->ptr,"failover") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER FAILOVER [FORCE] */ + int force = 0; + + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"force")) { + force = 1; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + + if (nodeIsMaster(myself)) { + addReplyError(c,"You should send CLUSTER FAILOVER to a slave"); + return; + } else if (!force && + (myself->slaveof == NULL || nodeFailed(myself->slaveof) || + myself->slaveof->link == NULL)) + { + addReplyError(c,"Master is down or failed, " + "please use CLUSTER FAILOVER FORCE"); + return; + } + resetManualFailover(); + server.cluster->mf_end = mstime() + REDIS_CLUSTER_MF_TIMEOUT; + + /* If this is a forced failover, we don't need to talk with our master + * to agree about the offset. We just failover taking over it without + * coordination. */ + if (force) { + server.cluster->mf_can_start = 1; + } else { + clusterSendMFStart(myself->slaveof); + } + redisLog(REDIS_WARNING,"Manual failover user request accepted."); + addReply(c,shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3) + { + /* CLUSTER SET-CONFIG-EPOCH + * + * The user is allowed to set the config epoch only when a node is + * totally fresh: no config epoch, no other known node, and so forth. + * This happens at cluster creation time to start with a cluster where + * every node has a different node ID, without to rely on the conflicts + * resolution system which is too slow when a big cluster is created. */ + long long epoch; + + if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != REDIS_OK) + return; + + if (epoch < 0) { + addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch); + } else if (dictSize(server.cluster->nodes) > 1) { + addReplyError(c,"The user can assign a config epoch only when the " + "node does not know any other node."); + } else if (myself->configEpoch != 0) { + addReplyError(c,"Node config epoch is already non-zero"); + } else { + myself->configEpoch = epoch; + /* No need to fsync the config here since in the unlucky event + * of a failure to persist the config, the conflict resolution code + * will assign an unique config to this node. */ + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE| + CLUSTER_TODO_SAVE_CONFIG); + addReply(c,shared.ok); + } + } else if (!strcasecmp(c->argv[1]->ptr,"reset") && + (c->argc == 2 || c->argc == 3)) + { + /* CLUSTER RESET [SOFT|HARD] */ + int hard = 0; + + /* Parse soft/hard argument. Default is soft. 
*/ + if (c->argc == 3) { + if (!strcasecmp(c->argv[2]->ptr,"hard")) { + hard = 1; + } else if (!strcasecmp(c->argv[2]->ptr,"soft")) { + hard = 0; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + + /* Slaves can be reset while containing data, but not master nodes + * that must be empty. */ + if (nodeIsMaster(myself) && dictSize(c->db->dict) != 0) { + addReplyError(c,"CLUSTER RESET can't be called with " + "master nodes containing keys"); + return; + } + clusterReset(hard); + addReply(c,shared.ok); } else { addReplyError(c,"Wrong CLUSTER subcommand or number of arguments"); } @@ -4351,7 +5121,7 @@ void restoreCommand(redisClient *c) { /* Make sure this key does not already exist here... */ // 如果没有给定 REPLACE 选项,并且键已经存在,那么返回错误 if (!replace && lookupKeyWrite(c->db,c->argv[1]) != NULL) { - addReplyError(c,"Target key name is busy."); + addReply(c,shared.busykeyerr); return; } @@ -4499,7 +5269,8 @@ int migrateGetSocket(redisClient *c, robj *host, robj *port, long timeout) { // 检查连接的超时设置 if ((aeWait(fd,AE_WRITABLE,timeout) & AE_WRITABLE) == 0) { sdsfree(name); - addReplySds(c,sdsnew("-IOERR error or timeout connecting to the client\r\n")); + addReplySds(c, + sdsnew("-IOERR error or timeout connecting to the client\r\n")); close(fd); return -1; } @@ -4604,7 +5375,7 @@ void migrateCommand(redisClient *c) { addReplySds(c,sdsnew("+NOKEY\r\n")); return; } - + /* Connect */ // 获取套接字连接 fd = migrateGetSocket(c,c->argv[1],c->argv[2],timeout); @@ -4635,7 +5406,8 @@ void migrateCommand(redisClient *c) { // 写入键名和过期时间 redisAssertWithInfo(c,NULL,sdsEncodedObject(c->argv[3])); - redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr,sdslen(c->argv[3]->ptr))); + redisAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,c->argv[3]->ptr, + sdslen(c->argv[3]->ptr))); redisAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl)); /* Emit the payload argument, that is the serialized object using @@ -4780,33 +5552,43 @@ void readwriteCommand(redisClient *c) { } /* Return the pointer to the cluster node that is able to serve the command. - * For the function to succeed the command should only target a single - * key (or the same key multiple times). + * For the function to succeed the command should only target either: * - * 返回负责处理命令 cmd 的节点的 clusterNode ,集群目前只允许执行处理单个键的命令。 + * 1) A single key (even multiple times like LPOPRPUSH mylist mylist). + * 2) Multiple keys in the same hash slot, while the slot is stable (no + * resharding in progress). * - * If the returned node should be used only for this request, the *ask - * integer is set to '1', otherwise to '0'. This is used in order to - * let the caller know if we should reply with -MOVED or with -ASK. + * On success the function returns the node that is able to serve the request. + * If the node is not 'myself' a redirection must be perfomed. The kind of + * redirection is specified setting the integer passed by reference + * 'error_code', which will be set to REDIS_CLUSTER_REDIR_ASK or + * REDIS_CLUSTER_REDIR_MOVED. * - * 如果返回的节点仅被用于当此转向,那么将 ask 设置为 1 ,否则设置为 0 。 - * 根据 ask 的值,节点会判断应该是发送 -ASK 转向(临时转向)还是 -MOVED 转向(永久转向)。 + * When the node is 'myself' 'error_code' is set to REDIS_CLUSTER_REDIR_NONE. * - * If the command contains multiple keys, and as a consequence it is not - * possible to handle the request in Redis Cluster, NULL is returned. 
+ * If the command fails NULL is returned, and the reason of the failure is + * provided via 'error_code', which will be set to: * - * 如果命令包含多个键,那么这个命令不能被集群处理,函数返回 NULL 。 - */ -clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask) { + * REDIS_CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that + * don't belong to the same hash slot. + * + * REDIS_CLUSTER_REDIR_UNSTABLE if the request contains mutliple keys + * belonging to the same slot, but the slot is not stable (in migration or + * importing state, likely because a resharding is in progress). */ +clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) { // 初始化为 NULL , // 如果输入命令是无参数命令,那么 n 就会继续为 NULL clusterNode *n = NULL; robj *firstkey = NULL; + int multiple_keys = 0; multiState *ms, _ms; multiCmd mc; - int i, slot = 0; + int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0; + + /* Set error code optimistically for the base case. */ + if (error_code) *error_code = REDIS_CLUSTER_REDIR_NONE; /* We handle all the cases as if they were EXEC commands, so we have * a common code path for everything */ @@ -4816,7 +5598,7 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg if (cmd->proc == execCommand) { /* If REDIS_MULTI flag is not set EXEC is just going to return an * error. */ - if (!(c->flags & REDIS_MULTI)) return server.cluster->myself; + if (!(c->flags & REDIS_MULTI)) return myself; ms = &c->mstate; } else { /* In order to have a single codepath create a fake Multi State @@ -4830,9 +5612,8 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg mc.cmd = cmd; } - /* Check that all the keys are the same key, and get the slot and - * node for this key. */ - // 遍历事务中的命令 + /* Check that all the keys are in the same hash slot, and obtain this + * slot and the node associated. */ for (i = 0; i < ms->count; i++) { struct redisCommand *mcmd; robj **margv; @@ -4843,84 +5624,92 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg margv = ms->commands[i].argv; // 定位命令的键位置 - keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys, - REDIS_GETKEYS_ALL); - + keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys); // 遍历命令中的所有键 for (j = 0; j < numkeys; j++) { + robj *thiskey = margv[keyindex[j]]; + int thisslot = keyHashSlot((char*)thiskey->ptr, + sdslen(thiskey->ptr)); + if (firstkey == NULL) { // 这是事务中第一个被处理的键 // 获取该键的槽和负责处理该槽的节点 /* This is the first key we see. Check what is the slot * and node. */ - - // 键 - firstkey = margv[keyindex[j]]; - - // 计算负责处理键 firstkey 的槽 - slot = keyHashSlot((char*)firstkey->ptr, sdslen(firstkey->ptr)); - - // 指向负责处理槽 slot 的节点 + firstkey = thiskey; + slot = thisslot; n = server.cluster->slots[slot]; redisAssertWithInfo(c,firstkey,n != NULL); + /* If we are migrating or importing this slot, we need to check + * if we have all the keys in the request (the only way we + * can safely serve the request, otherwise we return a TRYAGAIN + * error). To do so we set the importing/migrating state and + * increment a counter for every missing key. */ + if (n == myself && + server.cluster->migrating_slots_to[slot] != NULL) + { + migrating_slot = 1; + } else if (server.cluster->importing_slots_from[slot] != NULL) { + importing_slot = 1; + } } else { /* If it is not the first key, make sure it is exactly * the same key as the first we saw. 
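The per-key slot computed above comes from keyHashSlot(), which is not part of this hunk. Assuming the usual rule (CRC16 of the key, or of its {...} hash tag when one is present and non-empty, modulo 16384, with the CRC16-CCITT/XModem variant implemented in crc16.c), a self-contained version looks like this:

    #include <stdio.h>
    #include <string.h>

    /* CRC16-CCITT (XModem): polynomial 0x1021, init 0, no reflection. */
    static unsigned int crc16(const char *buf, int len) {
        unsigned int crc = 0;
        for (int i = 0; i < len; i++) {
            crc ^= (unsigned int)(unsigned char)buf[i] << 8;
            for (int j = 0; j < 8; j++)
                crc = (crc & 0x8000) ? ((crc << 1) ^ 0x1021) & 0xFFFF
                                     : (crc << 1) & 0xFFFF;
        }
        return crc;
    }

    /* Slot rule assumed for keyHashSlot(): hash only the substring between
     * the first '{' and the following '}' when such a non-empty tag exists,
     * otherwise hash the whole key. */
    static unsigned int key_hash_slot(const char *key, int keylen) {
        int s, e;

        for (s = 0; s < keylen; s++) if (key[s] == '{') break;
        if (s == keylen) return crc16(key, keylen) & 16383;
        for (e = s + 1; e < keylen; e++) if (key[e] == '}') break;
        if (e == keylen || e == s + 1) return crc16(key, keylen) & 16383;
        return crc16(key + s + 1, e - s - 1) & 16383;
    }

    int main(void) {
        const char *a = "{user1000}.following", *b = "{user1000}.followers";

        /* Both keys hash to the same slot, so a request naming both of
         * them is not a cross-slot error. */
        printf("%s -> %u\n", a, key_hash_slot(a, (int) strlen(a)));
        printf("%s -> %u\n", b, key_hash_slot(b, (int) strlen(b)));
        return 0;
    }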
*/ - if (!equalStringObjects(firstkey,margv[keyindex[j]])) { - getKeysFreeResult(keyindex); - return NULL; + if (!equalStringObjects(firstkey,thiskey)) { + if (slot != thisslot) { + /* Error: multiple keys from different slots. */ + getKeysFreeResult(keyindex); + if (error_code) + *error_code = REDIS_CLUSTER_REDIR_CROSS_SLOT; + return NULL; + } else { + /* Flag this request as one with multiple different + * keys. */ + multiple_keys = 1; + } } } + + /* Migarting / Improrting slot? Count keys we don't have. */ + if ((migrating_slot || importing_slot) && + lookupKeyRead(&server.db[0],thiskey) == NULL) + { + missing_keys++; + } } getKeysFreeResult(keyindex); } - if (ask) *ask = 0; /* This is the default. Set to 1 if needed later. */ - /* No key at all in command? then we can serve the request - * without redirections. */ - // 这是一个无参数命令,无须转向,直接由本节点处理 - if (n == NULL) return server.cluster->myself; + * without redirections or errors. */ + if (n == NULL) return myself; - // 记录负责处理键的槽 + /* Return the hashslot by reference. */ if (hashslot) *hashslot = slot; /* This request is about a slot we are migrating into another instance? - * Then we need to check if we have the key. If we have it we can reply. - * If instead is a new key, we pass the request to the node that is - * receiving the slot. */ - // 如果负责处理槽 slot 的是本节点 - // 并且这个槽 slot 正在迁移至另一个节点 - // 那么首先检查键 key 是否存在于本节点 - // 如果没有的话,那么键 key 可能已经转移至另一个节点了 - // 要求客户端进行 ASK 临时转向,到另一个节点去查找键 key - if (n == server.cluster->myself && - server.cluster->migrating_slots_to[slot] != NULL) - { - // 在本节点中查找键 key - if (lookupKeyRead(&server.db[0],firstkey) == NULL) { - - // 在本节点没找到键 key - - // 进行 ASK 临时转向 - if (ask) *ask = 1; + * Then if we have all the keys. */ - // 返回转移槽 slot 的目标节点 - return server.cluster->migrating_slots_to[slot]; - } + /* If we don't have all the keys and we are migrating the slot, send + * an ASK redirection. */ + if (migrating_slot && missing_keys) { + if (error_code) *error_code = REDIS_CLUSTER_REDIR_ASK; + return server.cluster->migrating_slots_to[slot]; } - /* Handle the case in which we are receiving this hash slot from - * another instance, so we'll accept the query even if in the table - * it is assigned to a different node, but only if the client - * issued an ASKING command before. */ - // 如果当前客户端正在从另一个节点中导入槽 slot ,并且 - // 1)在接到这个命令之前,客户端先发送了一个 ASKING 命令 - // 2)这个命令是一个带有 REDIS_CMD_ASKING 标识的命令 - // 那么将这个命令的执行者设置为当前节点 - if (server.cluster->importing_slots_from[slot] != NULL && - (c->flags & REDIS_ASKING || cmd->flags & REDIS_CMD_ASKING)) { - return server.cluster->myself; + /* If we are receiving the slot, and the client correctly flagged the + * request as "ASKING", we can serve the request. However if the request + * involves multiple keys and we don't have them all, the only option is + * to send a TRYAGAIN error. */ + if (importing_slot && + (c->flags & REDIS_ASKING || cmd->flags & REDIS_CMD_ASKING)) + { + if (multiple_keys && missing_keys) { + if (error_code) *error_code = REDIS_CLUSTER_REDIR_UNSTABLE; + return NULL; + } else { + return myself; + } } /* Handle the read-only client case reading from a slave: if this @@ -4928,13 +5717,16 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg * is serving, we can reply without redirection. 
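Seen from a cluster-aware client, the ASK and MOVED codes produced here arrive as "-ASK <slot> <ip:port>" and "-MOVED <slot> <ip:port>" error replies. The toy dispatcher below sketches the standard reaction to each; the reply formats and behaviour come from the cluster specification, not from this hunk:

    #include <stdio.h>

    static void handle_redirect(const char *err) {
        int slot;
        char addr[64];

        if (sscanf(err, "MOVED %d %63s", &slot, addr) == 2) {
            /* Permanent move: refresh the cached slot -> node map, then
             * resend the command to the new owner. */
            printf("slot %d moved to %s: update slot map and retry\n",
                   slot, addr);
        } else if (sscanf(err, "ASK %d %63s", &slot, addr) == 2) {
            /* One-shot redirection during resharding: send ASKING to the
             * target, retry the command once, keep the slot map as is. */
            printf("slot %d: ASKING + single retry on %s\n", slot, addr);
        }
    }

    int main(void) {
        handle_redirect("MOVED 3999 127.0.0.1:7002");
        handle_redirect("ASK 3999 127.0.0.1:7005");
        return 0;
    }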
*/ if (c->flags & REDIS_READONLY && cmd->flags & REDIS_CMD_READONLY && - server.cluster->myself->flags & REDIS_NODE_SLAVE && - server.cluster->myself->slaveof == n) + nodeIsSlave(myself) && + myself->slaveof == n) { - return server.cluster->myself; + return myself; } - /* It's not a -ASK case. Base case: just return the right node. */ + /* Base case: just return the right node. However if this node is not + * myself, set error_code to MOVED since we need to issue a rediretion. */ + if (n != myself && error_code) *error_code = REDIS_CLUSTER_REDIR_MOVED; + // 返回负责处理槽 slot 的节点 n return n; } diff --git a/src/cluster.h b/src/cluster.h index f9c5e7a30..9a9c3a35e 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -15,9 +15,6 @@ #define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ // 集群的实际端口号 = 用户指定的端口号 + REDIS_CLUSTER_PORT_INCR #define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ -// IPv6 地址的长度 -#define REDIS_CLUSTER_IPLEN INET6_ADDRSTRLEN /* IPv6 address string length */ - /* The following defines are amunt of time, sometimes expressed as * multiplicators of the node timeout value (when ending with MULT). @@ -35,10 +32,17 @@ #define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ // 在检查从节点数据是否有效时使用的乘法因子 #define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ -// 发送投票请求的间隔时间的乘法因子 -#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */ -// 在执行故障转移之前需要等待的秒数 #define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ +#define REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER 1 +#define REDIS_CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ +#define REDIS_CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */ + +/* Redirection errors returned by getNodeByQuery(). */ +#define REDIS_CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ +#define REDIS_CLUSTER_REDIR_CROSS_SLOT 1 /* Keys in different slots. */ +#define REDIS_CLUSTER_REDIR_UNSTABLE 2 /* Keys in slot resharding. */ +#define REDIS_CLUSTER_REDIR_ASK 3 /* -ASK redirection required. */ +#define REDIS_CLUSTER_REDIR_MOVED 4 /* -MOVED redirection required. */ struct clusterNode; @@ -64,8 +68,7 @@ typedef struct clusterLink { } clusterLink; - -/* Node flags 节点标识*/ +/* Cluster node flags and macros. */ // 该节点为主节点 #define REDIS_NODE_MASTER 1 /* The node is a master */ // 该节点为从节点 @@ -88,6 +91,13 @@ typedef struct clusterLink { // 空名字(在节点为主节点时,用作消息中的 slaveof 属性的值) #define REDIS_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" +#define nodeIsMaster(n) ((n)->flags & REDIS_NODE_MASTER) +#define nodeIsSlave(n) ((n)->flags & REDIS_NODE_SLAVE) +#define nodeInHandshake(n) ((n)->flags & REDIS_NODE_HANDSHAKE) +#define nodeHasAddr(n) (!((n)->flags & REDIS_NODE_NOADDR)) +#define nodeWithoutAddr(n) ((n)->flags & REDIS_NODE_NOADDR) +#define nodeTimedOut(n) ((n)->flags & REDIS_NODE_PFAIL) +#define nodeFailed(n) ((n)->flags & REDIS_NODE_FAIL) /* This structure represent elements of node->fail_reports. */ // 每个 clusterNodeFailReport 结构保存了一条其他节点对目标节点的下线报告 @@ -233,14 +243,20 @@ typedef struct clusterState { // 如果值为 1 ,表示本节点已经向其他节点发送了投票请求 int failover_auth_sent; /* True if we already asked for votes. */ - // 集群当前进行选举的配置纪元 + int failover_auth_rank; /* This slave rank for current auth request. */ uint64_t failover_auth_epoch; /* Epoch of the current election. */ - + /* Manual failover state in common. 
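The nodeIsMaster()/nodeIsSlave()/nodeTimedOut()/nodeFailed() helpers used throughout these hunks are the plain bit tests defined above in cluster.h. A standalone demo, with flag values matching the REDIS_NODE_* constants:

    #include <stdio.h>

    #define REDIS_NODE_MASTER 1
    #define REDIS_NODE_SLAVE  2
    #define REDIS_NODE_PFAIL  4
    #define REDIS_NODE_FAIL   8

    #define nodeIsMaster(n)  ((n)->flags & REDIS_NODE_MASTER)
    #define nodeIsSlave(n)   ((n)->flags & REDIS_NODE_SLAVE)
    #define nodeTimedOut(n)  ((n)->flags & REDIS_NODE_PFAIL)
    #define nodeFailed(n)    ((n)->flags & REDIS_NODE_FAIL)

    struct toy_node { int flags; };  /* stand-in for clusterNode */

    int main(void) {
        struct toy_node n = { REDIS_NODE_MASTER | REDIS_NODE_PFAIL };

        printf("master=%d slave=%d pfail=%d fail=%d\n",
               !!nodeIsMaster(&n), !!nodeIsSlave(&n),
               !!nodeTimedOut(&n), !!nodeFailed(&n));
        return 0;
    }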
*/ + mstime_t mf_end; /* Manual failover time limit (ms unixtime). + It is zero if there is no MF in progress. */ + /* Manual failover state of master. */ + clusterNode *mf_slave; /* Slave performing the manual failover. */ + /* Manual failover state of slave. */ + long long mf_master_offset; /* Master offset the slave needs to start MF + or zero if stil not received. */ + int mf_can_start; /* If non-zero signal that the manual failover + can start requesting masters vote. */ /* The followign fields are uesd by masters to take state on elections. */ - // 以下一个域是主节点在进行故障迁移投票时使用的域 - - // 节点最后投票的配置纪元 - uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ + uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */ // 在进入下个事件循环之前要做的事情,以各个 flag 来记录 int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ @@ -287,6 +303,7 @@ typedef struct clusterState { #define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you have my vote */ // 槽布局已经发生变化,消息发送者要求消息接收者进行相应的更新 #define CLUSTERMSG_TYPE_UPDATE 7 /* Another node slots configuration */ +#define CLUSTERMSG_TYPE_MFSTART 8 /* Pause clients for manual failover */ /* Initially we don't know our "name", but we'll find it once we connect * to the first node, using the getsockname() function. Then we'll use this @@ -306,7 +323,7 @@ typedef struct { uint32_t pong_received; // 节点的 IP 地址 - char ip[16]; /* IP address last time it was seen */ + char ip[REDIS_IP_STR_LEN]; /* IP address last time it was seen */ // 节点的端口号 uint16_t port; /* port last time it was seen */ @@ -381,9 +398,11 @@ union clusterMsgData { // 用来表示集群消息的结构(消息头,header) typedef struct { - + char sig[4]; /* Siganture "RCmb" (Redis Cluster message bus). */ // 消息的长度(包括这个消息头的长度和消息正文的长度) uint32_t totlen; /* Total length of this message */ + uint16_t ver; /* Protocol version, currently set to 0. */ + uint16_t notused0; /* 2 bytes not used. */ // 消息的类型 uint16_t type; /* Message type */ @@ -427,7 +446,7 @@ typedef struct { // 消息发送者所处集群的状态 unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char notused2[3]; /* Reserved for future use. For alignment. */ + unsigned char mflags[3]; /* Message flags: CLUSTERMSG_FLAG[012]_... */ // 消息的正文(或者说,内容) union clusterMsgData data; @@ -436,6 +455,12 @@ typedef struct { #define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) +/* Message flags better specify the packet content or are used to + * provide some information about the node state. */ +#define CLUSTERMSG_FLAG0_PAUSED (1<<0) /* Master paused for manual failover. */ +#define CLUSTERMSG_FLAG0_FORCEACK (1<<1) /* Give ACK to AUTH_REQUEST even if + master is up. 
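Every cluster bus packet now starts with the 4-byte signature "RCmb" and carries a protocol version that this code sets to 0. How the receiving side validates these fields is not shown in this hunk, so the check below is only an illustrative sketch on a toy header, not the real clusterMsg layout:

    #include <stdio.h>
    #include <string.h>

    struct toy_hdr {
        char sig[4];        /* "RCmb" (Redis Cluster message bus) */
        unsigned short ver; /* protocol version, currently 0 */
    };

    static int header_looks_sane(const struct toy_hdr *h) {
        return memcmp(h->sig, "RCmb", 4) == 0 && h->ver == 0;
    }

    int main(void) {
        struct toy_hdr good = { {'R','C','m','b'}, 0 };
        struct toy_hdr bad  = { {'X','X','X','X'}, 0 };

        printf("good: %d, bad: %d\n", header_looks_sane(&good),
                                      header_looks_sane(&bad));
        return 0;
    }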
*/ + /* ---------------------- API exported outside cluster.c -------------------- */ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); diff --git a/src/config.c b/src/config.c index e9ce386c5..96f3628f6 100644 --- a/src/config.c +++ b/src/config.c @@ -134,6 +134,11 @@ void loadServerConfigFromString(char *config) { if (server.port < 0 || server.port > 65535) { err = "Invalid port"; goto loaderr; } + } else if (!strcasecmp(argv[0],"tcp-backlog") && argc == 2) { + server.tcp_backlog = atoi(argv[1]); + if (server.tcp_backlog < 0) { + err = "Invalid backlog value"; goto loaderr; + } } else if (!strcasecmp(argv[0],"bind") && argc >= 2) { int j, addresses = argc-1; @@ -393,6 +398,8 @@ void loadServerConfigFromString(char *config) { server.zset_max_ziplist_entries = memtoll(argv[1], NULL); } else if (!strcasecmp(argv[0],"zset-max-ziplist-value") && argc == 2) { server.zset_max_ziplist_value = memtoll(argv[1], NULL); + } else if (!strcasecmp(argv[0],"hll-sparse-max-bytes") && argc == 2) { + server.hll_sparse_max_bytes = memtoll(argv[1], NULL); } else if (!strcasecmp(argv[0],"rename-command") && argc == 3) { struct redisCommand *cmd = lookupCommand(argv[1]); int retval; @@ -429,6 +436,14 @@ void loadServerConfigFromString(char *config) { if (server.cluster_node_timeout <= 0) { err = "cluster node timeout must be 1 or greater"; goto loaderr; } + } else if (!strcasecmp(argv[0],"cluster-migration-barrier") + && argc == 2) + { + server.cluster_migration_barrier = atoi(argv[1]); + if (server.cluster_migration_barrier < 0) { + err = "cluster migration barrier must be positive"; + goto loaderr; + } } else if (!strcasecmp(argv[0],"lua-time-limit") && argc == 2) { server.lua_time_limit = strtoll(argv[1],NULL,10); } else if (!strcasecmp(argv[0],"slowlog-log-slower-than") && @@ -612,7 +627,7 @@ void configSetCommand(redisClient *c) { } else if (!strcasecmp(c->argv[2]->ptr,"maxclients")) { int orig_value = server.maxclients; - if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; + if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 1) goto badfmt; /* Try to check if the OS is capable of supporting so many FDs. 
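The new numeric directives follow the same parse-and-validate pattern as the existing ones: convert the value, range-check it, and abort loading with an error string on failure. A self-contained imitation of that pattern, covering only the two directives and limits visible in this hunk:

    #include <stdio.h>
    #include <stdlib.h>
    #include <strings.h>

    /* Returns NULL when the directive is accepted, or the error message
     * that would abort config loading otherwise. */
    static const char *check_directive(const char *name, const char *value) {
        int v = atoi(value);

        if (!strcasecmp(name, "tcp-backlog") && v < 0)
            return "Invalid backlog value";
        if (!strcasecmp(name, "cluster-migration-barrier") && v < 0)
            return "cluster migration barrier must be positive";
        return NULL;
    }

    int main(void) {
        printf("tcp-backlog 511 -> %s\n",
               check_directive("tcp-backlog", "511") ? "rejected"
                                                     : "accepted");
        printf("cluster-migration-barrier -1 -> %s\n",
               check_directive("cluster-migration-barrier", "-1") ? "rejected"
                                                                  : "accepted");
        return 0;
    }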
*/ server.maxclients = ll; @@ -777,6 +792,9 @@ void configSetCommand(redisClient *c) { } else if (!strcasecmp(c->argv[2]->ptr,"zset-max-ziplist-value")) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; server.zset_max_ziplist_value = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"hll-sparse-max-bytes")) { + if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; + server.hll_sparse_max_bytes = ll; } else if (!strcasecmp(c->argv[2]->ptr,"lua-time-limit")) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; server.lua_time_limit = ll; @@ -900,6 +918,10 @@ void configSetCommand(redisClient *c) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll <= 0) goto badfmt; server.cluster_node_timeout = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"cluster-migration-barrier")) { + if (getLongLongFromObject(o,&ll) == REDIS_ERR || + ll < 0) goto badfmt; + server.cluster_migration_barrier = ll; } else { addReplyErrorFormat(c,"Unsupported CONFIG parameter: %s", (char*)c->argv[2]->ptr); @@ -982,12 +1004,15 @@ void configGetCommand(redisClient *c) { server.zset_max_ziplist_entries); config_get_numerical_field("zset-max-ziplist-value", server.zset_max_ziplist_value); + config_get_numerical_field("hll-sparse-max-bytes", + server.hll_sparse_max_bytes); config_get_numerical_field("lua-time-limit",server.lua_time_limit); config_get_numerical_field("slowlog-log-slower-than", server.slowlog_log_slower_than); config_get_numerical_field("slowlog-max-len", server.slowlog_max_len); config_get_numerical_field("port",server.port); + config_get_numerical_field("tcp-backlog",server.tcp_backlog); config_get_numerical_field("databases",server.dbnum); config_get_numerical_field("repl-ping-slave-period",server.repl_ping_slave_period); config_get_numerical_field("repl-timeout",server.repl_timeout); @@ -1000,6 +1025,7 @@ void configGetCommand(redisClient *c) { config_get_numerical_field("min-slaves-max-lag",server.repl_min_slaves_max_lag); config_get_numerical_field("hz",server.hz); config_get_numerical_field("cluster-node-timeout",server.cluster_node_timeout); + config_get_numerical_field("cluster-migration-barrier",server.cluster_migration_barrier); /* Bool (yes/no) values */ config_get_bool_field("no-appendfsync-on-rewrite", @@ -1467,7 +1493,7 @@ void rewriteConfigSaveOption(struct rewriteConfigState *state) { * resulting into no RDB persistence as expected. */ for (j = 0; j < server.saveparamslen; j++) { line = sdscatprintf(sdsempty(),"save %ld %d", - server.saveparams[j].seconds, server.saveparams[j].changes); + (long) server.saveparams[j].seconds, server.saveparams[j].changes); rewriteConfigRewriteLine(state,"save",line,1); } /* Mark "save" as processed in case server.saveparamslen is zero. 
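The (long) cast added to the save-rule rewrite is the portable way to print that field: the seconds value is a time_t, which has no printf conversion of its own, so it is cast before being handed to "%ld". A minimal demonstration of the idiom:

    #include <stdio.h>
    #include <time.h>

    int main(void) {
        time_t seconds = 900;   /* e.g. the "save 900 1" rule */
        int changes = 1;

        /* time_t may be int, long or long long depending on the platform,
         * so cast it explicitly before using the %ld conversion. */
        printf("save %ld %d\n", (long) seconds, changes);
        return 0;
    }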
*/ @@ -1707,6 +1733,7 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"daemonize",server.daemonize,0); rewriteConfigStringOption(state,"pidfile",server.pidfile,REDIS_DEFAULT_PID_FILE); rewriteConfigNumericalOption(state,"port",server.port,REDIS_SERVERPORT); + rewriteConfigNumericalOption(state,"tcp-backlog",server.tcp_backlog,REDIS_TCP_BACKLOG); rewriteConfigBindOption(state); rewriteConfigStringOption(state,"unixsocket",server.unixsocket,NULL); rewriteConfigOctalOption(state,"unixsocketperm",server.unixsocketperm,REDIS_DEFAULT_UNIX_SOCKET_PERM); @@ -1767,6 +1794,7 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"cluster-enabled",server.cluster_enabled,0); rewriteConfigStringOption(state,"cluster-config-file",server.cluster_configfile,REDIS_DEFAULT_CLUSTER_CONFIG_FILE); rewriteConfigNumericalOption(state,"cluster-node-timeout",server.cluster_node_timeout,REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT); + rewriteConfigNumericalOption(state,"cluster-migration-barrier",server.cluster_migration_barrier,REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER); rewriteConfigNumericalOption(state,"slowlog-log-slower-than",server.slowlog_log_slower_than,REDIS_SLOWLOG_LOG_SLOWER_THAN); rewriteConfigNumericalOption(state,"slowlog-max-len",server.slowlog_max_len,REDIS_SLOWLOG_MAX_LEN); rewriteConfigNotifykeyspaceeventsOption(state); @@ -1777,6 +1805,7 @@ int rewriteConfig(char *path) { rewriteConfigNumericalOption(state,"set-max-intset-entries",server.set_max_intset_entries,REDIS_SET_MAX_INTSET_ENTRIES); rewriteConfigNumericalOption(state,"zset-max-ziplist-entries",server.zset_max_ziplist_entries,REDIS_ZSET_MAX_ZIPLIST_ENTRIES); rewriteConfigNumericalOption(state,"zset-max-ziplist-value",server.zset_max_ziplist_value,REDIS_ZSET_MAX_ZIPLIST_VALUE); + rewriteConfigNumericalOption(state,"hll-sparse-max-bytes",server.hll_sparse_max_bytes,REDIS_DEFAULT_HLL_SPARSE_MAX_BYTES); rewriteConfigYesNoOption(state,"activerehashing",server.activerehashing,REDIS_DEFAULT_ACTIVE_REHASHING); rewriteConfigClientoutputbufferlimitOption(state); rewriteConfigNumericalOption(state,"hz",server.hz,REDIS_DEFAULT_HZ); @@ -1811,14 +1840,7 @@ void configCommand(redisClient *c) { configGetCommand(c); } else if (!strcasecmp(c->argv[1]->ptr,"resetstat")) { if (c->argc != 2) goto badarity; - server.stat_keyspace_hits = 0; - server.stat_keyspace_misses = 0; - server.stat_numcommands = 0; - server.stat_numconnections = 0; - server.stat_expiredkeys = 0; - server.stat_rejected_conn = 0; - server.stat_fork_time = 0; - server.aof_delayed_fsync = 0; + resetServerStats(); resetCommandTableStats(); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"rewrite")) { @@ -1828,8 +1850,10 @@ void configCommand(redisClient *c) { return; } if (rewriteConfig(server.configfile) == -1) { + redisLog(REDIS_WARNING,"CONFIG REWRITE failed: %s", strerror(errno)); addReplyErrorFormat(c,"Rewriting config file: %s", strerror(errno)); } else { + redisLog(REDIS_WARNING,"CONFIG REWRITE executed with success."); addReply(c,shared.ok); } } else { diff --git a/src/config.h b/src/config.h index 9f2baaa1f..8041f7ebe 100644 --- a/src/config.h +++ b/src/config.h @@ -187,7 +187,7 @@ void setproctitle(const char *fmt, ...); #if (__i386 || __amd64) && __GNUC__ #define GNUC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#if GNUC_VERSION >= 40100 +#if (GNUC_VERSION >= 40100) || defined(__clang__) #define HAVE_ATOMIC #endif #endif diff --git a/src/db.c b/src/db.c index be96e3b05..a38707c57 100644 --- a/src/db.c +++ b/src/db.c @@ 
-63,7 +63,7 @@ robj *lookupKey(redisDb *db, robj *key) { * a copy on write madness. */ // 更新时间信息(只在不存在子进程时执行,防止破坏 copy-on-write 机制) if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) - val->lru = server.lruclock; + val->lru = LRU_CLOCK(); // 返回值 return val; @@ -194,8 +194,7 @@ void dbAdd(redisDb *db, robj *key, robj *val) { * 如果键不存在,那么函数停止。 */ void dbOverwrite(redisDb *db, robj *key, robj *val) { - - struct dictEntry *de = dictFind(db->dict,key->ptr); + dictEntry *de = dictFind(db->dict,key->ptr); // 节点必须存在,否则中止 redisAssertWithInfo(NULL,key,de != NULL); @@ -257,7 +256,7 @@ int dbExists(redisDb *db, robj *key) { * 这个函数保证被返回的键都是未过期的。 */ robj *dbRandomKey(redisDb *db) { - struct dictEntry *de; + dictEntry *de; while(1) { sds key; @@ -311,6 +310,44 @@ int dbDelete(redisDb *db, robj *key) { } } +/* Prepare the string object stored at 'key' to be modified destructively + * to implement commands like SETBIT or APPEND. + * + * An object is usually ready to be modified unless one of the two conditions + * are true: + * + * 1) The object 'o' is shared (refcount > 1), we don't want to affect + * other users. + * 2) The object encoding is not "RAW". + * + * If the object is found in one of the above conditions (or both) by the + * function, an unshared / not-encoded copy of the string object is stored + * at 'key' in the specified 'db'. Otherwise the object 'o' itself is + * returned. + * + * USAGE: + * + * The object 'o' is what the caller already obtained by looking up 'key' + * in 'db', the usage pattern looks like this: + * + * o = lookupKeyWrite(db,key); + * if (checkType(c,o,REDIS_STRING)) return; + * o = dbUnshareStringValue(db,key,o); + * + * At this point the caller is ready to modify the object, for example + * using an sdscat() call to append some data, or anything else. + */ +robj *dbUnshareStringValue(redisDb *db, robj *key, robj *o) { + redisAssert(o->type == REDIS_STRING); + if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) { + robj *decoded = getDecodedObject(o); + o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); + decrRefCount(decoded); + dbOverwrite(db,key,o); + } + return o; +} + /* * 清空服务器的所有数据。 */ @@ -442,6 +479,9 @@ void delCommand(redisClient *c) { // 遍历所有输入键 for (j = 1; j < c->argc; j++) { + // 先删除过期的键 + expireIfNeeded(c->db,c->argv[j]); + // 尝试删除键 if (dbDelete(c->db,c->argv[j])) { @@ -879,11 +919,13 @@ void shutdownCommand(redisClient *c) { } } - /* SHUTDOWN can be called even while the server is in "loading" state. - * When this happens we need to make sure no attempt is performed to save + /* When SHUTDOWN is called while the server is loading a dataset in + * memory we need to make sure no attempt is performed to save * the dataset on shutdown (otherwise it could overwrite the current DB - * with half-read data). */ - if (server.loading) + * with half-read data). + * + * Also when in Sentinel mode clear the SAVE flag and force NOSAVE. 
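dbUnshareStringValue() above is a small copy-on-write step: commands such as APPEND and SETBIT must never write into an object that other references can observe. The toy program below shows the same idea with a stand-in struct rather than the real robj:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        int refcount;
        char *buf;
    } toyobj;

    /* If the object is shared, install a private copy in the keyspace slot
     * so in-place writes cannot be seen by the other users. The real
     * dbUnshareStringValue() additionally re-creates values that are not
     * RAW encoded. */
    static toyobj *unshare(toyobj **slot) {
        toyobj *o = *slot;

        if (o->refcount > 1) {
            toyobj *copy = malloc(sizeof(*copy));
            copy->refcount = 1;
            copy->buf = strdup(o->buf);
            o->refcount--;      /* the original keeps its remaining users */
            *slot = copy;       /* the key now points at the private copy */
            o = copy;
        }
        return o;
    }

    int main(void) {
        toyobj shared = { 2, "hello" };  /* value referenced from two places */
        toyobj *slot = &shared;

        toyobj *writable = unshare(&slot);
        printf("modifiable copy: \"%s\" (refcount=%d)\n",
               writable->buf, writable->refcount);
        return 0;
    }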
*/ + if (server.loading || server.sentinel_mode) flags = (flags & ~REDIS_SHUTDOWN_SAVE) | REDIS_SHUTDOWN_NOSAVE; if (prepareForShutdown(flags) == REDIS_OK) exit(0); @@ -1134,7 +1176,8 @@ void propagateExpire(redisDb *db, robj *key) { int expireIfNeeded(redisDb *db, robj *key) { // 取出键的过期时间 - long long when = getExpire(db,key); + mstime_t when = getExpire(db,key); + mstime_t now; // 没有过期时间 if (when < 0) return 0; /* No expire for this key */ @@ -1143,6 +1186,13 @@ int expireIfNeeded(redisDb *db, robj *key) { // 如果服务器正在进行载入,那么不进行任何过期检查 if (server.loading) return 0; + /* If we are in the context of a Lua script, we claim that time is + * blocked to when the Lua script started. This way a key can expire + * only the first time it is accessed and not in the middle of the + * script execution, making propagation to slaves / AOF consistent. + * See issue #1525 on Github for more information. */ + now = server.lua_caller ? server.lua_time_start : mstime(); + /* If we are running in the context of a slave, return ASAP: * the slave key expiration is controlled by the master that will * send us synthesized DEL operations for expired keys. @@ -1155,15 +1205,13 @@ int expireIfNeeded(redisDb *db, robj *key) { // 它只返回一个逻辑上正确的返回值 // 真正的删除操作要等待主节点发来删除命令时才执行 // 从而保证数据的同步 - if (server.masterhost != NULL) { - return mstime() > when; - } + if (server.masterhost != NULL) return now > when; // 运行到这里,表示键带有过期时间,并且服务器为主节点 /* Return when this key has not expired */ // 如果未过期,返回 0 - if (mstime() <= when) return 0; + if (now <= when) return 0; /* Delete the key */ server.stat_expiredkeys++; @@ -1365,6 +1413,8 @@ void persistCommand(redisClient *c) { * API to get key arguments from commands * ---------------------------------------------------------------------------*/ +/* The base case is to use the keys position as given in the command table + * (firstkey, lastkey, step). */ int *getKeysUsingCommandTable(struct redisCommand *cmd,robj **argv, int argc, int *numkeys) { int j, i = 0, last, *keys; REDIS_NOTUSED(argv); @@ -1384,42 +1434,65 @@ int *getKeysUsingCommandTable(struct redisCommand *cmd,robj **argv, int argc, in return keys; } -int *getKeysFromCommand(struct redisCommand *cmd,robj **argv, int argc, int *numkeys, int flags) { +/* Return all the arguments that are keys in the command passed via argc / argv. + * + * The command returns the positions of all the key arguments inside the array, + * so the actual return value is an heap allocated array of integers. The + * length of the array is returned by reference into *numkeys. + * + * 'cmd' must be point to the corresponding entry into the redisCommand + * table, according to the command name in argv[0]. + * + * This function uses the command table if a command-specific helper function + * is not required, otherwise it calls the command-specific function. */ +int *getKeysFromCommand(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) { if (cmd->getkeys_proc) { - return cmd->getkeys_proc(cmd,argv,argc,numkeys,flags); + return cmd->getkeys_proc(cmd,argv,argc,numkeys); } else { return getKeysUsingCommandTable(cmd,argv,argc,numkeys); } } +/* Free the result of getKeysFromCommand. */ void getKeysFreeResult(int *result) { zfree(result); } -int *noPreloadGetKeys(struct redisCommand *cmd,robj **argv, int argc, int *numkeys, int flags) { - if (flags & REDIS_GETKEYS_PRELOAD) { +/* Helper function to extract keys from following commands: + * ZUNIONSTORE ... + * ZINTERSTORE ... 
*/ +int *zunionInterGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) { + int i, num, *keys; + REDIS_NOTUSED(cmd); + + num = atoi(argv[2]->ptr); + /* Sanity check. Don't return any key if the command is going to + * reply with syntax error. */ + if (num > (argc-3)) { *numkeys = 0; return NULL; - } else { - return getKeysUsingCommandTable(cmd,argv,argc,numkeys); } -} -int *renameGetKeys(struct redisCommand *cmd,robj **argv, int argc, int *numkeys, int flags) { - if (flags & REDIS_GETKEYS_PRELOAD) { - int *keys = zmalloc(sizeof(int)); - *numkeys = 1; - keys[0] = 1; - return keys; - } else { - return getKeysUsingCommandTable(cmd,argv,argc,numkeys); - } + /* Keys in z{union,inter}store come from two places: + * argv[1] = storage key, + * argv[3...n] = keys to intersect */ + keys = zmalloc(sizeof(int)*(num+1)); + + /* Add all key positions for argv[3...n] to keys[] */ + for (i = 0; i < num; i++) keys[i] = 3+i; + + /* Finally add the argv[1] key position (the storage key target). */ + keys[num] = 1; + *numkeys = num+1; /* Total keys = {union,inter} keys + storage key */ + return keys; } -int *zunionInterGetKeys(struct redisCommand *cmd,robj **argv, int argc, int *numkeys, int flags) { +/* Helper function to extract keys from the following commands: + * EVAL
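As a worked example of the argument layout handled by zunionInterGetKeys() above, ZUNIONSTORE dest 2 k1 k2 WEIGHTS 1 2 yields the key positions 3 and 4 (the source keys) plus 1 (the destination), while the WEIGHTS arguments are ignored. The stand-alone program below reproduces that extraction:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
        /* ZUNIONSTORE dest 2 k1 k2 WEIGHTS 1 2 */
        const char *argv[] = { "ZUNIONSTORE", "dest", "2", "k1", "k2",
                               "WEIGHTS", "1", "2" };
        int argc = (int)(sizeof(argv)/sizeof(argv[0]));
        int num = atoi(argv[2]);
        int i;

        if (num > argc - 3) return 1;   /* would be a syntax error: no keys */

        printf("key positions:");
        for (i = 0; i < num; i++) printf(" %d", 3 + i);  /* source keys */
        printf(" %d\n", 1);                              /* destination key */
        return 0;
    }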