Skip to content
Draft
45 changes: 45 additions & 0 deletions .github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Test Flaky Cluster Slot Migration

# Stress-runs a single cluster slot-migration test under valgrind many
# times in a row to measure its flakiness rate.
on:
  workflow_dispatch:
  push:

jobs:
  test-flaky:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      - name: make
        run: make valgrind SERVER_CFLAGS='-Werror'
      - name: testprep
        run: |
          sudo apt-get update
          sudo apt-get install tcl8.6 tclx valgrind -y
      - name: Run test 100 times
        run: |
          # Single source of truth for the repetition count, so the loop
          # bound and the summary arithmetic below cannot drift apart
          # (the original hard-coded 100 in four separate places).
          TOTAL=100
          FAILURES=0
          SUCCESSES=0
          # seq is POSIX-portable; {1..N} brace expansion is bash-only.
          for i in $(seq 1 "$TOTAL"); do
            echo "========================================="
            echo "Run $i of $TOTAL"
            echo "========================================="
            if ./runtest --valgrind --no-latency --verbose --clients 1 --timeout 2400 --single unit/cluster/test-flaky-migrate-slots; then
              SUCCESSES=$((SUCCESSES + 1))
              echo "✓ Run $i: PASSED"
            else
              FAILURES=$((FAILURES + 1))
              echo "✗ Run $i: FAILED"
            fi
          done
          echo "========================================="
          echo "SUMMARY"
          echo "========================================="
          echo "Total runs: $TOTAL"
          echo "Successes: $SUCCESSES"
          echo "Failures: $FAILURES"
          # Integer percentage; exact when TOTAL is 100.
          echo "Failure rate: $((FAILURES * 100 / TOTAL))%"
          if [ $FAILURES -gt 0 ]; then
            echo "Test is FLAKY - failed $FAILURES out of $TOTAL runs"
            exit 1
          fi

11 changes: 11 additions & 0 deletions tests/support/cluster_util.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -458,3 +458,14 @@ proc wait_for_slot_state {srv_idx pattern} {
fail "incorrect slot state on R $srv_idx: expected $pattern; got [get_open_slots $srv_idx]"
}
}

# Return 1 if the node table of server_a contains an entry whose id
# equals node_b_id, otherwise 0. Used to wait until gossip has
# propagated knowledge of a newly added node.
proc server_knows_node {server_a node_b_id} {
    foreach node_info [get_cluster_nodes $server_a] {
        if {[string equal [dict get $node_info id] $node_b_id]} {
            return 1
        }
    }
    return 0
}
11 changes: 11 additions & 0 deletions tests/unit/cluster/cli.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,17 @@ test {Migrate the last slot away from a node using valkey-cli} {
set owner_r [valkey $owner_host $owner_port 0 $::tls]
set owner_id [$owner_r CLUSTER MYID]

# Wait until the owner node knows the new node. The condition passed
# to wait_for_condition is evaluated as a single expression, so a
# multi-statement script (set/foreach) is not valid there; use the
# server_knows_node helper instead, which folds the lookup into one
# bracketed call.
# NOTE(review): the owner's cluster ID is passed where other callers
# of get_cluster_nodes use an srv index — confirm the helper accepts
# an ID here, or resolve the owner's srv index before the wait.
wait_for_condition 1000 50 {
    [server_knows_node $owner_id $newnode_id]
} else {
    fail "Owner node does not know the new node yet"
}

# Move slot to new node using plain commands
assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id]
assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id]
Expand Down
113 changes: 113 additions & 0 deletions tests/unit/cluster/test-flaky-migrate-slots.tcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Base server configuration shared by all four nodes in this test.
set base_conf [list cluster-enabled yes cluster-node-timeout 1000 cluster-databases 16]

test {Migrate the last slot away from a node using valkey-cli} {
    start_multiple_servers 4 [list overrides $base_conf] {

        # Create a cluster of 3 nodes
        exec src/valkey-cli --cluster-yes --cluster create \
            127.0.0.1:[srv 0 port] \
            127.0.0.1:[srv -1 port] \
            127.0.0.1:[srv -2 port]

        wait_for_condition 1000 50 {
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        # Insert some data
        assert_equal OK [exec src/valkey-cli -c -p [srv 0 port] SET foo bar]
        set slot [exec src/valkey-cli -c -p [srv 0 port] CLUSTER KEYSLOT foo]

        # Add new node to the cluster
        exec src/valkey-cli --cluster-yes --cluster add-node \
            127.0.0.1:[srv -3 port] \
            127.0.0.1:[srv 0 port]

        # First we wait for new node to be recognized by entire cluster
        wait_for_cluster_size 4

        wait_for_condition 1000 50 {
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        set newnode_r [valkey_client -3]
        set newnode_id [$newnode_r CLUSTER MYID]

        # Find out which node has the key "foo" by asking the new node for a
        # redirect.
        catch { $newnode_r get foo } e
        assert_match "MOVED $slot *" $e
        lassign [split [lindex $e 2] :] owner_host owner_port
        set owner_r [valkey $owner_host $owner_port 0 $::tls]
        set owner_id [$owner_r CLUSTER MYID]

        # Wait until the owner node knows the new node. The condition
        # given to wait_for_condition is evaluated as a single
        # expression, so the previous multi-statement set/foreach script
        # was not valid there; use the server_knows_node helper, which
        # reduces the lookup to one bracketed call.
        # NOTE(review): owner_id is a cluster ID while other
        # get_cluster_nodes callers pass an srv index — confirm the
        # helper accepts an ID here.
        wait_for_condition 1000 50 {
            [server_knows_node $owner_id $newnode_id]
        } else {
            fail "Owner node does not know the new node yet"
        }

        # Move slot to new node using plain commands
        assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id]
        assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id]
        assert_equal {foo} [$owner_r CLUSTER GETKEYSINSLOT $slot 10]
        assert_equal OK [$owner_r MIGRATE 127.0.0.1 [srv -3 port] "" 0 5000 KEYS foo]
        assert_equal OK [$newnode_r CLUSTER SETSLOT $slot NODE $newnode_id]
        assert_equal OK [$owner_r CLUSTER SETSLOT $slot NODE $newnode_id]

        # Using --cluster check make sure we won't get `Not all slots are covered by nodes`.
        # Wait for the cluster to become stable make sure the cluster is up during MIGRATE.
        wait_for_condition 1000 50 {
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 &&
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 &&
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 &&
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 &&
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        # Move the only slot back to original node using valkey-cli
        exec src/valkey-cli --cluster reshard 127.0.0.1:[srv -3 port] \
            --cluster-from $newnode_id \
            --cluster-to $owner_id \
            --cluster-slots 1 \
            --cluster-yes

        # The empty node will become a replica of the new owner before the
        # `MOVED` check, so let's wait for the cluster to become stable.
        wait_for_condition 1000 50 {
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        # Check that the key foo has been migrated back to the original owner.
        catch { $newnode_r get foo } e
        assert_equal "MOVED $slot $owner_host:$owner_port" $e

        # Check that the now empty primary node doesn't turn itself into
        # a replica of any other nodes
        wait_for_cluster_propagation
        assert_match *master* [$owner_r role]
    }
}
Loading