Skip to content
Draft
45 changes: 45 additions & 0 deletions .github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Test Flaky Cluster Slot Migration

# Stress-runs a single cluster slot-migration test under valgrind many
# times in a row to measure its flakiness rate.
on:
  workflow_dispatch:
  push:

jobs:
  test-flaky:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      - name: make
        run: make valgrind SERVER_CFLAGS='-Werror'
      - name: testprep
        run: |
          sudo apt-get update
          sudo apt-get install tcl8.6 tclx valgrind -y
      - name: Run test 100 times
        run: |
          # Single source of truth for the repetition count, so the loop
          # bound and the summary arithmetic below cannot drift apart
          # (the original hard-coded 100 in four separate places).
          TOTAL=100
          FAILURES=0
          SUCCESSES=0
          # seq is POSIX-portable; {1..N} brace expansion is bash-only.
          for i in $(seq 1 "$TOTAL"); do
            echo "========================================="
            echo "Run $i of $TOTAL"
            echo "========================================="
            if ./runtest --valgrind --no-latency --verbose --clients 1 --timeout 2400 --single unit/cluster/test-flaky-migrate-slots; then
              SUCCESSES=$((SUCCESSES + 1))
              echo "✓ Run $i: PASSED"
            else
              FAILURES=$((FAILURES + 1))
              echo "✗ Run $i: FAILED"
            fi
          done
          echo "========================================="
          echo "SUMMARY"
          echo "========================================="
          echo "Total runs: $TOTAL"
          echo "Successes: $SUCCESSES"
          echo "Failures: $FAILURES"
          # Integer percentage; exact when TOTAL is 100.
          echo "Failure rate: $((FAILURES * 100 / TOTAL))%"
          if [ $FAILURES -gt 0 ]; then
            echo "Test is FLAKY - failed $FAILURES out of $TOTAL runs"
            exit 1
          fi

11 changes: 11 additions & 0 deletions tests/support/cluster_util.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -458,3 +458,14 @@ proc wait_for_slot_state {srv_idx pattern} {
fail "incorrect slot state on R $srv_idx: expected $pattern; got [get_open_slots $srv_idx]"
}
}

# Return 1 if the node table of server_a contains an entry whose id
# equals node_b_id, otherwise 0. Used to wait until gossip has
# propagated knowledge of a newly added node.
proc server_knows_node {server_a node_b_id} {
    foreach node_info [get_cluster_nodes $server_a] {
        if {[string equal [dict get $node_info id] $node_b_id]} {
            return 1
        }
    }
    return 0
}
11 changes: 11 additions & 0 deletions tests/unit/cluster/cli.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,17 @@ test {Migrate the last slot away from a node using valkey-cli} {
set owner_r [valkey $owner_host $owner_port 0 $::tls]
set owner_id [$owner_r CLUSTER MYID]

# Wait until the owner node knows the new node. The condition passed
# to wait_for_condition is evaluated as a single expression, so a
# multi-statement script (set/foreach) is not valid there; use the
# server_knows_node helper instead, which folds the lookup into one
# bracketed call.
# NOTE(review): the owner's cluster ID is passed where other callers
# of get_cluster_nodes use an srv index — confirm the helper accepts
# an ID here, or resolve the owner's srv index before the wait.
wait_for_condition 1000 50 {
    [server_knows_node $owner_id $newnode_id]
} else {
    fail "Owner node does not know the new node yet"
}

# Move slot to new node using plain commands
assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id]
assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id]
Expand Down
113 changes: 113 additions & 0 deletions tests/unit/cluster/test-flaky-migrate-slots.tcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Base server configuration shared by all four nodes in this test.
set base_conf [list cluster-enabled yes cluster-node-timeout 1000 cluster-databases 16]

test {Migrate the last slot away from a node using valkey-cli} {
    start_multiple_servers 4 [list overrides $base_conf] {

        # Create a cluster of 3 nodes
        exec src/valkey-cli --cluster-yes --cluster create \
            127.0.0.1:[srv 0 port] \
            127.0.0.1:[srv -1 port] \
            127.0.0.1:[srv -2 port]

        wait_for_condition 1000 50 {
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        # Insert some data
        assert_equal OK [exec src/valkey-cli -c -p [srv 0 port] SET foo bar]
        set slot [exec src/valkey-cli -c -p [srv 0 port] CLUSTER KEYSLOT foo]

        # Add new node to the cluster
        exec src/valkey-cli --cluster-yes --cluster add-node \
            127.0.0.1:[srv -3 port] \
            127.0.0.1:[srv 0 port]

        # First we wait for new node to be recognized by entire cluster
        wait_for_cluster_size 4

        wait_for_condition 1000 50 {
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        set newnode_r [valkey_client -3]
        set newnode_id [$newnode_r CLUSTER MYID]

        # Find out which node has the key "foo" by asking the new node for a
        # redirect.
        catch { $newnode_r get foo } e
        assert_match "MOVED $slot *" $e
        lassign [split [lindex $e 2] :] owner_host owner_port
        set owner_r [valkey $owner_host $owner_port 0 $::tls]
        set owner_id [$owner_r CLUSTER MYID]

        # Wait until the owner node knows the new node. The condition
        # given to wait_for_condition is evaluated as a single
        # expression, so the previous multi-statement set/foreach script
        # was not valid there; use the server_knows_node helper, which
        # reduces the lookup to one bracketed call.
        # NOTE(review): owner_id is a cluster ID while other
        # get_cluster_nodes callers pass an srv index — confirm the
        # helper accepts an ID here.
        wait_for_condition 1000 50 {
            [server_knows_node $owner_id $newnode_id]
        } else {
            fail "Owner node does not know the new node yet"
        }

        # Move slot to new node using plain commands
        assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id]
        assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id]
        assert_equal {foo} [$owner_r CLUSTER GETKEYSINSLOT $slot 10]
        assert_equal OK [$owner_r MIGRATE 127.0.0.1 [srv -3 port] "" 0 5000 KEYS foo]
        assert_equal OK [$newnode_r CLUSTER SETSLOT $slot NODE $newnode_id]
        assert_equal OK [$owner_r CLUSTER SETSLOT $slot NODE $newnode_id]

        # Using --cluster check make sure we won't get `Not all slots are covered by nodes`.
        # Wait for the cluster to become stable make sure the cluster is up during MIGRATE.
        wait_for_condition 1000 50 {
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 &&
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 &&
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 &&
            [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 &&
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        # Move the only slot back to original node using valkey-cli
        exec src/valkey-cli --cluster reshard 127.0.0.1:[srv -3 port] \
            --cluster-from $newnode_id \
            --cluster-to $owner_id \
            --cluster-slots 1 \
            --cluster-yes

        # The empty node will become a replica of the new owner before the
        # `MOVED` check, so let's wait for the cluster to become stable.
        wait_for_condition 1000 50 {
            [CI 0 cluster_state] eq {ok} &&
            [CI 1 cluster_state] eq {ok} &&
            [CI 2 cluster_state] eq {ok} &&
            [CI 3 cluster_state] eq {ok}
        } else {
            fail "Cluster doesn't stabilize"
        }

        # Check that the key foo has been migrated back to the original owner.
        catch { $newnode_r get foo } e
        assert_equal "MOVED $slot $owner_host:$owner_port" $e

        # Check that the now empty primary node doesn't turn itself into
        # a replica of any other nodes
        wait_for_cluster_propagation
        assert_match *master* [$owner_r role]
    }
}
Loading