Skip to content

Commit 70598a8

Browse files
committed
Gather more cluster metrics
1 parent f36c225 commit 70598a8

File tree

4 files changed

+144
-15
lines changed

4 files changed

+144
-15
lines changed

quickwit/quickwit-cluster/src/cluster.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ use crate::member::{
4747
GRPC_ADVERTISE_ADDR_KEY, PIPELINE_METRICS_PREFIX, READINESS_KEY, READINESS_VALUE_NOT_READY,
4848
READINESS_VALUE_READY,
4949
};
50+
use crate::metrics::spawn_metrics_task;
5051
use crate::ClusterNode;
5152

5253
const MARKED_FOR_DELETION_GRACE_PERIOD: usize = if cfg!(any(test, feature = "testsuite")) {
@@ -161,8 +162,11 @@ impl Cluster {
161162
let chitchat = chitchat_handle.chitchat();
162163
let live_nodes_stream = chitchat.lock().await.live_nodes_watcher();
163164
let (ready_members_tx, ready_members_rx) = watch::channel(Vec::new());
164-
165165
spawn_ready_members_task(cluster_id.clone(), live_nodes_stream, ready_members_tx);
166+
167+
let weak_chitchat = Arc::downgrade(&chitchat);
168+
spawn_metrics_task(weak_chitchat, self_node.chitchat_id());
169+
166170
let inner = InnerCluster {
167171
cluster_id: cluster_id.clone(),
168172
self_chitchat_id: self_node.chitchat_id(),

quickwit/quickwit-cluster/src/lib.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,17 @@ impl Transport for CountingUdpTransport {
102102
let socket = UdpSocket::open(listen_addr).await?;
103103
Ok(Box::new(CountingUdpSocket {
104104
socket,
105-
gossip_recv: crate::metrics::CLUSTER_METRICS.gossip_recv_total.clone(),
105+
gossip_recv: crate::metrics::CLUSTER_METRICS
106+
.gossip_recv_messages_total
107+
.clone(),
106108
gossip_recv_bytes: crate::metrics::CLUSTER_METRICS
107109
.gossip_recv_bytes_total
108110
.clone(),
109-
gossip_send: crate::metrics::CLUSTER_METRICS.gossip_send_total.clone(),
111+
gossip_send: crate::metrics::CLUSTER_METRICS
112+
.gossip_sent_messages_total
113+
.clone(),
110114
gossip_send_bytes: crate::metrics::CLUSTER_METRICS
111-
.gossip_send_bytes_total
115+
.gossip_sent_bytes_total
112116
.clone(),
113117
}))
114118
}

quickwit/quickwit-cluster/src/member.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@
1818
// along with this program. If not, see <http://www.gnu.org/licenses/>.
1919

2020
use std::collections::HashSet;
21+
use std::mem::size_of;
2122
use std::net::SocketAddr;
2223
use std::str::FromStr;
2324

2425
use anyhow::Context;
25-
use chitchat::{ChitchatId, NodeState};
26+
use chitchat::{ChitchatId, NodeState, Version};
2627
use quickwit_proto::indexing::{CpuCapacity, IndexingTask};
2728
use quickwit_proto::types::NodeId;
2829
use tracing::{error, warn};
@@ -46,6 +47,8 @@ pub(crate) trait NodeStateExt {
4647
fn grpc_advertise_addr(&self) -> anyhow::Result<SocketAddr>;
4748

4849
fn is_ready(&self) -> bool;
50+
51+
fn size_bytes(&self) -> usize;
4952
}
5053

5154
impl NodeStateExt for NodeState {
@@ -66,6 +69,16 @@ impl NodeStateExt for NodeState {
6669
.map(|health_value| health_value == READINESS_VALUE_READY)
6770
.unwrap_or(false)
6871
}
72+
73+
// TODO: Expose more accurate size of the state in Chitchat.
74+
fn size_bytes(&self) -> usize {
75+
const SIZE_OF_VERSION: usize = size_of::<Version>();
76+
const SIZE_OF_TOMBSTONE: usize = size_of::<u64>();
77+
78+
self.key_values()
79+
.map(|(key, value)| key.len() + value.value.len() + SIZE_OF_VERSION + SIZE_OF_TOMBSTONE)
80+
.sum()
81+
}
6982
}
7083

7184
/// Cluster member.

quickwit/quickwit-cluster/src/metrics.rs

Lines changed: 118 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,21 +17,71 @@
1717
// You should have received a copy of the GNU Affero General Public License
1818
// along with this program. If not, see <http://www.gnu.org/licenses/>.
1919

20+
use std::net::SocketAddr;
21+
use std::sync::Weak;
22+
use std::time::Duration;
23+
24+
use chitchat::{Chitchat, ChitchatId};
2025
use once_cell::sync::Lazy;
21-
use quickwit_common::metrics::{new_counter, IntCounter};
26+
use quickwit_common::metrics::{new_counter, new_gauge, IntCounter, IntGauge};
27+
use tokio::sync::Mutex;
28+
29+
use crate::member::NodeStateExt;
2230

2331
pub struct ClusterMetrics {
24-
pub gossip_recv_total: IntCounter,
32+
pub live_nodes: IntGauge,
33+
pub ready_nodes: IntGauge,
34+
pub zombie_nodes: IntGauge,
35+
pub dead_nodes: IntGauge,
36+
pub cluster_state_size_bytes: IntGauge,
37+
pub node_state_size_bytes: IntGauge,
38+
pub node_state_keys: IntGauge,
39+
pub gossip_recv_messages_total: IntCounter,
2540
pub gossip_recv_bytes_total: IntCounter,
26-
pub gossip_send_total: IntCounter,
27-
pub gossip_send_bytes_total: IntCounter,
41+
pub gossip_sent_messages_total: IntCounter,
42+
pub gossip_sent_bytes_total: IntCounter,
2843
}
2944

3045
impl Default for ClusterMetrics {
3146
fn default() -> Self {
3247
ClusterMetrics {
33-
gossip_recv_total: new_counter(
34-
"gossip_recv_total",
48+
live_nodes: new_gauge(
49+
"live_nodes",
50+
"The number of live nodes observed locally.",
51+
"cluster",
52+
),
53+
ready_nodes: new_gauge(
54+
"ready_nodes",
55+
"The number of ready nodes observed locally.",
56+
"cluster",
57+
),
58+
zombie_nodes: new_gauge(
59+
"zombie_nodes",
60+
"The number of zombie nodes observed locally.",
61+
"cluster",
62+
),
63+
dead_nodes: new_gauge(
64+
"dead_nodes",
65+
"The number of dead nodes observed locally.",
66+
"cluster",
67+
),
68+
cluster_state_size_bytes: new_gauge(
69+
"cluster_state_size_bytes",
70+
"The size of the cluster state in bytes.",
71+
"cluster",
72+
),
73+
node_state_keys: new_gauge(
74+
"node_state_keys",
75+
"The number of keys in the node state.",
76+
"cluster",
77+
),
78+
node_state_size_bytes: new_gauge(
79+
"node_state_size_bytes",
80+
"The size of the node state in bytes.",
81+
"cluster",
82+
),
83+
gossip_recv_messages_total: new_counter(
84+
"gossip_recv_messages_total",
3585
"Total number of gossip messages received.",
3686
"cluster",
3787
),
@@ -40,13 +90,13 @@ impl Default for ClusterMetrics {
4090
"Total amount of gossip data received in bytes.",
4191
"cluster",
4292
),
43-
gossip_send_total: new_counter(
44-
"gossip_send_total",
93+
gossip_sent_messages_total: new_counter(
94+
"gossip_sent_messages_total",
4595
"Total number of gossip messages sent.",
4696
"cluster",
4797
),
48-
gossip_send_bytes_total: new_counter(
49-
"gossip_send_bytes_total",
98+
gossip_sent_bytes_total: new_counter(
99+
"gossip_sent_bytes_total",
50100
"Total amount of gossip data sent in bytes.",
51101
"cluster",
52102
),
@@ -55,3 +105,61 @@ impl Default for ClusterMetrics {
55105
}
56106

57107
pub static CLUSTER_METRICS: Lazy<ClusterMetrics> = Lazy::new(ClusterMetrics::default);
108+
109+
pub(crate) fn spawn_metrics_task(
110+
weak_chitchat: Weak<Mutex<Chitchat>>,
111+
self_chitchat_id: ChitchatId,
112+
) {
113+
const METRICS_INTERVAL: Duration = Duration::from_secs(15);
114+
115+
const SIZE_OF_GENERATION_ID: usize = std::mem::size_of::<u64>();
116+
const SIZE_OF_SOCKET_ADDR: usize = std::mem::size_of::<SocketAddr>();
117+
118+
let future = async move {
119+
let mut interval = tokio::time::interval(METRICS_INTERVAL);
120+
121+
while let Some(chitchat) = weak_chitchat.upgrade() {
122+
interval.tick().await;
123+
124+
let mut num_ready_nodes = 0;
125+
let mut cluster_state_size_bytes = 0;
126+
127+
let chitchat_guard = chitchat.lock().await;
128+
129+
let num_live_nodes = chitchat_guard.live_nodes().count();
130+
let num_zombie_nodes = chitchat_guard.scheduled_for_deletion_nodes().count();
131+
let num_dead_nodes = chitchat_guard.dead_nodes().count();
132+
133+
for (chitchat_id, node_state) in chitchat_guard.node_states() {
134+
if node_state.is_ready() {
135+
num_ready_nodes += 1;
136+
}
137+
let chitchat_id_size_bytes =
138+
chitchat_id.node_id.len() + SIZE_OF_GENERATION_ID + SIZE_OF_SOCKET_ADDR;
139+
let node_state_size_bytes = node_state.size_bytes();
140+
141+
cluster_state_size_bytes += chitchat_id_size_bytes + node_state_size_bytes;
142+
143+
if *chitchat_id == self_chitchat_id {
144+
CLUSTER_METRICS
145+
.node_state_keys
146+
.set(node_state.num_key_values() as i64);
147+
CLUSTER_METRICS
148+
.node_state_size_bytes
149+
.set(node_state_size_bytes as i64);
150+
}
151+
}
152+
drop(chitchat_guard);
153+
154+
CLUSTER_METRICS.live_nodes.set(num_live_nodes as i64);
155+
CLUSTER_METRICS.ready_nodes.set(num_ready_nodes as i64);
156+
CLUSTER_METRICS.zombie_nodes.set(num_zombie_nodes as i64);
157+
CLUSTER_METRICS.dead_nodes.set(num_dead_nodes as i64);
158+
159+
CLUSTER_METRICS
160+
.cluster_state_size_bytes
161+
.set(cluster_state_size_bytes as i64);
162+
}
163+
};
164+
tokio::spawn(future);
165+
}

0 commit comments

Comments
 (0)