Skip to content

Commit b6c60d3

Browse files
authored
feat: congestion metrics (#3491)
## Description

Requires n0-computer/iroh-metrics#35 and another update of the `Cargo.toml` files.

## Breaking Changes

<!-- Optional, if there are any breaking changes document them, including how to migrate older code. -->

## Notes & open questions

<!-- Any notes, remarks or open questions you have to make about the PR. -->

## Change checklist

<!-- Remove any that are not relevant. -->

- [ ] Self-review.
- [ ] Documentation updates following the [style guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text), if relevant.
- [ ] Tests if relevant.
- [ ] All breaking changes documented.
  - [ ] List all breaking changes in the above "Breaking Changes" section.
  - [ ] Open an issue or PR on any number0 repos that are affected by this breaking change. Give guidance on how the updates should be handled or do the actual updates themselves. The major ones are:
    - [ ] [`quic-rpc`](https://github.com/n0-computer/quic-rpc)
    - [ ] [`iroh-gossip`](https://github.com/n0-computer/iroh-gossip)
    - [ ] [`iroh-blobs`](https://github.com/n0-computer/iroh-blobs)
    - [ ] [`dumbpipe`](https://github.com/n0-computer/dumbpipe)
    - [ ] [`sendme`](https://github.com/n0-computer/sendme)
1 parent 33aca18 commit b6c60d3

File tree

11 files changed

+362
-56
lines changed

11 files changed

+362
-56
lines changed

Cargo.lock

Lines changed: 11 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

iroh-dns-server/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ hickory-server = { version = "0.25.1", features = ["https-ring"] }
2828
http = "1.0.0"
2929
humantime = "2.2.0"
3030
humantime-serde = "1.1.1"
31-
iroh-metrics = { version = "0.35", features = ["service"] }
31+
iroh-metrics = { version = "0.36", features = ["service"] }
3232
lru = "0.16"
3333
n0-future = "0.1.2"
3434
n0-snafu = "0.2.2"

iroh-relay/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ http-body-util = "0.1.0"
3232
hyper = { version = "1", features = ["server", "client", "http1"] }
3333
hyper-util = "0.1.1"
3434
iroh-base = { version = "0.92.0", path = "../iroh-base", default-features = false, features = ["key", "relay"] }
35-
iroh-metrics = { version = "0.35", default-features = false }
35+
iroh-metrics = { version = "0.36", default-features = false }
3636
n0-future = "0.1.2"
3737
num_enum = "0.7"
3838
pin-project = "1"

iroh/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ futures-buffered = "0.2.11"
8282
pkcs8 = "0.11.0-rc.7"
8383

8484
# metrics
85-
iroh-metrics = { version = "0.35", default-features = false }
85+
iroh-metrics = { version = "0.36", default-features = false }
8686

8787
# local-swarm-discovery
8888
swarm-discovery = { version = "0.4", optional = true }
@@ -104,7 +104,7 @@ parse-size = { version = "=1.0.0", optional = true, features = ['std'] } # pinne
104104
hickory-resolver = "0.25.1"
105105
igd-next = { version = "0.16", features = ["aio_tokio"] }
106106
netdev = { version = "0.36.0" }
107-
portmapper = { version = "0.9", default-features = false }
107+
portmapper = { version = "0.10", default-features = false }
108108
quinn = { package = "iroh-quinn", version = "0.14.0", default-features = false, features = ["runtime-tokio", "rustls-ring"] }
109109
tokio = { version = "1", features = [
110110
"io-util",

iroh/bench/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ publish = false
99
bytes = "1.7"
1010
hdrhistogram = { version = "7.2", default-features = false }
1111
iroh = { path = ".." }
12-
iroh-metrics = "0.35"
12+
iroh-metrics = "0.36"
1313
n0-future = "0.1.1"
1414
n0-snafu = "0.2.0"
1515
quinn = { package = "iroh-quinn", version = "0.14" }

iroh/src/magicsock.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,8 @@ impl MagicSock {
792792
}
793793
disco::Message::Pong(pong) => {
794794
self.metrics.magicsock.recv_disco_pong.inc();
795-
self.node_map.handle_pong(sender, src, pong);
795+
self.node_map
796+
.handle_pong(sender, src, pong, &self.metrics.magicsock);
796797
}
797798
disco::Message::CallMeMaybe(cm) => {
798799
self.metrics.magicsock.recv_disco_call_me_maybe.inc();
@@ -2049,7 +2050,9 @@ impl Actor {
20492050
async fn handle_actor_message(&mut self, msg: ActorMessage) {
20502051
match msg {
20512052
ActorMessage::EndpointPingExpired(id, txid) => {
2052-
self.msock.node_map.notify_ping_timeout(id, txid);
2053+
self.msock
2054+
.node_map
2055+
.notify_ping_timeout(id, txid, &self.msock.metrics.magicsock);
20532056
}
20542057
ActorMessage::NetworkChange => {
20552058
self.network_monitor.network_change().await.ok();
@@ -2249,7 +2252,9 @@ impl Actor {
22492252
/// This is called when connectivity changes enough that we no longer trust the old routes.
22502253
#[instrument(skip_all)]
22512254
fn reset_endpoint_states(&mut self) {
2252-
self.msock.node_map.reset_node_states()
2255+
self.msock
2256+
.node_map
2257+
.reset_node_states(&self.msock.metrics.magicsock)
22532258
}
22542259
}
22552260

iroh/src/magicsock/metrics.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
use iroh_metrics::{Counter, MetricsGroup};
1+
use iroh_metrics::{Counter, Histogram, MetricsGroup};
22
use serde::{Deserialize, Serialize};
33

44
/// Metrics for the magicsock module
55
// TODO(frando): Add description doc strings for each metric.
66
#[allow(missing_docs)]
7-
#[derive(Debug, Default, Serialize, Deserialize, MetricsGroup)]
7+
#[derive(Debug, Serialize, Deserialize, MetricsGroup)]
88
#[non_exhaustive]
9-
#[metrics(name = "magicsock")]
9+
#[metrics(name = "magicsock", default)]
1010
pub struct Metrics {
1111
pub update_direct_addrs: Counter,
1212

@@ -77,4 +77,23 @@ pub struct Metrics {
7777
pub connection_handshake_success: Counter,
7878
/// Number of connections with a successful handshake that became direct.
7979
pub connection_became_direct: Counter,
80+
81+
/*
82+
* Path Congestion Metrics
83+
*/
84+
/// Number of times a path was marked as outdated due to consecutive ping failures.
85+
pub path_marked_outdated: Counter,
86+
/// Number of ping failures recorded across all paths.
87+
pub path_ping_failures: Counter,
88+
/// Number of consecutive failure resets (path recovered).
89+
pub path_failure_resets: Counter,
90+
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
91+
#[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))]
92+
pub path_packet_loss_rate: Histogram,
93+
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
94+
#[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))]
95+
pub path_rtt_variance_ms: Histogram,
96+
/// Histogram of path quality scores (0.0-1.0).
97+
#[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))]
98+
pub path_quality_score: Histogram,
8099
}

iroh/src/magicsock/node_map.rs

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,14 +192,19 @@ impl NodeMap {
192192
}
193193
}
194194

195-
pub(super) fn notify_ping_timeout(&self, id: usize, tx_id: stun_rs::TransactionId) {
195+
pub(super) fn notify_ping_timeout(
196+
&self,
197+
id: usize,
198+
tx_id: stun_rs::TransactionId,
199+
metrics: &Metrics,
200+
) {
196201
if let Some(ep) = self
197202
.inner
198203
.lock()
199204
.expect("poisoned")
200205
.get_mut(NodeStateKey::Idx(id))
201206
{
202-
ep.ping_timeout(tx_id, Instant::now());
207+
ep.ping_timeout(tx_id, Instant::now(), metrics);
203208
}
204209
}
205210

@@ -228,11 +233,17 @@ impl NodeMap {
228233
.handle_ping(sender, src, tx_id)
229234
}
230235

231-
pub(super) fn handle_pong(&self, sender: PublicKey, src: &transports::Addr, pong: Pong) {
236+
pub(super) fn handle_pong(
237+
&self,
238+
sender: PublicKey,
239+
src: &transports::Addr,
240+
pong: Pong,
241+
metrics: &Metrics,
242+
) {
232243
self.inner
233244
.lock()
234245
.expect("poisoned")
235-
.handle_pong(sender, src, pong)
246+
.handle_pong(sender, src, pong, metrics)
236247
}
237248

238249
#[must_use = "actions must be handled"]
@@ -268,11 +279,11 @@ impl NodeMap {
268279
Some((public_key, udp_addr, relay_url, ping_actions))
269280
}
270281

271-
pub(super) fn reset_node_states(&self) {
282+
pub(super) fn reset_node_states(&self, metrics: &Metrics) {
272283
let now = Instant::now();
273284
let mut inner = self.inner.lock().expect("poisoned");
274285
for (_, ep) in inner.node_states_mut() {
275-
ep.note_connectivity_change(now);
286+
ep.note_connectivity_change(now, metrics);
276287
}
277288
}
278289

@@ -518,9 +529,15 @@ impl NodeMapInner {
518529
.and_then(|ep| ep.latency())
519530
}
520531

521-
fn handle_pong(&mut self, sender: NodeId, src: &transports::Addr, pong: Pong) {
532+
fn handle_pong(
533+
&mut self,
534+
sender: NodeId,
535+
src: &transports::Addr,
536+
pong: Pong,
537+
metrics: &Metrics,
538+
) {
522539
if let Some(ns) = self.get_mut(NodeStateKey::NodeId(sender)).as_mut() {
523-
let insert = ns.handle_pong(&pong, src.clone().into());
540+
let insert = ns.handle_pong(&pong, src.clone().into(), metrics);
524541
if let Some((src, key)) = insert {
525542
self.set_node_key_for_ip_port(src, &key);
526543
}
@@ -553,7 +570,7 @@ impl NodeMapInner {
553570
Some(ns) => {
554571
debug!(endpoints = ?cm.my_numbers, "received call-me-maybe");
555572

556-
ns.handle_call_me_maybe(cm)
573+
ns.handle_call_me_maybe(cm, metrics)
557574
}
558575
}
559576
}

0 commit comments

Comments
 (0)