Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics comparing the V1 and V2 routing protocols #10060

Merged
merged 2 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions chain/network/src/peer_manager/network_state/routing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,41 @@ impl NetworkState {
.unwrap_or(Ok(()))
}

fn record_routing_protocol_metrics(&self, target: &PeerId) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a value of knowing what the actual delta is?

let v1_dist = self.graph.routing_table.get_distance(target);
let v2_dist = self.graph_v2.routing_table.get_distance(target);

match (v1_dist, v2_dist) {
(None, None) => {
metrics::NETWORK_ROUTED_MSG_DISTANCES.with_label_values(&["v1, v2 NONE"]).inc();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about the following?
NONE
v1 ONLY
v2 ONLY

I find it confusing to mention v1 or v2 when they are not present.

}
(Some(_), None) => {
metrics::NETWORK_ROUTED_MSG_DISTANCES.with_label_values(&["v2 NONE"]).inc();
}
(None, Some(_)) => {
metrics::NETWORK_ROUTED_MSG_DISTANCES.with_label_values(&["v1 NONE"]).inc();
}
(Some(v1), Some(v2)) => {
if v1 == v2 {
metrics::NETWORK_ROUTED_MSG_DISTANCES.with_label_values(&["v1 == v2"]).inc();
} else if v1 < v2 {
metrics::NETWORK_ROUTED_MSG_DISTANCES.with_label_values(&["v1 < v2"]).inc();
} else {
metrics::NETWORK_ROUTED_MSG_DISTANCES.with_label_values(&["v1 > v2"]).inc();
}
}
}
}

pub(crate) fn tier2_find_route(
&self,
clock: &time::Clock,
target: &PeerIdOrHash,
) -> Result<PeerId, FindRouteError> {
match target {
PeerIdOrHash::PeerId(peer_id) => {
self.record_routing_protocol_metrics(peer_id);

match self.graph.routing_table.find_next_hop_for_target(peer_id) {
Ok(peer_id) => Ok(peer_id),
Err(_) => self.graph_v2.routing_table.find_next_hop_for_target(peer_id),
Expand Down
24 changes: 20 additions & 4 deletions chain/network/src/routing/bfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ impl Graph {
/// Compute for every node `u` on the graph (other than `source`) which are the neighbors of
/// `sources` which belong to the shortest path from `source` to `u`. Nodes that are
/// not connected to `source` will not appear in the result.
pub fn calculate_distance(
pub fn calculate_next_hops_and_distance(
&self,
unreliable_peers: &HashSet<PeerId>,
) -> HashMap<PeerId, Vec<PeerId>> {
) -> (HashMap<PeerId, Vec<PeerId>>, HashMap<PeerId, u32>) {
// TODO add removal of unreachable nodes

let unreliable_peers: HashSet<_> =
Expand Down Expand Up @@ -189,8 +189,13 @@ impl Graph {
/// - routes - for node given node at index `i`, give list of connected peers, which
/// are on the optimal path
/// - distances - not really needed: TODO remove this argument
fn compute_result(&self, routes: &[u128], distance: &[i32]) -> HashMap<PeerId, Vec<PeerId>> {
fn compute_result(
&self,
routes: &[u128],
distance: &[i32],
) -> (HashMap<PeerId, Vec<PeerId>>, HashMap<PeerId, u32>) {
let mut res = HashMap::with_capacity(routes.len());
let mut distances = HashMap::with_capacity(routes.len());

let neighbors = &self.adjacency[self.source_id as usize];
let mut unreachable_nodes = 0;
Expand All @@ -217,11 +222,22 @@ impl Graph {
.map(|(_, &neighbor)| self.id2p[neighbor as usize].clone())
.collect();
res.insert(self.id2p[key].clone(), peer_set);
distances.insert(self.id2p[key].clone(), distance[key] as u32);
}
if unreachable_nodes > 1000 {
warn!("We store more than 1000 unreachable nodes: {}", unreachable_nodes);
}
res

(res, distances)
}

#[cfg(test)]
pub fn calculate_distance(
&self,
unreliable_peers: &HashSet<PeerId>,
) -> HashMap<PeerId, Vec<PeerId>> {
let (next_hops, _) = self.calculate_next_hops_and_distance(&unreliable_peers);
next_hops
}
}

Expand Down
11 changes: 8 additions & 3 deletions chain/network/src/routing/graph/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ mod tests;
// TODO: make it opaque, so that the key.0 < key.1 invariant is protected.
type EdgeKey = (PeerId, PeerId);
pub type NextHopTable = HashMap<PeerId, Vec<PeerId>>;
pub type DistanceTable = HashMap<PeerId, u32>;

#[derive(Clone)]
pub struct GraphConfig {
Expand All @@ -32,6 +33,7 @@ pub struct GraphSnapshot {
pub edges: im::HashMap<EdgeKey, Edge>,
pub local_edges: HashMap<PeerId, Edge>,
pub next_hops: Arc<NextHopTable>,
pub distances: Arc<DistanceTable>,
}

struct Inner {
Expand Down Expand Up @@ -246,7 +248,10 @@ impl Inner {
if let Some(prune_edges_after) = self.config.prune_edges_after {
self.prune_old_edges(clock.now_utc() - prune_edges_after);
}
let next_hops = Arc::new(self.graph.calculate_distance(unreliable_peers));

let (next_hops, distances) = self.graph.calculate_next_hops_and_distance(unreliable_peers);
let next_hops = Arc::new(next_hops);
let distances = Arc::new(distances);

// Update peer_reachable_at.
let now = clock.now();
Expand All @@ -268,7 +273,7 @@ impl Inner {
metrics::PEER_REACHABLE.set(next_hops.len() as i64);
metrics::EDGE_ACTIVE.set(self.graph.total_active_edges() as i64);
metrics::EDGE_TOTAL.set(self.edges.len() as i64);
GraphSnapshot { edges: self.edges.clone(), local_edges, next_hops }
GraphSnapshot { edges: self.edges.clone(), local_edges, next_hops, distances }
}
}

Expand Down Expand Up @@ -345,7 +350,7 @@ impl Graph {
}
let snapshot = inner.update(&clock, &this.unreliable_peers.load());
let snapshot = Arc::new(snapshot);
this.routing_table.update(snapshot.next_hops.clone());
this.routing_table.update(snapshot.next_hops.clone(), snapshot.distances.clone());
this.snapshot.store(snapshot);
(new_edges, oks)
})
Expand Down
2 changes: 1 addition & 1 deletion chain/network/src/routing/graph_v2/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,7 @@ impl GraphV2 {
let (next_hops, to_broadcast) =
inner.compute_routes(&clock, &this.unreliable_peers.load());

this.routing_table.update(next_hops.into());
this.routing_table.update(next_hops.into(), Arc::new(inner.my_distances.clone()));

inner.log_state();

Expand Down
2 changes: 1 addition & 1 deletion chain/network/src/routing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ mod graph_v2;
pub(crate) mod route_back_cache;
pub mod routing_table_view;

pub(crate) use graph::{Graph, GraphConfig, NextHopTable};
pub(crate) use graph::{DistanceTable, Graph, GraphConfig, NextHopTable};
pub(crate) use graph_v2::{GraphConfigV2, GraphV2, NetworkTopologyChange};
27 changes: 25 additions & 2 deletions chain/network/src/routing/routing_table_view/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ struct Inner {
/// this will be the set of first nodes on all such paths.
next_hops: Arc<routing::NextHopTable>,

/// Contains the shortest path length for each routable peer in the network.
/// Used only to collect metrics measuring routing performance.
/// TODO(saketh): Remove this when we deprecate the V1 routing protocol.
distance: Arc<routing::DistanceTable>,

/// Counter of number of calls to find_route_by_peer_id.
find_route_calls: u64,
/// Last time the given peer was selected by find_route_by_peer_id.
Expand All @@ -36,6 +41,15 @@ impl Inner {
self.find_route_calls += 1;
Ok(next_hop.clone())
}

fn update(
&mut self,
next_hops: Arc<routing::NextHopTable>,
distance: Arc<routing::DistanceTable>,
) {
self.next_hops = next_hops;
self.distance = distance
}
}

#[derive(Debug)]
Expand All @@ -48,13 +62,18 @@ impl RoutingTableView {
pub fn new() -> Self {
Self(Mutex::new(Inner {
next_hops: Default::default(),
distance: Default::default(),
find_route_calls: 0,
last_routed: LruCache::new(LAST_ROUTED_CACHE_SIZE),
}))
}

pub(crate) fn update(&self, next_hops: Arc<routing::NextHopTable>) {
self.0.lock().next_hops = next_hops;
pub(crate) fn update(
&self,
next_hops: Arc<routing::NextHopTable>,
distance: Arc<routing::DistanceTable>,
) {
self.0.lock().update(next_hops, distance)
}

pub(crate) fn reachable_peers(&self) -> usize {
Expand All @@ -73,6 +92,10 @@ impl RoutingTableView {
self.0.lock().find_next_hop(target)
}

pub(crate) fn get_distance(&self, peer_id: &PeerId) -> Option<u32> {
self.0.lock().distance.get(peer_id).copied()
}

pub(crate) fn view_route(&self, peer_id: &PeerId) -> Option<Vec<PeerId>> {
self.0.lock().next_hops.get(peer_id).cloned()
}
Expand Down
2 changes: 1 addition & 1 deletion chain/network/src/routing/routing_table_view/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ fn find_route() {

// Check that RoutingTableView always selects a valid next hop.
let rtv = RoutingTableView::new();
rtv.update(next_hops.clone());
rtv.update(next_hops.clone(), Default::default());
for _ in 0..1000 {
let p = peers.choose(rng).unwrap();
let got = rtv.find_next_hop_for_target(&p).unwrap();
Expand Down
12 changes: 12 additions & 0 deletions chain/network/src/stats/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,18 @@ pub(crate) static ACCOUNT_TO_PEER_LOOKUPS: Lazy<IntCounterVec> = Lazy::new(|| {
.unwrap()
});

pub(crate) static NETWORK_ROUTED_MSG_DISTANCES: Lazy<IntCounterVec> = Lazy::new(|| {
try_create_int_counter_vec(
"near_network_routed_msg_distances",
"compares routing distance by protocol (V1 vs V2)",
// Compares the routing distances for the V1 and V2 routing protocols.
// We are currently running both while validating performance of V2.
// Eventually we want to deprecate V1 and run only V2.
&["cmp"],
)
.unwrap()
});

/// Updated the prometheus metrics about the received routed message `msg`.
/// `tier` indicates the network over which the message was transmitted.
/// `fastest` indicates whether this message is the first copy of `msg` received -
Expand Down