mirror of https://github.com/ethereum/go-ethereum
eth, p2p/msgrate: move peer QoS tracking to its own package and use it for snap (#22876)
This change extracts the peer QoS tracking logic from eth/downloader, moving it into the new package p2p/msgrate. The job of msgrate.Tracker is determining suitable timeout values and request sizes per peer. The snap sync scheduler now uses msgrate.Tracker instead of the hard-coded 15s timeout. This should make the sync work better on network links with high latency.pull/22906/head
parent
b3a1fda650
commit
3e795881ea
@ -1,53 +0,0 @@ |
||||
// Copyright 2020 The go-ethereum Authors
|
||||
// This file is part of go-ethereum.
|
||||
//
|
||||
// go-ethereum is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// go-ethereum is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with go-ethereum. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
package downloader |
||||
|
||||
import ( |
||||
"sort" |
||||
"testing" |
||||
) |
||||
|
||||
func TestPeerThroughputSorting(t *testing.T) { |
||||
a := &peerConnection{ |
||||
id: "a", |
||||
headerThroughput: 1.25, |
||||
} |
||||
b := &peerConnection{ |
||||
id: "b", |
||||
headerThroughput: 1.21, |
||||
} |
||||
c := &peerConnection{ |
||||
id: "c", |
||||
headerThroughput: 1.23, |
||||
} |
||||
|
||||
peers := []*peerConnection{a, b, c} |
||||
tps := []float64{a.headerThroughput, |
||||
b.headerThroughput, c.headerThroughput} |
||||
sortPeers := &peerThroughputSort{peers, tps} |
||||
sort.Sort(sortPeers) |
||||
if got, exp := sortPeers.p[0].id, "a"; got != exp { |
||||
t.Errorf("sort fail, got %v exp %v", got, exp) |
||||
} |
||||
if got, exp := sortPeers.p[1].id, "c"; got != exp { |
||||
t.Errorf("sort fail, got %v exp %v", got, exp) |
||||
} |
||||
if got, exp := sortPeers.p[2].id, "b"; got != exp { |
||||
t.Errorf("sort fail, got %v exp %v", got, exp) |
||||
} |
||||
|
||||
} |
@ -0,0 +1,458 @@ |
||||
// Copyright 2021 The go-ethereum Authors
|
||||
// This file is part of the go-ethereum library.
|
||||
//
|
||||
// The go-ethereum library is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Lesser General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// The go-ethereum library is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public License
|
||||
// along with the go-ethereum library. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
// Package msgrate allows estimating the throughput of peers for more balanced syncs.
|
||||
package msgrate |
||||
|
||||
import ( |
||||
"errors" |
||||
"sort" |
||||
"sync" |
||||
"time" |
||||
|
||||
"github.com/ethereum/go-ethereum/log" |
||||
) |
||||
|
||||
// measurementImpact is the impact a single measurement has on a peer's final
|
||||
// capacity value. A value closer to 0 reacts slower to sudden network changes,
|
||||
// but it is also more stable against temporary hiccups. 0.1 worked well for
|
||||
// most of Ethereum's existence, so might as well go with it.
|
||||
const measurementImpact = 0.1 |
||||
|
||||
// capacityOverestimation is the ratio of items to over-estimate when retrieving
|
||||
// a peer's capacity to avoid locking into a lower value due to never attempting
|
||||
// to fetch more than some local stable value.
|
||||
const capacityOverestimation = 1.01 |
||||
|
||||
// qosTuningPeers is the number of best peers to tune round trip times based on.
|
||||
// An Ethereum node doesn't need hundreds of connections to operate correctly,
|
||||
// so instead of lowering our download speed to the median of potentially many
|
||||
// bad nodes, we can target a smaller set of vey good nodes. At worse this will
|
||||
// result in less nodes to sync from, but that's still better than some hogging
|
||||
// the pipeline.
|
||||
const qosTuningPeers = 5 |
||||
|
||||
// rttMinEstimate is the minimal round trip time to target requests for. Since
|
||||
// every request entails a 2 way latency + bandwidth + serving database lookups,
|
||||
// it should be generous enough to permit meaningful work to be done on top of
|
||||
// the transmission costs.
|
||||
const rttMinEstimate = 2 * time.Second |
||||
|
||||
// rttMaxEstimate is the maximal round trip time to target requests for. Although
|
||||
// the expectation is that a well connected node will never reach this, certain
|
||||
// special connectivity ones might experience significant delays (e.g. satellite
|
||||
// uplink with 3s RTT). This value should be low enough to forbid stalling the
|
||||
// pipeline too long, but large enough to cover the worst of the worst links.
|
||||
const rttMaxEstimate = 20 * time.Second |
||||
|
||||
// rttPushdownFactor is a multiplier to attempt forcing quicker requests than
|
||||
// what the message rate tracker estimates. The reason is that message rate
|
||||
// tracking adapts queries to the RTT, but multiple RTT values can be perfectly
|
||||
// valid, they just result in higher packet sizes. Since smaller packets almost
|
||||
// always result in stabler download streams, this factor hones in on the lowest
|
||||
// RTT from all the functional ones.
|
||||
const rttPushdownFactor = 0.9 |
||||
|
||||
// rttMinConfidence is the minimum value the roundtrip confidence factor may drop
|
||||
// to. Since the target timeouts are based on how confident the tracker is in the
|
||||
// true roundtrip, it's important to not allow too huge fluctuations.
|
||||
const rttMinConfidence = 0.1 |
||||
|
||||
// ttlScaling is the multiplier that converts the estimated roundtrip time to a
|
||||
// timeout cap for network requests. The expectation is that peers' response time
|
||||
// will fluctuate around the estimated roundtrip, but depending in their load at
|
||||
// request time, it might be higher than anticipated. This scaling factor ensures
|
||||
// that we allow remote connections some slack but at the same time do enforce a
|
||||
// behavior similar to our median peers.
|
||||
const ttlScaling = 3 |
||||
|
||||
// ttlLimit is the maximum timeout allowance to prevent reaching crazy numbers
|
||||
// if some unforeseen network events shappen. As much as we try to hone in on
|
||||
// the most optimal values, it doesn't make any sense to go above a threshold,
|
||||
// even if everything is slow and screwy.
|
||||
const ttlLimit = time.Minute |
||||
|
||||
// tuningConfidenceCap is the number of active peers above which to stop detuning
|
||||
// the confidence number. The idea here is that once we hone in on the capacity
|
||||
// of a meaningful number of peers, adding one more should ot have a significant
|
||||
// impact on things, so just ron with the originals.
|
||||
const tuningConfidenceCap = 10 |
||||
|
||||
// tuningImpact is the influence that a new tuning target has on the previously
|
||||
// cached value. This number is mostly just an out-of-the-blue heuristic that
|
||||
// prevents the estimates from jumping around. There's no particular reason for
|
||||
// the current value.
|
||||
const tuningImpact = 0.25 |
||||
|
||||
// Tracker estimates the throughput capacity of a peer with regard to each data
|
||||
// type it can deliver. The goal is to dynamically adjust request sizes to max
|
||||
// out network throughput without overloading either the peer or th elocal node.
|
||||
//
|
||||
// By tracking in real time the latencies and bandiwdths peers exhibit for each
|
||||
// packet type, it's possible to prevent overloading by detecting a slowdown on
|
||||
// one type when another type is pushed too hard.
|
||||
//
|
||||
// Similarly, real time measurements also help avoid overloading the local net
|
||||
// connection if our peers would otherwise be capable to deliver more, but the
|
||||
// local link is saturated. In that case, the live measurements will force us
|
||||
// to reduce request sizes until the throughput gets stable.
|
||||
//
|
||||
// Lastly, message rate measurements allows us to detect if a peer is unsuaully
|
||||
// slow compared to other peers, in which case we can decide to keep it around
|
||||
// or free up the slot so someone closer.
|
||||
//
|
||||
// Since throughput tracking and estimation adapts dynamically to live network
|
||||
// conditions, it's fine to have multiple trackers locally track the same peer
|
||||
// in different subsystem. The throughput will simply be distributed across the
|
||||
// two trackers if both are highly active.
|
||||
type Tracker struct { |
||||
// capacity is the number of items retrievable per second of a given type.
|
||||
// It is analogous to bandwidth, but we deliberately avoided using bytes
|
||||
// as the unit, since serving nodes also spend a lot of time loading data
|
||||
// from disk, which is linear in the number of items, but mostly constant
|
||||
// in their sizes.
|
||||
//
|
||||
// Callers of course are free to use the item counter as a byte counter if
|
||||
// or when their protocol of choise if capped by bytes instead of items.
|
||||
// (eg. eth.getHeaders vs snap.getAccountRange).
|
||||
capacity map[uint64]float64 |
||||
|
||||
// roundtrip is the latency a peer in general responds to data requests.
|
||||
// This number is not used inside the tracker, but is exposed to compare
|
||||
// peers to each other and filter out slow ones. Note however, it only
|
||||
// makes sense to compare RTTs if the caller caters request sizes for
|
||||
// each peer to target the same RTT. There's no need to make this number
|
||||
// the real networking RTT, we just need a number to compare peers with.
|
||||
roundtrip time.Duration |
||||
|
||||
lock sync.RWMutex |
||||
} |
||||
|
||||
// NewTracker creates a new message rate tracker for a specific peer. An initial
|
||||
// RTT is needed to avoid a peer getting marked as an outlier compared to others
|
||||
// right after joining. It's suggested to use the median rtt across all peers to
|
||||
// init a new peer tracker.
|
||||
func NewTracker(caps map[uint64]float64, rtt time.Duration) *Tracker { |
||||
if caps == nil { |
||||
caps = make(map[uint64]float64) |
||||
} |
||||
return &Tracker{ |
||||
capacity: caps, |
||||
roundtrip: rtt, |
||||
} |
||||
} |
||||
|
||||
// Capacity calculates the number of items the peer is estimated to be able to
|
||||
// retrieve within the alloted time slot. The method will round up any division
|
||||
// errors and will add an additional overestimation ratio on top. The reason for
|
||||
// overshooting the capacity is because certain message types might not increase
|
||||
// the load proportionally to the requested items, so fetching a bit more might
|
||||
// still take the same RTT. By forcefully overshooting by a small amount, we can
|
||||
// avoid locking into a lower-that-real capacity.
|
||||
func (t *Tracker) Capacity(kind uint64, targetRTT time.Duration) float64 { |
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
// Calculate the actual measured throughput
|
||||
throughput := t.capacity[kind] * float64(targetRTT) / float64(time.Second) |
||||
|
||||
// Return an overestimation to force the peer out of a stuck minima, adding
|
||||
// +1 in case the item count is too low for the overestimator to dent
|
||||
return 1 + capacityOverestimation*throughput |
||||
} |
||||
|
||||
// Update modifies the peer's capacity values for a specific data type with a new
|
||||
// measurement. If the delivery is zero, the peer is assumed to have either timed
|
||||
// out or to not have the requested data, resulting in a slash to 0 capacity. This
|
||||
// avoids assigning the peer retrievals that it won't be able to honour.
|
||||
func (t *Tracker) Update(kind uint64, elapsed time.Duration, items int) { |
||||
t.lock.Lock() |
||||
defer t.lock.Unlock() |
||||
|
||||
// If nothing was delivered (timeout / unavailable data), reduce throughput
|
||||
// to minimum
|
||||
if items == 0 { |
||||
t.capacity[kind] = 0 |
||||
return |
||||
} |
||||
// Otherwise update the throughput with a new measurement
|
||||
if elapsed <= 0 { |
||||
elapsed = 1 // +1 (ns) to ensure non-zero divisor
|
||||
} |
||||
measured := float64(items) / (float64(elapsed) / float64(time.Second)) |
||||
|
||||
t.capacity[kind] = (1-measurementImpact)*(t.capacity[kind]) + measurementImpact*measured |
||||
t.roundtrip = time.Duration((1-measurementImpact)*float64(t.roundtrip) + measurementImpact*float64(elapsed)) |
||||
} |
||||
|
||||
// Trackers is a set of message rate trackers across a number of peers with the
|
||||
// goal of aggregating certain measurements across the entire set for outlier
|
||||
// filtering and newly joining initialization.
|
||||
type Trackers struct { |
||||
trackers map[string]*Tracker |
||||
|
||||
// roundtrip is the current best guess as to what is a stable round trip time
|
||||
// across the entire collection of connected peers. This is derived from the
|
||||
// various trackers added, but is used as a cache to avoid recomputing on each
|
||||
// network request. The value is updated once every RTT to avoid fluctuations
|
||||
// caused by hiccups or peer events.
|
||||
roundtrip time.Duration |
||||
|
||||
// confidence represents the probability that the estimated roundtrip value
|
||||
// is the real one across all our peers. The confidence value is used as an
|
||||
// impact factor of new measurements on old estimates. As our connectivity
|
||||
// stabilizes, this value gravitates towards 1, new measurements havinng
|
||||
// almost no impact. If there's a large peer churn and few peers, then new
|
||||
// measurements will impact it more. The confidence is increased with every
|
||||
// packet and dropped with every new connection.
|
||||
confidence float64 |
||||
|
||||
// tuned is the time instance the tracker recalculated its cached roundtrip
|
||||
// value and confidence values. A cleaner way would be to have a heartbeat
|
||||
// goroutine do it regularly, but that requires a lot of maintenance to just
|
||||
// run every now and again.
|
||||
tuned time.Time |
||||
|
||||
// The fields below can be used to override certain default values. Their
|
||||
// purpose is to allow quicker tests. Don't use them in production.
|
||||
OverrideTTLLimit time.Duration |
||||
|
||||
log log.Logger |
||||
lock sync.RWMutex |
||||
} |
||||
|
||||
// NewTrackers creates an empty set of trackers to be filled with peers.
|
||||
func NewTrackers(log log.Logger) *Trackers { |
||||
return &Trackers{ |
||||
trackers: make(map[string]*Tracker), |
||||
roundtrip: rttMaxEstimate, |
||||
confidence: 1, |
||||
tuned: time.Now(), |
||||
OverrideTTLLimit: ttlLimit, |
||||
log: log, |
||||
} |
||||
} |
||||
|
||||
// Track inserts a new tracker into the set.
|
||||
func (t *Trackers) Track(id string, tracker *Tracker) error { |
||||
t.lock.Lock() |
||||
defer t.lock.Unlock() |
||||
|
||||
if _, ok := t.trackers[id]; ok { |
||||
return errors.New("already tracking") |
||||
} |
||||
t.trackers[id] = tracker |
||||
t.detune() |
||||
|
||||
return nil |
||||
} |
||||
|
||||
// Untrack stops tracking a previously added peer.
|
||||
func (t *Trackers) Untrack(id string) error { |
||||
t.lock.Lock() |
||||
defer t.lock.Unlock() |
||||
|
||||
if _, ok := t.trackers[id]; !ok { |
||||
return errors.New("not tracking") |
||||
} |
||||
delete(t.trackers, id) |
||||
return nil |
||||
} |
||||
|
||||
// MedianRoundTrip returns the median RTT across all known trackers. The purpose
|
||||
// of the median RTT is to initialize a new peer with sane statistics that it will
|
||||
// hopefully outperform. If it seriously underperforms, there's a risk of dropping
|
||||
// the peer, but that is ok as we're aiming for a strong median.
|
||||
func (t *Trackers) MedianRoundTrip() time.Duration { |
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
return t.medianRoundTrip() |
||||
} |
||||
|
||||
// medianRoundTrip is the internal lockless version of MedianRoundTrip to be used
|
||||
// by the QoS tuner.
|
||||
func (t *Trackers) medianRoundTrip() time.Duration { |
||||
// Gather all the currently measured round trip times
|
||||
rtts := make([]float64, 0, len(t.trackers)) |
||||
for _, tt := range t.trackers { |
||||
tt.lock.RLock() |
||||
rtts = append(rtts, float64(tt.roundtrip)) |
||||
tt.lock.RUnlock() |
||||
} |
||||
sort.Float64s(rtts) |
||||
|
||||
median := rttMaxEstimate |
||||
if qosTuningPeers <= len(rtts) { |
||||
median = time.Duration(rtts[qosTuningPeers/2]) // Median of our best few peers
|
||||
} else if len(rtts) > 0 { |
||||
median = time.Duration(rtts[len(rtts)/2]) // Median of all out connected peers
|
||||
} |
||||
// Restrict the RTT into some QoS defaults, irrelevant of true RTT
|
||||
if median < rttMinEstimate { |
||||
median = rttMinEstimate |
||||
} |
||||
if median > rttMaxEstimate { |
||||
median = rttMaxEstimate |
||||
} |
||||
return median |
||||
} |
||||
|
||||
// MeanCapacities returns the capacities averaged across all the added trackers.
|
||||
// The purpos of the mean capacities are to initialize a new peer with some sane
|
||||
// starting values that it will hopefully outperform. If the mean overshoots, the
|
||||
// peer will be cut back to minimal capacity and given another chance.
|
||||
func (t *Trackers) MeanCapacities() map[uint64]float64 { |
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
return t.meanCapacities() |
||||
} |
||||
|
||||
// meanCapacities is the internal lockless version of MeanCapacities used for
|
||||
// debug logging.
|
||||
func (t *Trackers) meanCapacities() map[uint64]float64 { |
||||
capacities := make(map[uint64]float64) |
||||
for _, tt := range t.trackers { |
||||
tt.lock.RLock() |
||||
for key, val := range tt.capacity { |
||||
capacities[key] += val |
||||
} |
||||
tt.lock.RUnlock() |
||||
} |
||||
for key, val := range capacities { |
||||
capacities[key] = val / float64(len(t.trackers)) |
||||
} |
||||
return capacities |
||||
} |
||||
|
||||
// TargetRoundTrip returns the current target round trip time for a request to
|
||||
// complete in.The returned RTT is slightly under the estimated RTT. The reason
|
||||
// is that message rate estimation is a 2 dimensional problem which is solvable
|
||||
// for any RTT. The goal is to gravitate towards smaller RTTs instead of large
|
||||
// messages, to result in a stabler download stream.
|
||||
func (t *Trackers) TargetRoundTrip() time.Duration { |
||||
// Recalculate the internal caches if it's been a while
|
||||
t.tune() |
||||
|
||||
// Caches surely recent, return target roundtrip
|
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
return time.Duration(float64(t.roundtrip) * rttPushdownFactor) |
||||
} |
||||
|
||||
// TargetTimeout returns the timeout allowance for a single request to finish
|
||||
// under. The timeout is proportional to the roundtrip, but also takes into
|
||||
// consideration the tracker's confidence in said roundtrip and scales it
|
||||
// accordingly. The final value is capped to avoid runaway requests.
|
||||
func (t *Trackers) TargetTimeout() time.Duration { |
||||
// Recalculate the internal caches if it's been a while
|
||||
t.tune() |
||||
|
||||
// Caches surely recent, return target timeout
|
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
return t.targetTimeout() |
||||
} |
||||
|
||||
// targetTimeout is the internal lockless version of TargetTimeout to be used
|
||||
// during QoS tuning.
|
||||
func (t *Trackers) targetTimeout() time.Duration { |
||||
timeout := time.Duration(ttlScaling * float64(t.roundtrip) / t.confidence) |
||||
if timeout > t.OverrideTTLLimit { |
||||
timeout = t.OverrideTTLLimit |
||||
} |
||||
return timeout |
||||
} |
||||
|
||||
// tune gathers the individual tracker statistics and updates the estimated
|
||||
// request round trip time.
|
||||
func (t *Trackers) tune() { |
||||
// Tune may be called concurrently all over the place, but we only want to
|
||||
// periodically update and even then only once. First check if it was updated
|
||||
// recently and abort if so.
|
||||
t.lock.RLock() |
||||
dirty := time.Since(t.tuned) > t.roundtrip |
||||
t.lock.RUnlock() |
||||
if !dirty { |
||||
return |
||||
} |
||||
// If an update is needed, obtain a write lock but make sure we don't update
|
||||
// it on all concurrent threads one by one.
|
||||
t.lock.Lock() |
||||
defer t.lock.Unlock() |
||||
|
||||
if dirty := time.Since(t.tuned) > t.roundtrip; !dirty { |
||||
return // A concurrent request beat us to the tuning
|
||||
} |
||||
// First thread reaching the tuning point, update the estimates and return
|
||||
t.roundtrip = time.Duration((1-tuningImpact)*float64(t.roundtrip) + tuningImpact*float64(t.medianRoundTrip())) |
||||
t.confidence = t.confidence + (1-t.confidence)/2 |
||||
|
||||
t.tuned = time.Now() |
||||
t.log.Debug("Recalculated msgrate QoS values", "rtt", t.roundtrip, "confidence", t.confidence, "ttl", t.targetTimeout(), "next", t.tuned.Add(t.roundtrip)) |
||||
t.log.Trace("Debug dump of mean capacities", "caps", log.Lazy{Fn: t.meanCapacities}) |
||||
} |
||||
|
||||
// detune reduces the tracker's confidence in order to make fresh measurements
|
||||
// have a larger impact on the estimates. It is meant to be used during new peer
|
||||
// connections so they can have a proper impact on the estimates.
|
||||
func (t *Trackers) detune() { |
||||
// If we have a single peer, confidence is always 1
|
||||
if len(t.trackers) == 1 { |
||||
t.confidence = 1 |
||||
return |
||||
} |
||||
// If we have a ton of peers, don't drop the confidence since there's enough
|
||||
// remaining to retain the same throughput
|
||||
if len(t.trackers) >= tuningConfidenceCap { |
||||
return |
||||
} |
||||
// Otherwise drop the confidence factor
|
||||
peers := float64(len(t.trackers)) |
||||
|
||||
t.confidence = t.confidence * (peers - 1) / peers |
||||
if t.confidence < rttMinConfidence { |
||||
t.confidence = rttMinConfidence |
||||
} |
||||
t.log.Debug("Relaxed msgrate QoS values", "rtt", t.roundtrip, "confidence", t.confidence, "ttl", t.targetTimeout()) |
||||
} |
||||
|
||||
// Capacity is a helper function to access a specific tracker without having to
|
||||
// track it explicitly outside.
|
||||
func (t *Trackers) Capacity(id string, kind uint64, targetRTT time.Duration) float64 { |
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
tracker := t.trackers[id] |
||||
if tracker == nil { |
||||
return 1 // Unregister race, don't return 0, it's a dangerous number
|
||||
} |
||||
return tracker.Capacity(kind, targetRTT) |
||||
} |
||||
|
||||
// Update is a helper function to access a specific tracker without having to
|
||||
// track it explicitly outside.
|
||||
func (t *Trackers) Update(id string, kind uint64, elapsed time.Duration, items int) { |
||||
t.lock.RLock() |
||||
defer t.lock.RUnlock() |
||||
|
||||
if tracker := t.trackers[id]; tracker != nil { |
||||
tracker.Update(kind, elapsed, items) |
||||
} |
||||
} |
Loading…
Reference in new issue