Change
This commit is contained in:
@@ -37,43 +37,6 @@ type Score struct {
|
||||
// Peer witnesses
|
||||
witnessChecked int
|
||||
witnessConsistent int
|
||||
// WitnessPool: up to 3 witnesses last reported by this indexer.
|
||||
// Used for indirect probing when the indexer becomes unreachable.
|
||||
// Oldest entry is replaced when the pool is full and a fresher witness arrives.
|
||||
WitnessPool []WitnessCacheEntry
|
||||
}
|
||||
|
||||
// WitnessCacheEntry holds one witness AddrInfo with its last-seen timestamp.
|
||||
const maxWitnessPool = 3
|
||||
|
||||
type WitnessCacheEntry struct {
|
||||
AI pp.AddrInfo
|
||||
SeenAt time.Time
|
||||
}
|
||||
|
||||
// UpdateWitnessPool inserts or refreshes a witness entry.
|
||||
// If the pool is full and the witness is new, the oldest entry is replaced.
|
||||
func (s *Score) UpdateWitnessPool(w pp.AddrInfo) {
|
||||
for i, e := range s.WitnessPool {
|
||||
if e.AI.ID == w.ID {
|
||||
s.WitnessPool[i].AI = w
|
||||
s.WitnessPool[i].SeenAt = time.Now()
|
||||
return
|
||||
}
|
||||
}
|
||||
entry := WitnessCacheEntry{AI: w, SeenAt: time.Now()}
|
||||
if len(s.WitnessPool) < maxWitnessPool {
|
||||
s.WitnessPool = append(s.WitnessPool, entry)
|
||||
return
|
||||
}
|
||||
// Replace oldest.
|
||||
oldest := 0
|
||||
for i, e := range s.WitnessPool {
|
||||
if e.SeenAt.Before(s.WitnessPool[oldest].SeenAt) {
|
||||
oldest = i
|
||||
}
|
||||
}
|
||||
s.WitnessPool[oldest] = entry
|
||||
}
|
||||
|
||||
// computeNodeSideScore computes the node's quality assessment of an indexer from raw metrics.
|
||||
|
||||
@@ -172,46 +172,6 @@ func HandleWitnessQuery(h host.Host, s network.Stream) {
|
||||
json.NewEncoder(s).Encode(report)
|
||||
}
|
||||
|
||||
// IndirectProbeIndexer asks each witness in the cache whether it still sees
|
||||
// the given indexer (by PeerID). Returns true if at least one witness confirms
|
||||
// it is alive — meaning our direct link is asymmetrically broken, not the indexer.
|
||||
// All probes run in parallel; the function blocks at most 5 seconds.
|
||||
func IndirectProbeIndexer(h host.Host, indexerPeerID string, pool []WitnessCacheEntry) bool {
|
||||
if len(pool) == 0 {
|
||||
return false
|
||||
}
|
||||
results := make(chan bool, len(pool))
|
||||
for _, e := range pool {
|
||||
go func(ai pp.AddrInfo) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
s, err := h.NewStream(ctx, ai.ID, ProtocolWitnessQuery)
|
||||
if err != nil {
|
||||
results <- false
|
||||
return
|
||||
}
|
||||
defer s.Reset()
|
||||
s.SetDeadline(time.Now().Add(5 * time.Second))
|
||||
if err := json.NewEncoder(s).Encode(WitnessRequest{IndexerPeerID: indexerPeerID}); err != nil {
|
||||
results <- false
|
||||
return
|
||||
}
|
||||
var rep WitnessReport
|
||||
if err := json.NewDecoder(s).Decode(&rep); err != nil {
|
||||
results <- false
|
||||
return
|
||||
}
|
||||
results <- rep.Seen
|
||||
}(e.AI)
|
||||
}
|
||||
for range pool {
|
||||
if <-results {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// SupportsHeartbeat probes pid with a short-lived stream to verify it has
|
||||
// a ProtocolHeartbeat handler (i.e. it is an indexer, not a plain node).
|
||||
// Only protocol negotiation is performed — no data is sent.
|
||||
|
||||
@@ -368,7 +368,6 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
|
||||
resp, rtt, err := sendHeartbeat(ctx, h, proto, ai.Info, hb, directory.Streams, interval*time.Second)
|
||||
if err != nil { // Heartbeat fails
|
||||
fmt.Println("EERR", err)
|
||||
HeartbeatFailure(h, proto, directory, ai.Addr, ai.Info, isIndexerHB, maxPool, err)
|
||||
continue
|
||||
}
|
||||
@@ -377,6 +376,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
// even if the indexer does not support bidirectional heartbeat (Fix 1).
|
||||
if isIndexerHB && score != nil {
|
||||
score.UptimeTracker.RecordHeartbeat()
|
||||
score.UptimeTracker.ConsecutiveFails = 0 // reset on success
|
||||
|
||||
maxRTT := BaseRoundTrip * 10
|
||||
latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
|
||||
@@ -442,11 +442,6 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
|
||||
}
|
||||
}
|
||||
|
||||
// Refresh local witness cache for indirect probing on future failure.
|
||||
for _, w := range resp.Witnesses {
|
||||
score.UpdateWitnessPool(w)
|
||||
}
|
||||
|
||||
// Launch witness cross-check asynchronously (must not hold lock).
|
||||
if len(resp.Witnesses) > 0 {
|
||||
go queryWitnesses(h, ai.Info.ID.String(), resp.BornAt, resp.FillRate, resp.Witnesses, score)
|
||||
@@ -550,16 +545,22 @@ func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
|
||||
Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error())
|
||||
return
|
||||
}
|
||||
// Indirect probe: query cached witnesses before declaring the indexer dead.
|
||||
// If a witness confirms it is alive, the failure is a local asymmetric
|
||||
// link — not the indexer. Skip eviction; next tick will retry directly.
|
||||
if len(score.WitnessPool) > 0 {
|
||||
pool := append([]WitnessCacheEntry(nil), score.WitnessPool...)
|
||||
if IndirectProbeIndexer(h, info.ID.String(), pool) {
|
||||
// Indirect probing via other alive indexers:
|
||||
// If other indexers in the pool are still responding, they act as implicit
|
||||
// third-party witnesses confirming our connectivity is fine — the failed
|
||||
// indexer is genuinely dead, evict immediately.
|
||||
// If this is the last indexer, there is no third party. Retry up to 3 times
|
||||
// (consecutive failures tracked in UptimeTracker) before declaring it dead.
|
||||
if len(directory.GetAddrs()) <= 1 {
|
||||
score.UptimeTracker.ConsecutiveFails++
|
||||
if score.UptimeTracker.ConsecutiveFails < 3 {
|
||||
logger.Warn().Str("peer", info.ID.String()).
|
||||
Msg("[indirect] witness confirms indexer alive — asymmetric link, skipping eviction " + err.Error())
|
||||
Int("attempt", score.UptimeTracker.ConsecutiveFails).
|
||||
Msg("[indirect] last indexer failed, retrying before eviction")
|
||||
return
|
||||
}
|
||||
logger.Warn().Str("peer", info.ID.String()).
|
||||
Msg("[indirect] last indexer failed 3 times consecutively, evicting")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,9 +18,10 @@ const MaxPayloadChallenge = 2048
|
||||
const BaseRoundTrip = 400 * time.Millisecond
|
||||
|
||||
// UptimeTracker keeps per-peer uptime bookkeeping: first/last seen timestamps,
// accumulated online time and the current run of heartbeat failures.
type UptimeTracker struct {
	FirstSeen        time.Time
	LastSeen         time.Time
	TotalOnline      time.Duration
	ConsecutiveFails int // incremented on each heartbeat failure; reset to 0 on success
}
|
||||
|
||||
// RecordHeartbeat accumulates online time gap-aware: only counts the interval if
|
||||
|
||||
@@ -12,6 +12,6 @@ type HeartBeatStreamed interface {
|
||||
}
|
||||
|
||||
type DiscoveryPeer interface {
|
||||
GetPeerRecord(ctx context.Context, key string, search bool) ([]*peer.Peer, error)
|
||||
GetPeerRecord(ctx context.Context, key string) ([]*peer.Peer, error)
|
||||
GetPubSub(topicName string) *pubsub.Topic
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user