This commit is contained in:
mr
2026-03-11 19:29:39 +01:00
parent d0af40f4c7
commit 780a0c530d
13 changed files with 51 additions and 997 deletions

View File

@@ -37,43 +37,6 @@ type Score struct {
// Peer witnesses
witnessChecked int
witnessConsistent int
// WitnessPool: up to 3 witnesses last reported by this indexer.
// Used for indirect probing when the indexer becomes unreachable.
// Oldest entry is replaced when the pool is full and a fresher witness arrives.
WitnessPool []WitnessCacheEntry
}
// WitnessCacheEntry holds one witness AddrInfo with its last-seen timestamp.
const maxWitnessPool = 3
type WitnessCacheEntry struct {
AI pp.AddrInfo
SeenAt time.Time
}
// UpdateWitnessPool inserts or refreshes a witness entry.
// If the pool is full and the witness is new, the oldest entry is replaced.
func (s *Score) UpdateWitnessPool(w pp.AddrInfo) {
for i, e := range s.WitnessPool {
if e.AI.ID == w.ID {
s.WitnessPool[i].AI = w
s.WitnessPool[i].SeenAt = time.Now()
return
}
}
entry := WitnessCacheEntry{AI: w, SeenAt: time.Now()}
if len(s.WitnessPool) < maxWitnessPool {
s.WitnessPool = append(s.WitnessPool, entry)
return
}
// Replace oldest.
oldest := 0
for i, e := range s.WitnessPool {
if e.SeenAt.Before(s.WitnessPool[oldest].SeenAt) {
oldest = i
}
}
s.WitnessPool[oldest] = entry
}
// computeNodeSideScore computes the node's quality assessment of an indexer from raw metrics.

View File

@@ -172,46 +172,6 @@ func HandleWitnessQuery(h host.Host, s network.Stream) {
json.NewEncoder(s).Encode(report)
}
// IndirectProbeIndexer asks each witness in the cache whether it still sees
// the given indexer (by PeerID). Returns true if at least one witness confirms
// it is alive — meaning our direct link is asymmetrically broken, not the indexer.
// All probes run in parallel; the function blocks at most 5 seconds.
func IndirectProbeIndexer(h host.Host, indexerPeerID string, pool []WitnessCacheEntry) bool {
if len(pool) == 0 {
return false
}
results := make(chan bool, len(pool))
for _, e := range pool {
go func(ai pp.AddrInfo) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
s, err := h.NewStream(ctx, ai.ID, ProtocolWitnessQuery)
if err != nil {
results <- false
return
}
defer s.Reset()
s.SetDeadline(time.Now().Add(5 * time.Second))
if err := json.NewEncoder(s).Encode(WitnessRequest{IndexerPeerID: indexerPeerID}); err != nil {
results <- false
return
}
var rep WitnessReport
if err := json.NewDecoder(s).Decode(&rep); err != nil {
results <- false
return
}
results <- rep.Seen
}(e.AI)
}
for range pool {
if <-results {
return true
}
}
return false
}
// SupportsHeartbeat probes pid with a short-lived stream to verify it has
// a ProtocolHeartbeat handler (i.e. it is an indexer, not a plain node).
// Only protocol negotiation is performed — no data is sent.

View File

@@ -368,7 +368,6 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
resp, rtt, err := sendHeartbeat(ctx, h, proto, ai.Info, hb, directory.Streams, interval*time.Second)
if err != nil { // Heartbeat fails
fmt.Println("EERR", err)
HeartbeatFailure(h, proto, directory, ai.Addr, ai.Info, isIndexerHB, maxPool, err)
continue
}
@@ -377,6 +376,7 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
// even if the indexer does not support bidirectional heartbeat (Fix 1).
if isIndexerHB && score != nil {
score.UptimeTracker.RecordHeartbeat()
score.UptimeTracker.ConsecutiveFails = 0 // reset on success
maxRTT := BaseRoundTrip * 10
latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
@@ -442,11 +442,6 @@ func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.H
}
}
// Refresh local witness cache for indirect probing on future failure.
for _, w := range resp.Witnesses {
score.UpdateWitnessPool(w)
}
// Launch witness cross-check asynchronously (must not hold lock).
if len(resp.Witnesses) > 0 {
go queryWitnesses(h, ai.Info.ID.String(), resp.BornAt, resp.FillRate, resp.Witnesses, score)
@@ -550,16 +545,22 @@ func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error())
return
}
// Indirect probe: query cached witnesses before declaring the indexer dead.
// If a witness confirms it is alive, the failure is a local asymmetric
// link — not the indexer. Skip eviction; next tick will retry directly.
if len(score.WitnessPool) > 0 {
pool := append([]WitnessCacheEntry(nil), score.WitnessPool...)
if IndirectProbeIndexer(h, info.ID.String(), pool) {
// Indirect probing via other alive indexers:
// If other indexers in the pool are still responding, they act as implicit
// third-party witnesses confirming our connectivity is fine — the failed
// indexer is genuinely dead, evict immediately.
// If this is the last indexer, there is no third party. Retry up to 3 times
// (consecutive failures tracked in UptimeTracker) before declaring it dead.
if len(directory.GetAddrs()) <= 1 {
score.UptimeTracker.ConsecutiveFails++
if score.UptimeTracker.ConsecutiveFails < 3 {
logger.Warn().Str("peer", info.ID.String()).
Msg("[indirect] witness confirms indexer alive — asymmetric link, skipping eviction " + err.Error())
Int("attempt", score.UptimeTracker.ConsecutiveFails).
Msg("[indirect] last indexer failed, retrying before eviction")
return
}
logger.Warn().Str("peer", info.ID.String()).
Msg("[indirect] last indexer failed 3 times consecutively, evicting")
}
}
}

View File

@@ -18,9 +18,10 @@ const MaxPayloadChallenge = 2048
const BaseRoundTrip = 400 * time.Millisecond
type UptimeTracker struct {
FirstSeen time.Time
LastSeen time.Time
TotalOnline time.Duration
FirstSeen time.Time
LastSeen time.Time
TotalOnline time.Duration
ConsecutiveFails int // incremented on each heartbeat failure; reset to 0 on success
}
// RecordHeartbeat accumulates online time gap-aware: only counts the interval if

View File

@@ -12,6 +12,6 @@ type HeartBeatStreamed interface {
}
type DiscoveryPeer interface {
GetPeerRecord(ctx context.Context, key string, search bool) ([]*peer.Peer, error)
GetPeerRecord(ctx context.Context, key string) ([]*peer.Peer, error)
GetPubSub(topicName string) *pubsub.Topic
}