2026-03-11 16:28:15 +01:00
|
|
|
package common
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"encoding/json"
|
|
|
|
|
"fmt"
|
|
|
|
|
"math/rand"
|
|
|
|
|
"strings"
|
|
|
|
|
"sync/atomic"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"oc-discovery/conf"
|
|
|
|
|
|
|
|
|
|
oclib "cloud.o-forge.io/core/oc-lib"
|
|
|
|
|
|
|
|
|
|
"github.com/libp2p/go-libp2p/core/host"
|
|
|
|
|
"github.com/libp2p/go-libp2p/core/network"
|
|
|
|
|
pp "github.com/libp2p/go-libp2p/core/peer"
|
|
|
|
|
"github.com/libp2p/go-libp2p/core/protocol"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// TimeWatcher records the UTC instant at which ConnectToIndexers was last
// invoked (set at the top of that function); serves as the bootstrap
// reference time for this node.
var TimeWatcher time.Time

// retryRunning guards against launching multiple retryUntilSeedResponds goroutines.
// Set via CompareAndSwap on entry and cleared by defer on exit of the retry loop.
var retryRunning atomic.Bool
|
|
|
|
|
|
|
|
|
|
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
|
|
|
|
|
TimeWatcher = time.Now().UTC()
|
|
|
|
|
logger := oclib.GetLogger()
|
|
|
|
|
|
|
|
|
|
// Bootstrap from IndexerAddresses seed set.
|
|
|
|
|
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
|
|
|
|
|
if len(addresses) > maxIndexer {
|
|
|
|
|
addresses = addresses[0:maxIndexer]
|
|
|
|
|
}
|
|
|
|
|
for _, indexerAddr := range addresses {
|
|
|
|
|
indexerAddr = strings.TrimSpace(indexerAddr)
|
|
|
|
|
if indexerAddr == "" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
ad, err := pp.AddrInfoFromString(indexerAddr)
|
|
|
|
|
if err != nil {
|
|
|
|
|
logger.Err(err)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
key := ad.ID.String()
|
|
|
|
|
Indexers.SetAddr(key, ad)
|
|
|
|
|
// Pre-create score entry with IsSeed=true so the sticky flag is set before
|
|
|
|
|
// the first heartbeat tick (lazy creation in doTick would lose the flag).
|
|
|
|
|
if !Indexers.ExistsScore(key) {
|
|
|
|
|
Indexers.SetScore(key, &Score{
|
|
|
|
|
FirstContacted: time.Now().UTC(),
|
|
|
|
|
UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()},
|
|
|
|
|
nextChallenge: rand.Intn(10) + 1,
|
|
|
|
|
IsSeed: true,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
seeds := Indexers.GetAddrs()
|
|
|
|
|
indexerCount := len(seeds)
|
|
|
|
|
|
|
|
|
|
if indexerCount < minIndexer {
|
|
|
|
|
return fmt.Errorf("you run a node without indexers... your gonna be isolated.")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Start long-lived heartbeat to seed indexers. The single goroutine follows
|
|
|
|
|
// all subsequent StaticIndexers changes.
|
|
|
|
|
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
|
|
|
|
|
h, Indexers, 20*time.Second, maxIndexer, recordFn...)
|
|
|
|
|
|
|
|
|
|
// Watch for inbound connections: if a peer connects to us and our pool has
|
|
|
|
|
// room, probe it first to confirm it supports ProtocolHeartbeat (i.e. it is
|
|
|
|
|
// an indexer). Plain nodes don't register the handler — the negotiation fails
|
|
|
|
|
// instantly so we never pollute the pool with non-indexer peers.
|
|
|
|
|
h.Network().Notify(&network.NotifyBundle{
|
|
|
|
|
ConnectedF: func(n network.Network, c network.Conn) {
|
|
|
|
|
if c.Stat().Direction != network.DirInbound {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if len(Indexers.GetAddrs()) >= maxIndexer {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
peerID := c.RemotePeer()
|
|
|
|
|
if Indexers.ExistsAddr(peerID.String()) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
// Probe in a goroutine — ConnectedF must not block.
|
|
|
|
|
go func(pid pp.ID) {
|
|
|
|
|
if !SupportsHeartbeat(h, pid) {
|
|
|
|
|
return // plain node, skip
|
|
|
|
|
}
|
|
|
|
|
if len(Indexers.GetAddrs()) >= maxIndexer {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
if Indexers.ExistsAddr(pid.String()) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
addrs := h.Peerstore().Addrs(pid)
|
|
|
|
|
if len(addrs) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
ai := FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
|
|
|
|
|
if len(ai.Addrs) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
adCopy := ai
|
|
|
|
|
Indexers.SetAddr(pid.String(), &adCopy)
|
|
|
|
|
Indexers.NudgeIt()
|
|
|
|
|
log := oclib.GetLogger()
|
|
|
|
|
log.Info().Str("peer", pid.String()).
|
|
|
|
|
Msg("[pool] inbound indexer peer added as candidate")
|
|
|
|
|
}(peerID)
|
|
|
|
|
},
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// Proactive DHT upgrade: once seeds are connected and the DHT routing table
|
|
|
|
|
// is warm, discover better indexers and add them to the pool alongside the seeds.
|
|
|
|
|
// Seeds stay as guaranteed anchors; scoring will demote poor performers over time.
|
|
|
|
|
go func(seeds []Entry) {
|
|
|
|
|
// Let seed connections establish and the DHT routing table warm up.
|
|
|
|
|
time.Sleep(5 * time.Second)
|
|
|
|
|
// For pure nodes (no IndexerService), spin up a lightweight DHT client.
|
|
|
|
|
if discoveryDHT == nil {
|
|
|
|
|
if len(seeds) == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
initNodeDHT(h, seeds)
|
|
|
|
|
}
|
|
|
|
|
if discoveryDHT == nil {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
current := len(Indexers.GetAddrs())
|
|
|
|
|
need := maxIndexer - current
|
|
|
|
|
if need <= 0 {
|
|
|
|
|
need = maxIndexer / 2 // diversify even when pool is already at capacity
|
|
|
|
|
}
|
|
|
|
|
logger.Info().Int("need", need).Msg("[dht] proactive indexer discovery from DHT")
|
|
|
|
|
replenishIndexersFromDHT(h, need)
|
|
|
|
|
}(seeds)
|
|
|
|
|
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// reconnectToSeeds re-adds the configured seed indexers to StaticIndexers as
|
|
|
|
|
// sticky fallback entries. Called when the pool drops to zero so the node
|
|
|
|
|
// never becomes completely isolated.
|
|
|
|
|
func reconnectToSeeds() {
|
|
|
|
|
logger := oclib.GetLogger()
|
|
|
|
|
logger.Warn().Msg("[pool] all indexers lost, reconnecting to configured seeds")
|
|
|
|
|
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
|
|
|
|
|
for _, addrStr := range addresses {
|
|
|
|
|
addrStr = strings.TrimSpace(addrStr)
|
|
|
|
|
if addrStr == "" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
ad, err := pp.AddrInfoFromString(addrStr)
|
|
|
|
|
if err != nil {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
key := ad.ID.String()
|
|
|
|
|
Indexers.SetAddr(key, ad)
|
|
|
|
|
if score := Indexers.GetScore(key); score == nil {
|
|
|
|
|
Indexers.SetScore(key, &Score{
|
|
|
|
|
FirstContacted: time.Now().UTC(),
|
|
|
|
|
UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()},
|
|
|
|
|
nextChallenge: rand.Intn(10) + 1,
|
|
|
|
|
IsSeed: true,
|
|
|
|
|
})
|
|
|
|
|
} else {
|
|
|
|
|
// Restore sticky flag so the seed is not immediately re-ejected.
|
|
|
|
|
score.IsSeed = true
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// retryUntilSeedResponds loops with exponential backoff until at least one
// configured seed is reachable again. Once seeds are back in the pool it
// nudges the heartbeat loop and lets the normal DHT upgrade path take over.
// Should be called in a goroutine — it blocks until the situation resolves.
// If no seeds are configured it does NOT panic: it logs a warning and keeps
// looping, relying on the inbound-connection notifee to refill the pool.
// At most one instance runs at a time (guarded by retryRunning).
func retryUntilSeedResponds() {
	if !retryRunning.CompareAndSwap(false, true) {
		return // another goroutine is already running the retry loop
	}
	defer retryRunning.Store(false)

	logger := oclib.GetLogger()
	rawAddresses := strings.TrimSpace(conf.GetConfig().IndexerAddresses)
	if rawAddresses == "" {
		// No seeds configured: rely on the inbound-connection notifee to fill
		// the pool. Just wait patiently — the loop below will return as soon
		// as any peer connects and NudgeIt() is called.
		logger.Warn().Msg("[pool] pool empty and no seeds configured — waiting for inbound indexer")
	}
	// Backoff starts at 10s and doubles each round, capped at 5 minutes.
	backoff := 10 * time.Second
	const maxBackoff = 5 * time.Minute
	for {
		time.Sleep(backoff)
		// NOTE(review): backoff is doubled before the log below, so the
		// "backoff" field shows the NEXT sleep, not the one just taken.
		if backoff < maxBackoff {
			backoff *= 2
		}
		// Check whether someone else already refilled the pool.
		if len(Indexers.GetAddrs()) > 0 {
			logger.Info().Msg("[pool] pool refilled externally, stopping seed retry")
			return
		}
		logger.Warn().Dur("backoff", backoff).Msg("[pool] still isolated, retrying seeds")
		reconnectToSeeds()
		if len(Indexers.GetAddrs()) > 0 {
			Indexers.NudgeIt()
			// Re-bootstrap DHT now that we have at least one connection candidate.
			if discoveryDHT != nil {
				ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
				discoveryDHT.Bootstrap(ctx) //nolint:errcheck
				cancel()
			}
			return
		}
	}
}
|
|
|
|
|
|
|
|
|
|
// ensureScore returns the Score for addr, creating it if absent.
|
|
|
|
|
func ensureScore(d *Directory, addr string) *Score {
|
|
|
|
|
if !d.ExistsScore(addr) {
|
|
|
|
|
d.SetScore(addr, &Score{
|
|
|
|
|
FirstContacted: time.Now().UTC(),
|
|
|
|
|
UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()},
|
|
|
|
|
nextChallenge: rand.Intn(10) + 1,
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
return d.GetScore(addr)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// evictPeer removes addr from directory atomically and returns a snapshot of
|
|
|
|
|
// remaining AddrInfos (for consensus voter selection).
|
|
|
|
|
func evictPeer(d *Directory, addr string, id pp.ID, proto protocol.ID) []pp.AddrInfo {
|
|
|
|
|
d.Streams.Delete(proto, &id)
|
|
|
|
|
d.DeleteAddr(addr)
|
|
|
|
|
voters := make([]pp.AddrInfo, 0, len(d.Addrs))
|
|
|
|
|
for _, ai := range d.GetAddrs() {
|
|
|
|
|
if ai.Info != nil {
|
|
|
|
|
voters = append(voters, *ai.Info)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
d.DeleteScore(addr)
|
|
|
|
|
return voters
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// handleSuggestions adds unknown suggested indexers to the directory.
|
|
|
|
|
func handleSuggestions(d *Directory, from string, suggestions []pp.AddrInfo) {
|
|
|
|
|
added := 0
|
|
|
|
|
for _, sug := range suggestions {
|
|
|
|
|
key := addrKey(sug)
|
|
|
|
|
if !d.ExistsAddr(key) {
|
|
|
|
|
cpy := sug
|
|
|
|
|
d.SetAddr(key, &cpy)
|
|
|
|
|
added++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if added > 0 {
|
|
|
|
|
logger := oclib.GetLogger()
|
|
|
|
|
logger.Info().Int("added", added).Str("from", from).
|
|
|
|
|
Msg("added suggested indexers from heartbeat response")
|
|
|
|
|
d.NudgeIt()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
|
|
|
|
|
// recordFn, when provided, is called on each tick and its output is embedded in
|
|
|
|
|
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
|
|
|
|
|
// republish it to the DHT without an extra round-trip.
|
|
|
|
|
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
|
|
|
|
|
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, recordFn ...func() json.RawMessage) {
|
|
|
|
|
logger := oclib.GetLogger()
|
|
|
|
|
isIndexerHB := directory == Indexers
|
|
|
|
|
var recFn func() json.RawMessage
|
|
|
|
|
if len(recordFn) > 0 {
|
|
|
|
|
recFn = recordFn[0]
|
|
|
|
|
}
|
|
|
|
|
go func() {
|
|
|
|
|
logger.Info().Str("proto", string(proto)).Int("peers", len(directory.Addrs)).Msg("heartbeat started")
|
|
|
|
|
t := time.NewTicker(interval)
|
|
|
|
|
defer t.Stop()
|
|
|
|
|
|
|
|
|
|
// peerEntry pairs addr key with AddrInfo so doTick can update score maps directly.
|
|
|
|
|
type peerEntry struct {
|
|
|
|
|
addr string
|
|
|
|
|
ai *pp.AddrInfo
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
doTick := func() {
|
|
|
|
|
addrs := directory.GetAddrsStr()
|
|
|
|
|
need := maxPool - len(addrs)
|
|
|
|
|
if need < 0 {
|
|
|
|
|
need = 0
|
|
|
|
|
}
|
|
|
|
|
baseHB := Heartbeat{
|
|
|
|
|
Name: name,
|
|
|
|
|
PeerID: h.ID().String(),
|
|
|
|
|
Timestamp: time.Now().UTC().Unix(),
|
|
|
|
|
IndexersBinded: addrs,
|
|
|
|
|
Need: need,
|
|
|
|
|
}
|
|
|
|
|
if recFn != nil {
|
|
|
|
|
baseHB.Record = recFn()
|
|
|
|
|
}
|
|
|
|
|
// Determine the referent indexer: highest-scored one receives Referent=true
|
|
|
|
|
// so it stores us in its referencedNodes for distributed search.
|
|
|
|
|
var referentAddr string
|
|
|
|
|
if isIndexerHB {
|
|
|
|
|
var bestScore float64 = -1
|
|
|
|
|
for _, ai2 := range directory.GetAddrs() {
|
|
|
|
|
if s := directory.GetScore(ai2.Addr); s != nil && s.Score > bestScore {
|
|
|
|
|
bestScore = s.Score
|
|
|
|
|
referentAddr = ai2.Addr
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for _, ai := range directory.GetAddrs() {
|
|
|
|
|
// Build per-peer heartbeat copy so challenge injection is peer-specific.
|
|
|
|
|
hb := baseHB
|
|
|
|
|
if isIndexerHB && referentAddr != "" && ai.Addr == referentAddr {
|
|
|
|
|
hb.Referent = true
|
|
|
|
|
}
|
|
|
|
|
// Ensure an IndexerScore entry exists for this peer.
|
|
|
|
|
var score *Score
|
|
|
|
|
if isIndexerHB {
|
|
|
|
|
score = ensureScore(directory, ai.Addr)
|
|
|
|
|
|
|
|
|
|
// Inject challenge batch if due (random 1-10 HBs between batches).
|
|
|
|
|
score.hbCount++
|
|
|
|
|
if score.hbCount >= score.nextChallenge {
|
|
|
|
|
// Ground truth: node's own PeerID — indexer MUST have us.
|
|
|
|
|
challenges := []string{h.ID().String()}
|
|
|
|
|
// Add up to 2 more known peers (other indexers) for richer data.
|
|
|
|
|
// Use the already-snapshotted entries to avoid re-locking.
|
|
|
|
|
for _, ai2 := range directory.GetAddrs() {
|
|
|
|
|
if ai2.Addr != ai.Addr && ai2.Info != nil {
|
|
|
|
|
challenges = append(challenges, ai2.Info.ID.String())
|
|
|
|
|
if len(challenges) >= 3 {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
hb.Challenges = challenges
|
|
|
|
|
score.hbCount = 0
|
|
|
|
|
score.nextChallenge = rand.Intn(10) + 1
|
|
|
|
|
score.challengeTotal++ // count own-PeerID challenge (ground truth)
|
|
|
|
|
score.dhtBatchCounter++
|
|
|
|
|
// DHT challenge every 5th batch: ask indexer to retrieve our own DID.
|
|
|
|
|
if score.dhtBatchCounter%5 == 0 {
|
|
|
|
|
var selfDID string
|
|
|
|
|
if len(baseHB.Record) > 0 {
|
|
|
|
|
var partial struct {
|
|
|
|
|
DID string `json:"did"`
|
|
|
|
|
}
|
|
|
|
|
if json.Unmarshal(baseHB.Record, &partial) == nil {
|
|
|
|
|
selfDID = partial.DID
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if selfDID != "" {
|
|
|
|
|
hb.ChallengeDID = selfDID
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
resp, rtt, err := sendHeartbeat(ctx, h, proto, ai.Info, hb, directory.Streams, interval*time.Second)
|
|
|
|
|
if err != nil { // Heartbeat fails
|
|
|
|
|
HeartbeatFailure(h, proto, directory, ai.Addr, ai.Info, isIndexerHB, maxPool, err)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update IndexerScore — uptime recorded on any successful send,
|
|
|
|
|
// even if the indexer does not support bidirectional heartbeat (Fix 1).
|
|
|
|
|
if isIndexerHB && score != nil {
|
|
|
|
|
score.UptimeTracker.RecordHeartbeat()
|
2026-03-11 19:29:39 +01:00
|
|
|
score.UptimeTracker.ConsecutiveFails = 0 // reset on success
|
2026-03-11 16:28:15 +01:00
|
|
|
|
|
|
|
|
maxRTT := BaseRoundTrip * 10
|
|
|
|
|
latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
|
|
|
|
|
if latencyScore < 0 {
|
|
|
|
|
latencyScore = 0
|
|
|
|
|
}
|
|
|
|
|
if latencyScore > 1 {
|
|
|
|
|
latencyScore = 1
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Update fill / challenge fields only when the indexer responded.
|
|
|
|
|
if resp != nil {
|
|
|
|
|
// BornAt stability check.
|
|
|
|
|
if score.LastBornAt.IsZero() {
|
|
|
|
|
score.LastBornAt = resp.BornAt
|
|
|
|
|
} else if !resp.BornAt.IsZero() && !resp.BornAt.Equal(score.LastBornAt) {
|
|
|
|
|
score.bornAtChanges++
|
|
|
|
|
score.LastBornAt = resp.BornAt
|
|
|
|
|
logger.Warn().Str("peer", ai.Info.ID.String()).
|
|
|
|
|
Int("changes", score.bornAtChanges).
|
|
|
|
|
Msg("indexer BornAt changed — possible restart or impersonation")
|
|
|
|
|
}
|
|
|
|
|
score.LastFillRate = resp.FillRate
|
|
|
|
|
|
|
|
|
|
// Fill rate consistency: cross-check peerCount/maxNodes vs reported fillRate.
|
|
|
|
|
if resp.MaxNodes > 0 {
|
|
|
|
|
expected := float64(resp.PeerCount) / float64(resp.MaxNodes)
|
|
|
|
|
diff := expected - resp.FillRate
|
|
|
|
|
if diff < 0 {
|
|
|
|
|
diff = -diff
|
|
|
|
|
}
|
|
|
|
|
score.fillChecked++
|
|
|
|
|
if diff < 0.1 {
|
|
|
|
|
score.fillConsistent++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Validate challenge responses. Only own-PeerID counts as ground truth.
|
|
|
|
|
if len(hb.Challenges) > 0 && len(resp.Challenges) > 0 {
|
|
|
|
|
ownID := h.ID().String()
|
|
|
|
|
for _, ce := range resp.Challenges {
|
|
|
|
|
if ce.PeerID != ownID {
|
|
|
|
|
continue // informational only
|
|
|
|
|
}
|
|
|
|
|
recentEnough := !ce.LastSeen.IsZero() &&
|
|
|
|
|
time.Since(ce.LastSeen) < 2*RecommendedHeartbeatInterval
|
|
|
|
|
if ce.Found && recentEnough {
|
|
|
|
|
score.challengeCorrect++
|
|
|
|
|
}
|
|
|
|
|
logger.Info().Str("peer", ai.Info.ID.String()).
|
|
|
|
|
Bool("found", ce.Found).
|
|
|
|
|
Bool("recent", recentEnough).
|
|
|
|
|
Msg("own-PeerID challenge result")
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// DHT challenge result.
|
|
|
|
|
if hb.ChallengeDID != "" {
|
|
|
|
|
score.dhtChecked++
|
|
|
|
|
if resp.DHTFound {
|
|
|
|
|
score.dhtSuccess++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Launch witness cross-check asynchronously (must not hold lock).
|
|
|
|
|
if len(resp.Witnesses) > 0 {
|
|
|
|
|
go queryWitnesses(h, ai.Info.ID.String(), resp.BornAt, resp.FillRate, resp.Witnesses, score)
|
|
|
|
|
} else if resp.MaxNodes > 0 {
|
|
|
|
|
// No witnesses offered. Valid if indexer only has us (PeerCount==1).
|
|
|
|
|
// Cross-check: FillRate should equal 1/MaxNodes within ±10%.
|
|
|
|
|
expected := 1.0 / float64(resp.MaxNodes)
|
|
|
|
|
diff := resp.FillRate - expected
|
|
|
|
|
if diff < 0 {
|
|
|
|
|
diff = -diff
|
|
|
|
|
}
|
|
|
|
|
score.witnessChecked++
|
|
|
|
|
if resp.PeerCount == 1 && diff < 0.1 {
|
|
|
|
|
score.witnessConsistent++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
score.Score = score.ComputeNodeSideScore(latencyScore)
|
|
|
|
|
age := score.UptimeTracker.Uptime()
|
|
|
|
|
minScore := dynamicMinScore(age)
|
|
|
|
|
// Fix 4: grace period — at least 2 full heartbeat cycles before ejecting.
|
|
|
|
|
isSeed := score.IsSeed
|
|
|
|
|
// Seeds are sticky: never evicted by score alone (SuggestMigrate handles it).
|
|
|
|
|
// Never eject the last indexer by score alone — we would lose all connectivity.
|
|
|
|
|
belowThreshold := score.Score < minScore &&
|
|
|
|
|
score.UptimeTracker.TotalOnline >= 2*RecommendedHeartbeatInterval &&
|
|
|
|
|
!isSeed &&
|
|
|
|
|
len(directory.Addrs) > 1
|
|
|
|
|
|
|
|
|
|
if belowThreshold {
|
|
|
|
|
logger.Info().Str("peer", ai.Info.ID.String()).
|
|
|
|
|
Float64("score", score.Score).Float64("min", minScore).
|
|
|
|
|
Msg("indexer score below threshold, removing from pool")
|
|
|
|
|
voters := evictPeer(directory, ai.Addr, ai.Info.ID, proto)
|
|
|
|
|
need := max(maxPool-len(voters), 1)
|
|
|
|
|
if len(voters) > 0 {
|
|
|
|
|
go TriggerConsensus(h, voters, need)
|
|
|
|
|
} else {
|
|
|
|
|
go replenishIndexersFromDHT(h, need)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Accept suggestions from this indexer — add unknown ones to the directory.
|
|
|
|
|
if resp != nil && len(resp.Suggestions) > 0 {
|
|
|
|
|
handleSuggestions(directory, ai.Info.ID.String(), resp.Suggestions)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Handle SuggestMigrate: indexer is overloaded and wants us to move.
|
|
|
|
|
if resp != nil && resp.SuggestMigrate && isIndexerHB {
|
|
|
|
|
nonSeedCount := 0
|
|
|
|
|
for _, sc := range directory.GetScores() {
|
|
|
|
|
if !sc.IsSeed {
|
|
|
|
|
nonSeedCount++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if nonSeedCount >= conf.GetConfig().MinIndexer {
|
|
|
|
|
if isSeed {
|
|
|
|
|
// Seed has offloaded us: clear sticky flag, score eviction takes over.
|
|
|
|
|
score.IsSeed = false
|
|
|
|
|
logger.Info().Str("peer", ai.Info.ID.String()).
|
|
|
|
|
Msg("seed discharged via SuggestMigrate, de-stickied")
|
|
|
|
|
} else {
|
|
|
|
|
evictPeer(directory, ai.Addr, ai.Info.ID, proto)
|
|
|
|
|
logger.Info().Str("peer", ai.Info.ID.String()).Msg("accepted migration from overloaded indexer")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
select {
|
|
|
|
|
case <-t.C:
|
|
|
|
|
doTick()
|
|
|
|
|
case <-directory.Nudge:
|
|
|
|
|
if isIndexerHB {
|
|
|
|
|
logger.Info().Msg("nudge received, heartbeating new indexers immediately")
|
|
|
|
|
doTick()
|
|
|
|
|
}
|
|
|
|
|
case <-ctx.Done():
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// HeartbeatFailure handles a failed heartbeat send to addr/info. Seeds are kept
// in the pool (the ticker retries them); the last remaining indexer gets three
// consecutive chances before eviction; everything else is evicted immediately,
// after which the pool is replenished via consensus, DHT, or the seed-retry loop.
func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
	addr string, info *pp.AddrInfo, isIndexerHB bool, maxPool int, err error) {
	logger := oclib.GetLogger()
	logger.Err(err)
	// Seeds are never evicted on heartbeat failure.
	// Keeping them in the pool lets the regular 60-second ticker retry them
	// at a natural cadence — no reconnect storm, no libp2p dial-backoff accumulation.
	// A seed will self-heal once it comes back; DHT and inbound peers fill the gap.
	if isIndexerHB {
		if score := directory.GetScore(addr); score != nil {
			if score.IsSeed {
				logger.Warn().Str("peer", info.ID.String()).
					Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error())
				return
			}
			// Indirect probing via other alive indexers:
			// If other indexers in the pool are still responding, they act as implicit
			// third-party witnesses confirming our connectivity is fine — the failed
			// indexer is genuinely dead, evict immediately.
			// If this is the last indexer, there is no third party. Retry up to 3 times
			// (consecutive failures tracked in UptimeTracker) before declaring it dead.
			if len(directory.GetAddrs()) <= 1 {
				score.UptimeTracker.ConsecutiveFails++
				if score.UptimeTracker.ConsecutiveFails < 3 {
					logger.Warn().Str("peer", info.ID.String()).
						Int("attempt", score.UptimeTracker.ConsecutiveFails).
						Msg("[indirect] last indexer failed, retrying before eviction")
					return
				}
				logger.Warn().Str("peer", info.ID.String()).
					Msg("[indirect] last indexer failed 3 times consecutively, evicting")
			}
		}
	}

	// Past this point the peer is evicted unconditionally.
	logger.Info().Str("peer", info.ID.String()).Str("proto", string(proto)).
		Msg("heartbeat failed, removing peer from pool : " + err.Error())
	consensusVoters := evictPeer(directory, addr, info.ID, proto)
	if isIndexerHB {
		// Replacement count: at least 1, at most the remaining pool headroom.
		need := maxPool - len(consensusVoters)
		if need < 1 {
			need = 1
		}
		logger.Info().Int("remaining", len(consensusVoters)).Int("need", need).Msg("pool state after removal")
		poolSize := len(directory.GetAddrs())
		if poolSize == 0 {
			// Pool is truly empty (no seeds configured or no seeds in pool).
			// Start the backoff retry loop — it will re-add seeds and nudge
			// only once a seed actually responds.
			go retryUntilSeedResponds()
		} else if len(consensusVoters) > 0 {
			go TriggerConsensus(h, consensusVoters, need)
		} else {
			go replenishIndexersFromDHT(h, need)
		}
	}
}
|