package common

import (
	"context"
	"encoding/json"
	"fmt"
	"math/rand"
	"strings"
	"sync/atomic"
	"time"

	"oc-discovery/conf"

	oclib "cloud.o-forge.io/core/oc-lib"
	"github.com/libp2p/go-libp2p/core/host"
	"github.com/libp2p/go-libp2p/core/network"
	pp "github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/core/protocol"
)

// TimeWatcher records (in UTC) when ConnectToIndexers was last invoked.
var TimeWatcher time.Time

// retryRunning guards against launching multiple retryUntilSeedResponds goroutines.
var retryRunning atomic.Bool

// ConnectToIndexers bootstraps the indexer pool from the configured seed
// addresses, starts the long-lived heartbeat loop, registers an inbound
// connection notifee that probes connecting peers for indexer support, and
// launches a proactive DHT discovery pass.
//
// minIndexer is the minimum number of seeds required to proceed; maxIndexer
// caps the pool size. recordFn, when provided, is forwarded to SendHeartbeat
// so a fresh signed PeerRecord can be embedded in each heartbeat.
// Returns an error when fewer than minIndexer seeds could be parsed.
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
	TimeWatcher = time.Now().UTC()
	logger := oclib.GetLogger()

	// Bootstrap from the IndexerAddresses seed set, capped at maxIndexer.
	addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
	if len(addresses) > maxIndexer {
		addresses = addresses[:maxIndexer]
	}
	for _, indexerAddr := range addresses {
		indexerAddr = strings.TrimSpace(indexerAddr)
		if indexerAddr == "" {
			continue
		}
		ad, err := pp.AddrInfoFromString(indexerAddr)
		if err != nil {
			logger.Err(err)
			continue
		}
		key := ad.ID.String()
		Indexers.SetAddr(key, ad)
		// Pre-create score entry with IsSeed=true so the sticky flag is set before
		// the first heartbeat tick (lazy creation in doTick would lose the flag).
		if !Indexers.ExistsScore(key) {
			Indexers.SetScore(key, &Score{
				FirstContacted: time.Now().UTC(),
				UptimeTracker:  &UptimeTracker{FirstSeen: time.Now().UTC()},
				nextChallenge:  rand.Intn(10) + 1,
				IsSeed:         true,
			})
		}
	}
	seeds := Indexers.GetAddrs()
	indexerCount := len(seeds)
	if indexerCount < minIndexer {
		// Fix: informative, convention-compliant error (lowercase, no trailing
		// punctuation) instead of the previous grammatically broken message.
		return fmt.Errorf("running without indexers: have %d seeds, need at least %d — node would be isolated", indexerCount, minIndexer)
	}

	// Start long-lived heartbeat to seed indexers. The single goroutine follows
	// all subsequent StaticIndexers changes.
	SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name, h, Indexers, 20*time.Second, maxIndexer, recordFn...)

	// Watch for inbound connections: if a peer connects to us and our pool has
	// room, probe it first to confirm it supports ProtocolHeartbeat (i.e. it is
	// an indexer). Plain nodes don't register the handler — the negotiation fails
	// instantly so we never pollute the pool with non-indexer peers.
	h.Network().Notify(&network.NotifyBundle{
		ConnectedF: func(n network.Network, c network.Conn) {
			if c.Stat().Direction != network.DirInbound {
				return
			}
			if len(Indexers.GetAddrs()) >= maxIndexer {
				return
			}
			peerID := c.RemotePeer()
			if Indexers.ExistsAddr(peerID.String()) {
				return
			}
			// Probe in a goroutine — ConnectedF must not block.
			go func(pid pp.ID) {
				if !SupportsHeartbeat(h, pid) {
					return // plain node, skip
				}
				// Re-check capacity and membership: both may have changed while probing.
				if len(Indexers.GetAddrs()) >= maxIndexer {
					return
				}
				if Indexers.ExistsAddr(pid.String()) {
					return
				}
				addrs := h.Peerstore().Addrs(pid)
				if len(addrs) == 0 {
					return
				}
				ai := FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
				if len(ai.Addrs) == 0 {
					return
				}
				adCopy := ai
				Indexers.SetAddr(pid.String(), &adCopy)
				Indexers.NudgeIt()
				log := oclib.GetLogger()
				log.Info().Str("peer", pid.String()).
					Msg("[pool] inbound indexer peer added as candidate")
			}(peerID)
		},
	})

	// Proactive DHT upgrade: once seeds are connected and the DHT routing table
	// is warm, discover better indexers and add them to the pool alongside the seeds.
	// Seeds stay as guaranteed anchors; scoring will demote poor performers over time.
	go func(seeds []Entry) {
		// Let seed connections establish and the DHT routing table warm up.
		time.Sleep(5 * time.Second)
		// For pure nodes (no IndexerService), spin up a lightweight DHT client.
		if discoveryDHT == nil {
			if len(seeds) == 0 {
				return
			}
			initNodeDHT(h, seeds)
		}
		if discoveryDHT == nil {
			return
		}
		current := len(Indexers.GetAddrs())
		need := maxIndexer - current
		if need <= 0 {
			need = maxIndexer / 2 // diversify even when pool is already at capacity
		}
		logger.Info().Int("need", need).Msg("[dht] proactive indexer discovery from DHT")
		replenishIndexersFromDHT(h, need)
	}(seeds)
	return nil
}

// reconnectToSeeds re-adds the configured seed indexers to StaticIndexers as
// sticky fallback entries. Called when the pool drops to zero so the node
// never becomes completely isolated.
func reconnectToSeeds() {
	logger := oclib.GetLogger()
	logger.Warn().Msg("[pool] all indexers lost, reconnecting to configured seeds")
	addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
	for _, addrStr := range addresses {
		addrStr = strings.TrimSpace(addrStr)
		if addrStr == "" {
			continue
		}
		ad, err := pp.AddrInfoFromString(addrStr)
		if err != nil {
			continue
		}
		key := ad.ID.String()
		Indexers.SetAddr(key, ad)
		if score := Indexers.GetScore(key); score == nil {
			Indexers.SetScore(key, &Score{
				FirstContacted: time.Now().UTC(),
				UptimeTracker:  &UptimeTracker{FirstSeen: time.Now().UTC()},
				nextChallenge:  rand.Intn(10) + 1,
				IsSeed:         true,
			})
		} else {
			// Restore sticky flag so the seed is not immediately re-ejected.
			score.IsSeed = true
		}
	}
}

// retryUntilSeedResponds loops with exponential backoff until at least one
// configured seed is reachable again. Once seeds are back in the pool it
// nudges the heartbeat loop and lets the normal DHT upgrade path take over.
// Should be called in a goroutine — it blocks until the situation resolves.
// If no seeds are configured it logs a warning and keeps waiting for the
// inbound-connection notifee to refill the pool (it does NOT panic).
func retryUntilSeedResponds() { if !retryRunning.CompareAndSwap(false, true) { return // another goroutine is already running the retry loop } defer retryRunning.Store(false) logger := oclib.GetLogger() rawAddresses := strings.TrimSpace(conf.GetConfig().IndexerAddresses) if rawAddresses == "" { // No seeds configured: rely on the inbound-connection notifee to fill // the pool. Just wait patiently — the loop below will return as soon // as any peer connects and NudgeIt() is called. logger.Warn().Msg("[pool] pool empty and no seeds configured — waiting for inbound indexer") } backoff := 10 * time.Second const maxBackoff = 5 * time.Minute for { time.Sleep(backoff) if backoff < maxBackoff { backoff *= 2 } // Check whether someone else already refilled the pool. if len(Indexers.GetAddrs()) > 0 { logger.Info().Msg("[pool] pool refilled externally, stopping seed retry") return } logger.Warn().Dur("backoff", backoff).Msg("[pool] still isolated, retrying seeds") reconnectToSeeds() if len(Indexers.GetAddrs()) > 0 { Indexers.NudgeIt() // Re-bootstrap DHT now that we have at least one connection candidate. if discoveryDHT != nil { ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) discoveryDHT.Bootstrap(ctx) //nolint:errcheck cancel() } return } } } // ensureScore returns the Score for addr, creating it if absent. func ensureScore(d *Directory, addr string) *Score { if !d.ExistsScore(addr) { d.SetScore(addr, &Score{ FirstContacted: time.Now().UTC(), UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()}, nextChallenge: rand.Intn(10) + 1, }) } return d.GetScore(addr) } // evictPeer removes addr from directory atomically and returns a snapshot of // remaining AddrInfos (for consensus voter selection). 
func evictPeer(d *Directory, addr string, id pp.ID, proto protocol.ID) []pp.AddrInfo { d.Streams.Delete(proto, &id) d.DeleteAddr(addr) voters := make([]pp.AddrInfo, 0, len(d.Addrs)) for _, ai := range d.GetAddrs() { if ai.Info != nil { voters = append(voters, *ai.Info) } } d.DeleteScore(addr) return voters } // handleSuggestions adds unknown suggested indexers to the directory. func handleSuggestions(d *Directory, from string, suggestions []pp.AddrInfo) { added := 0 for _, sug := range suggestions { key := addrKey(sug) if !d.ExistsAddr(key) { cpy := sug d.SetAddr(key, &cpy) added++ } } if added > 0 { logger := oclib.GetLogger() logger.Info().Int("added", added).Str("from", from). Msg("added suggested indexers from heartbeat response") d.NudgeIt() } } // SendHeartbeat starts a goroutine that sends periodic heartbeats to peers. // recordFn, when provided, is called on each tick and its output is embedded in // the heartbeat as a fresh signed PeerRecord so the receiving indexer can // republish it to the DHT without an extra round-trip. // Pass no recordFn (or nil) for indexer→indexer / native heartbeats. func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, recordFn ...func() json.RawMessage) { logger := oclib.GetLogger() isIndexerHB := directory == Indexers var recFn func() json.RawMessage if len(recordFn) > 0 { recFn = recordFn[0] } go func() { logger.Info().Str("proto", string(proto)).Int("peers", len(directory.Addrs)).Msg("heartbeat started") t := time.NewTicker(interval) defer t.Stop() // peerEntry pairs addr key with AddrInfo so doTick can update score maps directly. 
type peerEntry struct { addr string ai *pp.AddrInfo } doTick := func() { addrs := directory.GetAddrsStr() need := maxPool - len(addrs) if need < 0 { need = 0 } baseHB := Heartbeat{ Name: name, PeerID: h.ID().String(), Timestamp: time.Now().UTC().Unix(), IndexersBinded: addrs, Need: need, } if recFn != nil { baseHB.Record = recFn() } // Determine the referent indexer: highest-scored one receives Referent=true // so it stores us in its referencedNodes for distributed search. var referentAddr string if isIndexerHB { var bestScore float64 = -1 for _, ai2 := range directory.GetAddrs() { if s := directory.GetScore(ai2.Addr); s != nil && s.Score > bestScore { bestScore = s.Score referentAddr = ai2.Addr } } } for _, ai := range directory.GetAddrs() { // Build per-peer heartbeat copy so challenge injection is peer-specific. hb := baseHB if isIndexerHB && referentAddr != "" && ai.Addr == referentAddr { hb.Referent = true } // Ensure an IndexerScore entry exists for this peer. var score *Score if isIndexerHB { score = ensureScore(directory, ai.Addr) // Inject challenge batch if due (random 1-10 HBs between batches). score.hbCount++ if score.hbCount >= score.nextChallenge { // Ground truth: node's own PeerID — indexer MUST have us. challenges := []string{h.ID().String()} // Add up to 2 more known peers (other indexers) for richer data. // Use the already-snapshotted entries to avoid re-locking. for _, ai2 := range directory.GetAddrs() { if ai2.Addr != ai.Addr && ai2.Info != nil { challenges = append(challenges, ai2.Info.ID.String()) if len(challenges) >= 3 { break } } } hb.Challenges = challenges score.hbCount = 0 score.nextChallenge = rand.Intn(10) + 1 score.challengeTotal++ // count own-PeerID challenge (ground truth) score.dhtBatchCounter++ // DHT challenge every 5th batch: ask indexer to retrieve our own DID. 
if score.dhtBatchCounter%5 == 0 { var selfDID string if len(baseHB.Record) > 0 { var partial struct { DID string `json:"did"` } if json.Unmarshal(baseHB.Record, &partial) == nil { selfDID = partial.DID } } if selfDID != "" { hb.ChallengeDID = selfDID } } } } resp, rtt, err := sendHeartbeat(ctx, h, proto, ai.Info, hb, directory.Streams, interval*time.Second) if err != nil { // Heartbeat fails fmt.Println("EERR", err) HeartbeatFailure(h, proto, directory, ai.Addr, ai.Info, isIndexerHB, maxPool, err) continue } // Update IndexerScore — uptime recorded on any successful send, // even if the indexer does not support bidirectional heartbeat (Fix 1). if isIndexerHB && score != nil { score.UptimeTracker.RecordHeartbeat() maxRTT := BaseRoundTrip * 10 latencyScore := 1.0 - float64(rtt)/float64(maxRTT) if latencyScore < 0 { latencyScore = 0 } if latencyScore > 1 { latencyScore = 1 } // Update fill / challenge fields only when the indexer responded. if resp != nil { // BornAt stability check. if score.LastBornAt.IsZero() { score.LastBornAt = resp.BornAt } else if !resp.BornAt.IsZero() && !resp.BornAt.Equal(score.LastBornAt) { score.bornAtChanges++ score.LastBornAt = resp.BornAt logger.Warn().Str("peer", ai.Info.ID.String()). Int("changes", score.bornAtChanges). Msg("indexer BornAt changed — possible restart or impersonation") } score.LastFillRate = resp.FillRate // Fill rate consistency: cross-check peerCount/maxNodes vs reported fillRate. if resp.MaxNodes > 0 { expected := float64(resp.PeerCount) / float64(resp.MaxNodes) diff := expected - resp.FillRate if diff < 0 { diff = -diff } score.fillChecked++ if diff < 0.1 { score.fillConsistent++ } } // Validate challenge responses. Only own-PeerID counts as ground truth. 
if len(hb.Challenges) > 0 && len(resp.Challenges) > 0 { ownID := h.ID().String() for _, ce := range resp.Challenges { if ce.PeerID != ownID { continue // informational only } recentEnough := !ce.LastSeen.IsZero() && time.Since(ce.LastSeen) < 2*RecommendedHeartbeatInterval if ce.Found && recentEnough { score.challengeCorrect++ } logger.Info().Str("peer", ai.Info.ID.String()). Bool("found", ce.Found). Bool("recent", recentEnough). Msg("own-PeerID challenge result") break } } // DHT challenge result. if hb.ChallengeDID != "" { score.dhtChecked++ if resp.DHTFound { score.dhtSuccess++ } } // Refresh local witness cache for indirect probing on future failure. for _, w := range resp.Witnesses { score.UpdateWitnessPool(w) } // Launch witness cross-check asynchronously (must not hold lock). if len(resp.Witnesses) > 0 { go queryWitnesses(h, ai.Info.ID.String(), resp.BornAt, resp.FillRate, resp.Witnesses, score) } else if resp.MaxNodes > 0 { // No witnesses offered. Valid if indexer only has us (PeerCount==1). // Cross-check: FillRate should equal 1/MaxNodes within ±10%. expected := 1.0 / float64(resp.MaxNodes) diff := resp.FillRate - expected if diff < 0 { diff = -diff } score.witnessChecked++ if resp.PeerCount == 1 && diff < 0.1 { score.witnessConsistent++ } } } score.Score = score.ComputeNodeSideScore(latencyScore) age := score.UptimeTracker.Uptime() minScore := dynamicMinScore(age) // Fix 4: grace period — at least 2 full heartbeat cycles before ejecting. isSeed := score.IsSeed // Seeds are sticky: never evicted by score alone (SuggestMigrate handles it). // Never eject the last indexer by score alone — we would lose all connectivity. belowThreshold := score.Score < minScore && score.UptimeTracker.TotalOnline >= 2*RecommendedHeartbeatInterval && !isSeed && len(directory.Addrs) > 1 if belowThreshold { logger.Info().Str("peer", ai.Info.ID.String()). Float64("score", score.Score).Float64("min", minScore). 
Msg("indexer score below threshold, removing from pool") voters := evictPeer(directory, ai.Addr, ai.Info.ID, proto) need := max(maxPool-len(voters), 1) if len(voters) > 0 { go TriggerConsensus(h, voters, need) } else { go replenishIndexersFromDHT(h, need) } } // Accept suggestions from this indexer — add unknown ones to the directory. if resp != nil && len(resp.Suggestions) > 0 { handleSuggestions(directory, ai.Info.ID.String(), resp.Suggestions) } // Handle SuggestMigrate: indexer is overloaded and wants us to move. if resp != nil && resp.SuggestMigrate && isIndexerHB { nonSeedCount := 0 for _, sc := range directory.GetScores() { if !sc.IsSeed { nonSeedCount++ } } if nonSeedCount >= conf.GetConfig().MinIndexer { if isSeed { // Seed has offloaded us: clear sticky flag, score eviction takes over. score.IsSeed = false logger.Info().Str("peer", ai.Info.ID.String()). Msg("seed discharged via SuggestMigrate, de-stickied") } else { evictPeer(directory, ai.Addr, ai.Info.ID, proto) logger.Info().Str("peer", ai.Info.ID.String()).Msg("accepted migration from overloaded indexer") } } } } } } for { select { case <-t.C: doTick() case <-directory.Nudge: if isIndexerHB { logger.Info().Msg("nudge received, heartbeating new indexers immediately") doTick() } case <-ctx.Done(): return } } }() } func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory, addr string, info *pp.AddrInfo, isIndexerHB bool, maxPool int, err error) { logger := oclib.GetLogger() logger.Err(err) // Seeds are never evicted on heartbeat failure. // Keeping them in the pool lets the regular 60-second ticker retry them // at a natural cadence — no reconnect storm, no libp2p dial-backoff accumulation. // A seed will self-heal once it comes back; DHT and inbound peers fill the gap. if isIndexerHB { if score := directory.GetScore(addr); score != nil { if score.IsSeed { logger.Warn().Str("peer", info.ID.String()). 
Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error()) return } // Indirect probe: query cached witnesses before declaring the indexer dead. // If a witness confirms it is alive, the failure is a local asymmetric // link — not the indexer. Skip eviction; next tick will retry directly. if len(score.WitnessPool) > 0 { pool := append([]WitnessCacheEntry(nil), score.WitnessPool...) if IndirectProbeIndexer(h, info.ID.String(), pool) { logger.Warn().Str("peer", info.ID.String()). Msg("[indirect] witness confirms indexer alive — asymmetric link, skipping eviction " + err.Error()) return } } } } logger.Info().Str("peer", info.ID.String()).Str("proto", string(proto)). Msg("heartbeat failed, removing peer from pool : " + err.Error()) consensusVoters := evictPeer(directory, addr, info.ID, proto) if isIndexerHB { need := maxPool - len(consensusVoters) if need < 1 { need = 1 } logger.Info().Int("remaining", len(consensusVoters)).Int("need", need).Msg("pool state after removal") poolSize := len(directory.GetAddrs()) if poolSize == 0 { // Pool is truly empty (no seeds configured or no seeds in pool). // Start the backoff retry loop — it will re-add seeds and nudge // only once a seed actually responds. go retryUntilSeedResponds() } else if len(consensusVoters) > 0 { go TriggerConsensus(h, consensusVoters, need) } else { go replenishIndexersFromDHT(h, need) } } }