Files
oc-discovery/daemons/node/common/common_indexer_hb.go
2026-03-11 19:29:39 +01:00

590 lines
19 KiB
Go

package common
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"strings"
"sync/atomic"
"time"
"oc-discovery/conf"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
var TimeWatcher time.Time
// retryRunning guards against launching multiple retryUntilSeedResponds goroutines.
var retryRunning atomic.Bool
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
TimeWatcher = time.Now().UTC()
logger := oclib.GetLogger()
// Bootstrap from IndexerAddresses seed set.
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
if len(addresses) > maxIndexer {
addresses = addresses[0:maxIndexer]
}
for _, indexerAddr := range addresses {
indexerAddr = strings.TrimSpace(indexerAddr)
if indexerAddr == "" {
continue
}
ad, err := pp.AddrInfoFromString(indexerAddr)
if err != nil {
logger.Err(err)
continue
}
key := ad.ID.String()
Indexers.SetAddr(key, ad)
// Pre-create score entry with IsSeed=true so the sticky flag is set before
// the first heartbeat tick (lazy creation in doTick would lose the flag).
if !Indexers.ExistsScore(key) {
Indexers.SetScore(key, &Score{
FirstContacted: time.Now().UTC(),
UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()},
nextChallenge: rand.Intn(10) + 1,
IsSeed: true,
})
}
}
seeds := Indexers.GetAddrs()
indexerCount := len(seeds)
if indexerCount < minIndexer {
return fmt.Errorf("you run a node without indexers... your gonna be isolated.")
}
// Start long-lived heartbeat to seed indexers. The single goroutine follows
// all subsequent StaticIndexers changes.
SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
h, Indexers, 20*time.Second, maxIndexer, recordFn...)
// Watch for inbound connections: if a peer connects to us and our pool has
// room, probe it first to confirm it supports ProtocolHeartbeat (i.e. it is
// an indexer). Plain nodes don't register the handler — the negotiation fails
// instantly so we never pollute the pool with non-indexer peers.
h.Network().Notify(&network.NotifyBundle{
ConnectedF: func(n network.Network, c network.Conn) {
if c.Stat().Direction != network.DirInbound {
return
}
if len(Indexers.GetAddrs()) >= maxIndexer {
return
}
peerID := c.RemotePeer()
if Indexers.ExistsAddr(peerID.String()) {
return
}
// Probe in a goroutine — ConnectedF must not block.
go func(pid pp.ID) {
if !SupportsHeartbeat(h, pid) {
return // plain node, skip
}
if len(Indexers.GetAddrs()) >= maxIndexer {
return
}
if Indexers.ExistsAddr(pid.String()) {
return
}
addrs := h.Peerstore().Addrs(pid)
if len(addrs) == 0 {
return
}
ai := FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
if len(ai.Addrs) == 0 {
return
}
adCopy := ai
Indexers.SetAddr(pid.String(), &adCopy)
Indexers.NudgeIt()
log := oclib.GetLogger()
log.Info().Str("peer", pid.String()).
Msg("[pool] inbound indexer peer added as candidate")
}(peerID)
},
})
// Proactive DHT upgrade: once seeds are connected and the DHT routing table
// is warm, discover better indexers and add them to the pool alongside the seeds.
// Seeds stay as guaranteed anchors; scoring will demote poor performers over time.
go func(seeds []Entry) {
// Let seed connections establish and the DHT routing table warm up.
time.Sleep(5 * time.Second)
// For pure nodes (no IndexerService), spin up a lightweight DHT client.
if discoveryDHT == nil {
if len(seeds) == 0 {
return
}
initNodeDHT(h, seeds)
}
if discoveryDHT == nil {
return
}
current := len(Indexers.GetAddrs())
need := maxIndexer - current
if need <= 0 {
need = maxIndexer / 2 // diversify even when pool is already at capacity
}
logger.Info().Int("need", need).Msg("[dht] proactive indexer discovery from DHT")
replenishIndexersFromDHT(h, need)
}(seeds)
return nil
}
// reconnectToSeeds re-adds the configured seed indexers to StaticIndexers as
// sticky fallback entries. Called when the pool drops to zero so the node
// never becomes completely isolated.
func reconnectToSeeds() {
logger := oclib.GetLogger()
logger.Warn().Msg("[pool] all indexers lost, reconnecting to configured seeds")
addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
for _, addrStr := range addresses {
addrStr = strings.TrimSpace(addrStr)
if addrStr == "" {
continue
}
ad, err := pp.AddrInfoFromString(addrStr)
if err != nil {
continue
}
key := ad.ID.String()
Indexers.SetAddr(key, ad)
if score := Indexers.GetScore(key); score == nil {
Indexers.SetScore(key, &Score{
FirstContacted: time.Now().UTC(),
UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()},
nextChallenge: rand.Intn(10) + 1,
IsSeed: true,
})
} else {
// Restore sticky flag so the seed is not immediately re-ejected.
score.IsSeed = true
}
}
}
// retryUntilSeedResponds loops with exponential backoff until at least one
// configured seed is reachable again. Once seeds are back in the pool it
// nudges the heartbeat loop and lets the normal DHT upgrade path take over.
// Should be called in a goroutine — it blocks until the situation resolves.
// Panics immediately if no seeds are configured: there is nothing to wait for.
func retryUntilSeedResponds() {
if !retryRunning.CompareAndSwap(false, true) {
return // another goroutine is already running the retry loop
}
defer retryRunning.Store(false)
logger := oclib.GetLogger()
rawAddresses := strings.TrimSpace(conf.GetConfig().IndexerAddresses)
if rawAddresses == "" {
// No seeds configured: rely on the inbound-connection notifee to fill
// the pool. Just wait patiently — the loop below will return as soon
// as any peer connects and NudgeIt() is called.
logger.Warn().Msg("[pool] pool empty and no seeds configured — waiting for inbound indexer")
}
backoff := 10 * time.Second
const maxBackoff = 5 * time.Minute
for {
time.Sleep(backoff)
if backoff < maxBackoff {
backoff *= 2
}
// Check whether someone else already refilled the pool.
if len(Indexers.GetAddrs()) > 0 {
logger.Info().Msg("[pool] pool refilled externally, stopping seed retry")
return
}
logger.Warn().Dur("backoff", backoff).Msg("[pool] still isolated, retrying seeds")
reconnectToSeeds()
if len(Indexers.GetAddrs()) > 0 {
Indexers.NudgeIt()
// Re-bootstrap DHT now that we have at least one connection candidate.
if discoveryDHT != nil {
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
discoveryDHT.Bootstrap(ctx) //nolint:errcheck
cancel()
}
return
}
}
}
// ensureScore returns the Score for addr, creating it if absent.
func ensureScore(d *Directory, addr string) *Score {
if !d.ExistsScore(addr) {
d.SetScore(addr, &Score{
FirstContacted: time.Now().UTC(),
UptimeTracker: &UptimeTracker{FirstSeen: time.Now().UTC()},
nextChallenge: rand.Intn(10) + 1,
})
}
return d.GetScore(addr)
}
// evictPeer removes addr from directory atomically and returns a snapshot of
// remaining AddrInfos (for consensus voter selection).
func evictPeer(d *Directory, addr string, id pp.ID, proto protocol.ID) []pp.AddrInfo {
d.Streams.Delete(proto, &id)
d.DeleteAddr(addr)
voters := make([]pp.AddrInfo, 0, len(d.Addrs))
for _, ai := range d.GetAddrs() {
if ai.Info != nil {
voters = append(voters, *ai.Info)
}
}
d.DeleteScore(addr)
return voters
}
// handleSuggestions adds unknown suggested indexers to the directory.
func handleSuggestions(d *Directory, from string, suggestions []pp.AddrInfo) {
added := 0
for _, sug := range suggestions {
key := addrKey(sug)
if !d.ExistsAddr(key) {
cpy := sug
d.SetAddr(key, &cpy)
added++
}
}
if added > 0 {
logger := oclib.GetLogger()
logger.Info().Int("added", added).Str("from", from).
Msg("added suggested indexers from heartbeat response")
d.NudgeIt()
}
}
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
// recordFn, when provided, is called on each tick and its output is embedded in
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
// republish it to the DHT without an extra round-trip.
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, recordFn ...func() json.RawMessage) {
logger := oclib.GetLogger()
isIndexerHB := directory == Indexers
var recFn func() json.RawMessage
if len(recordFn) > 0 {
recFn = recordFn[0]
}
go func() {
logger.Info().Str("proto", string(proto)).Int("peers", len(directory.Addrs)).Msg("heartbeat started")
t := time.NewTicker(interval)
defer t.Stop()
// peerEntry pairs addr key with AddrInfo so doTick can update score maps directly.
type peerEntry struct {
addr string
ai *pp.AddrInfo
}
doTick := func() {
addrs := directory.GetAddrsStr()
need := maxPool - len(addrs)
if need < 0 {
need = 0
}
baseHB := Heartbeat{
Name: name,
PeerID: h.ID().String(),
Timestamp: time.Now().UTC().Unix(),
IndexersBinded: addrs,
Need: need,
}
if recFn != nil {
baseHB.Record = recFn()
}
// Determine the referent indexer: highest-scored one receives Referent=true
// so it stores us in its referencedNodes for distributed search.
var referentAddr string
if isIndexerHB {
var bestScore float64 = -1
for _, ai2 := range directory.GetAddrs() {
if s := directory.GetScore(ai2.Addr); s != nil && s.Score > bestScore {
bestScore = s.Score
referentAddr = ai2.Addr
}
}
}
for _, ai := range directory.GetAddrs() {
// Build per-peer heartbeat copy so challenge injection is peer-specific.
hb := baseHB
if isIndexerHB && referentAddr != "" && ai.Addr == referentAddr {
hb.Referent = true
}
// Ensure an IndexerScore entry exists for this peer.
var score *Score
if isIndexerHB {
score = ensureScore(directory, ai.Addr)
// Inject challenge batch if due (random 1-10 HBs between batches).
score.hbCount++
if score.hbCount >= score.nextChallenge {
// Ground truth: node's own PeerID — indexer MUST have us.
challenges := []string{h.ID().String()}
// Add up to 2 more known peers (other indexers) for richer data.
// Use the already-snapshotted entries to avoid re-locking.
for _, ai2 := range directory.GetAddrs() {
if ai2.Addr != ai.Addr && ai2.Info != nil {
challenges = append(challenges, ai2.Info.ID.String())
if len(challenges) >= 3 {
break
}
}
}
hb.Challenges = challenges
score.hbCount = 0
score.nextChallenge = rand.Intn(10) + 1
score.challengeTotal++ // count own-PeerID challenge (ground truth)
score.dhtBatchCounter++
// DHT challenge every 5th batch: ask indexer to retrieve our own DID.
if score.dhtBatchCounter%5 == 0 {
var selfDID string
if len(baseHB.Record) > 0 {
var partial struct {
DID string `json:"did"`
}
if json.Unmarshal(baseHB.Record, &partial) == nil {
selfDID = partial.DID
}
}
if selfDID != "" {
hb.ChallengeDID = selfDID
}
}
}
}
resp, rtt, err := sendHeartbeat(ctx, h, proto, ai.Info, hb, directory.Streams, interval*time.Second)
if err != nil { // Heartbeat fails
HeartbeatFailure(h, proto, directory, ai.Addr, ai.Info, isIndexerHB, maxPool, err)
continue
}
// Update IndexerScore — uptime recorded on any successful send,
// even if the indexer does not support bidirectional heartbeat (Fix 1).
if isIndexerHB && score != nil {
score.UptimeTracker.RecordHeartbeat()
score.UptimeTracker.ConsecutiveFails = 0 // reset on success
maxRTT := BaseRoundTrip * 10
latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
if latencyScore < 0 {
latencyScore = 0
}
if latencyScore > 1 {
latencyScore = 1
}
// Update fill / challenge fields only when the indexer responded.
if resp != nil {
// BornAt stability check.
if score.LastBornAt.IsZero() {
score.LastBornAt = resp.BornAt
} else if !resp.BornAt.IsZero() && !resp.BornAt.Equal(score.LastBornAt) {
score.bornAtChanges++
score.LastBornAt = resp.BornAt
logger.Warn().Str("peer", ai.Info.ID.String()).
Int("changes", score.bornAtChanges).
Msg("indexer BornAt changed — possible restart or impersonation")
}
score.LastFillRate = resp.FillRate
// Fill rate consistency: cross-check peerCount/maxNodes vs reported fillRate.
if resp.MaxNodes > 0 {
expected := float64(resp.PeerCount) / float64(resp.MaxNodes)
diff := expected - resp.FillRate
if diff < 0 {
diff = -diff
}
score.fillChecked++
if diff < 0.1 {
score.fillConsistent++
}
}
// Validate challenge responses. Only own-PeerID counts as ground truth.
if len(hb.Challenges) > 0 && len(resp.Challenges) > 0 {
ownID := h.ID().String()
for _, ce := range resp.Challenges {
if ce.PeerID != ownID {
continue // informational only
}
recentEnough := !ce.LastSeen.IsZero() &&
time.Since(ce.LastSeen) < 2*RecommendedHeartbeatInterval
if ce.Found && recentEnough {
score.challengeCorrect++
}
logger.Info().Str("peer", ai.Info.ID.String()).
Bool("found", ce.Found).
Bool("recent", recentEnough).
Msg("own-PeerID challenge result")
break
}
}
// DHT challenge result.
if hb.ChallengeDID != "" {
score.dhtChecked++
if resp.DHTFound {
score.dhtSuccess++
}
}
// Launch witness cross-check asynchronously (must not hold lock).
if len(resp.Witnesses) > 0 {
go queryWitnesses(h, ai.Info.ID.String(), resp.BornAt, resp.FillRate, resp.Witnesses, score)
} else if resp.MaxNodes > 0 {
// No witnesses offered. Valid if indexer only has us (PeerCount==1).
// Cross-check: FillRate should equal 1/MaxNodes within ±10%.
expected := 1.0 / float64(resp.MaxNodes)
diff := resp.FillRate - expected
if diff < 0 {
diff = -diff
}
score.witnessChecked++
if resp.PeerCount == 1 && diff < 0.1 {
score.witnessConsistent++
}
}
}
score.Score = score.ComputeNodeSideScore(latencyScore)
age := score.UptimeTracker.Uptime()
minScore := dynamicMinScore(age)
// Fix 4: grace period — at least 2 full heartbeat cycles before ejecting.
isSeed := score.IsSeed
// Seeds are sticky: never evicted by score alone (SuggestMigrate handles it).
// Never eject the last indexer by score alone — we would lose all connectivity.
belowThreshold := score.Score < minScore &&
score.UptimeTracker.TotalOnline >= 2*RecommendedHeartbeatInterval &&
!isSeed &&
len(directory.Addrs) > 1
if belowThreshold {
logger.Info().Str("peer", ai.Info.ID.String()).
Float64("score", score.Score).Float64("min", minScore).
Msg("indexer score below threshold, removing from pool")
voters := evictPeer(directory, ai.Addr, ai.Info.ID, proto)
need := max(maxPool-len(voters), 1)
if len(voters) > 0 {
go TriggerConsensus(h, voters, need)
} else {
go replenishIndexersFromDHT(h, need)
}
}
// Accept suggestions from this indexer — add unknown ones to the directory.
if resp != nil && len(resp.Suggestions) > 0 {
handleSuggestions(directory, ai.Info.ID.String(), resp.Suggestions)
}
// Handle SuggestMigrate: indexer is overloaded and wants us to move.
if resp != nil && resp.SuggestMigrate && isIndexerHB {
nonSeedCount := 0
for _, sc := range directory.GetScores() {
if !sc.IsSeed {
nonSeedCount++
}
}
if nonSeedCount >= conf.GetConfig().MinIndexer {
if isSeed {
// Seed has offloaded us: clear sticky flag, score eviction takes over.
score.IsSeed = false
logger.Info().Str("peer", ai.Info.ID.String()).
Msg("seed discharged via SuggestMigrate, de-stickied")
} else {
evictPeer(directory, ai.Addr, ai.Info.ID, proto)
logger.Info().Str("peer", ai.Info.ID.String()).Msg("accepted migration from overloaded indexer")
}
}
}
}
}
}
for {
select {
case <-t.C:
doTick()
case <-directory.Nudge:
if isIndexerHB {
logger.Info().Msg("nudge received, heartbeating new indexers immediately")
doTick()
}
case <-ctx.Done():
return
}
}
}()
}
func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
addr string, info *pp.AddrInfo, isIndexerHB bool, maxPool int, err error) {
logger := oclib.GetLogger()
logger.Err(err)
// Seeds are never evicted on heartbeat failure.
// Keeping them in the pool lets the regular 60-second ticker retry them
// at a natural cadence — no reconnect storm, no libp2p dial-backoff accumulation.
// A seed will self-heal once it comes back; DHT and inbound peers fill the gap.
if isIndexerHB {
if score := directory.GetScore(addr); score != nil {
if score.IsSeed {
logger.Warn().Str("peer", info.ID.String()).
Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error())
return
}
// Indirect probing via other alive indexers:
// If other indexers in the pool are still responding, they act as implicit
// third-party witnesses confirming our connectivity is fine — the failed
// indexer is genuinely dead, evict immediately.
// If this is the last indexer, there is no third party. Retry up to 3 times
// (consecutive failures tracked in UptimeTracker) before declaring it dead.
if len(directory.GetAddrs()) <= 1 {
score.UptimeTracker.ConsecutiveFails++
if score.UptimeTracker.ConsecutiveFails < 3 {
logger.Warn().Str("peer", info.ID.String()).
Int("attempt", score.UptimeTracker.ConsecutiveFails).
Msg("[indirect] last indexer failed, retrying before eviction")
return
}
logger.Warn().Str("peer", info.ID.String()).
Msg("[indirect] last indexer failed 3 times consecutively, evicting")
}
}
}
logger.Info().Str("peer", info.ID.String()).Str("proto", string(proto)).
Msg("heartbeat failed, removing peer from pool : " + err.Error())
consensusVoters := evictPeer(directory, addr, info.ID, proto)
if isIndexerHB {
need := maxPool - len(consensusVoters)
if need < 1 {
need = 1
}
logger.Info().Int("remaining", len(consensusVoters)).Int("need", need).Msg("pool state after removal")
poolSize := len(directory.GetAddrs())
if poolSize == 0 {
// Pool is truly empty (no seeds configured or no seeds in pool).
// Start the backoff retry loop — it will re-add seeds and nudge
// only once a seed actually responds.
go retryUntilSeedResponds()
} else if len(consensusVoters) > 0 {
go TriggerConsensus(h, consensusVoters, need)
} else {
go replenishIndexersFromDHT(h, need)
}
}
}