Simple Architecture
daemons/node/indexer/behavior.go (new file, 254 lines)
@@ -0,0 +1,254 @@
package indexer

import (
	"errors"
	"sync"
	"time"

	"oc-discovery/conf"

	pp "github.com/libp2p/go-libp2p/core/peer"
)

// ── defaults ──────────────────────────────────────────────────────────────────

const (
	defaultMaxConnPerWindow = 20
	defaultConnWindowSecs   = 30
	defaultMaxHBPerMinute   = 5
	defaultMaxPublishPerMin = 10
	defaultMaxGetPerMin     = 50
	strikeThreshold         = 3
	banDuration             = 10 * time.Minute
	behaviorWindowDur       = 60 * time.Second
)

func cfgOr(v, def int) int {
	if v > 0 {
		return v
	}
	return def
}

// ── ConnectionRateGuard ───────────────────────────────────────────────────────

// ConnectionRateGuard limits the number of NEW incoming connections accepted
// within a sliding time window. It protects public indexers against coordinated
// registration floods (Sybil bursts).
type ConnectionRateGuard struct {
	mu          sync.Mutex
	window      []time.Time
	maxInWindow int
	windowDur   time.Duration
}

func newConnectionRateGuard() *ConnectionRateGuard {
	cfg := conf.GetConfig()
	return &ConnectionRateGuard{
		maxInWindow: cfgOr(cfg.MaxConnPerWindow, defaultMaxConnPerWindow),
		windowDur:   time.Duration(cfgOr(cfg.ConnWindowSecs, defaultConnWindowSecs)) * time.Second,
	}
}

// Allow returns true if a new connection may be accepted.
// The internal window is pruned on each call so memory stays bounded.
func (g *ConnectionRateGuard) Allow() bool {
	g.mu.Lock()
	defer g.mu.Unlock()
	now := time.Now()
	cutoff := now.Add(-g.windowDur)
	i := 0
	for i < len(g.window) && g.window[i].Before(cutoff) {
		i++
	}
	g.window = g.window[i:]
	if len(g.window) >= g.maxInWindow {
		return false
	}
	g.window = append(g.window, now)
	return true
}

// ── per-node state ────────────────────────────────────────────────────────────

type nodeBehavior struct {
	mu          sync.Mutex
	knownDID    string
	hbTimes     []time.Time
	pubTimes    []time.Time
	getTimes    []time.Time
	strikes     int
	bannedUntil time.Time
}

func (nb *nodeBehavior) isBanned() bool {
	return time.Now().Before(nb.bannedUntil)
}

func (nb *nodeBehavior) strike(n int) {
	nb.strikes += n
	if nb.strikes >= strikeThreshold {
		nb.bannedUntil = time.Now().Add(banDuration)
	}
}

func pruneWindow(ts []time.Time, dur time.Duration) []time.Time {
	cutoff := time.Now().Add(-dur)
	i := 0
	for i < len(ts) && ts[i].Before(cutoff) {
		i++
	}
	return ts[i:]
}

// recordInWindow appends now to the window slice and returns false (+ adds a
// strike) when the count exceeds max.
func (nb *nodeBehavior) recordInWindow(ts *[]time.Time, max int) bool {
	*ts = pruneWindow(*ts, behaviorWindowDur)
	if len(*ts) >= max {
		nb.strike(1)
		return false
	}
	*ts = append(*ts, time.Now())
	return true
}

// ── NodeBehaviorTracker ───────────────────────────────────────────────────────

// NodeBehaviorTracker is the indexer-side per-node compliance monitor.
// It is entirely local: no state is shared with other indexers.
type NodeBehaviorTracker struct {
	mu    sync.RWMutex
	nodes map[pp.ID]*nodeBehavior

	maxHB  int
	maxPub int
	maxGet int
}

func newNodeBehaviorTracker() *NodeBehaviorTracker {
	cfg := conf.GetConfig()
	return &NodeBehaviorTracker{
		nodes:  make(map[pp.ID]*nodeBehavior),
		maxHB:  cfgOr(cfg.MaxHBPerMinute, defaultMaxHBPerMinute),
		maxPub: cfgOr(cfg.MaxPublishPerMinute, defaultMaxPublishPerMin),
		maxGet: cfgOr(cfg.MaxGetPerMinute, defaultMaxGetPerMin),
	}
}

func (t *NodeBehaviorTracker) get(pid pp.ID) *nodeBehavior {
	t.mu.RLock()
	nb := t.nodes[pid]
	t.mu.RUnlock()
	if nb != nil {
		return nb
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	if nb = t.nodes[pid]; nb == nil {
		nb = &nodeBehavior{}
		t.nodes[pid] = nb
	}
	return nb
}

// IsBanned returns true when the peer is in an active ban period.
func (t *NodeBehaviorTracker) IsBanned(pid pp.ID) bool {
	nb := t.get(pid)
	nb.mu.Lock()
	defer nb.mu.Unlock()
	return nb.isBanned()
}

// RecordHeartbeat checks heartbeat cadence. Returns an error if the peer is
// flooding (too many heartbeats in the sliding window).
func (t *NodeBehaviorTracker) RecordHeartbeat(pid pp.ID) error {
	nb := t.get(pid)
	nb.mu.Lock()
	defer nb.mu.Unlock()
	if nb.isBanned() {
		return errors.New("peer is banned")
	}
	if !nb.recordInWindow(&nb.hbTimes, t.maxHB) {
		return errors.New("heartbeat flood detected")
	}
	return nil
}

// CheckIdentity verifies that the DID associated with a PeerID never changes.
// A DID change is a strong signal of identity spoofing.
func (t *NodeBehaviorTracker) CheckIdentity(pid pp.ID, did string) error {
	if did == "" {
		return nil
	}
	nb := t.get(pid)
	nb.mu.Lock()
	defer nb.mu.Unlock()
	if nb.knownDID == "" {
		nb.knownDID = did
		return nil
	}
	if nb.knownDID != did {
		nb.strike(2) // identity change is severe
		return errors.New("DID mismatch for peer " + pid.String())
	}
	return nil
}

// RecordBadSignature registers a cryptographic verification failure.
// A single bad signature is worth 2 strikes (near-immediate ban).
func (t *NodeBehaviorTracker) RecordBadSignature(pid pp.ID) {
	nb := t.get(pid)
	nb.mu.Lock()
	defer nb.mu.Unlock()
	nb.strike(2)
}

// RecordPublish checks publish volume. Returns an error if the peer is
// sending too many publish requests.
func (t *NodeBehaviorTracker) RecordPublish(pid pp.ID) error {
	nb := t.get(pid)
	nb.mu.Lock()
	defer nb.mu.Unlock()
	if nb.isBanned() {
		return errors.New("peer is banned")
	}
	if !nb.recordInWindow(&nb.pubTimes, t.maxPub) {
		return errors.New("publish volume exceeded")
	}
	return nil
}

// RecordGet checks get volume. Returns an error if the peer is enumerating
// the DHT at an abnormal rate.
func (t *NodeBehaviorTracker) RecordGet(pid pp.ID) error {
	nb := t.get(pid)
	nb.mu.Lock()
	defer nb.mu.Unlock()
	if nb.isBanned() {
		return errors.New("peer is banned")
	}
	if !nb.recordInWindow(&nb.getTimes, t.maxGet) {
		return errors.New("get volume exceeded")
	}
	return nil
}

// Cleanup removes the behavior entry for a peer if it is not currently banned.
// Called when the peer is evicted from StreamRecords by the GC.
func (t *NodeBehaviorTracker) Cleanup(pid pp.ID) {
	t.mu.RLock()
	nb := t.nodes[pid]
	t.mu.RUnlock()
	if nb == nil {
		return
	}
	nb.mu.Lock()
	banned := nb.isBanned()
	nb.mu.Unlock()
	if !banned {
		t.mu.Lock()
		delete(t.nodes, pid)
		t.mu.Unlock()
	}
}
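Below is a minimal sketch (not part of the commit) of how the two guards above might be wired into a heartbeat stream handler. The guardedIndexer type, the wrapHeartbeat helper, and calling Allow per incoming stream are assumptions made for illustration; in a real deployment the connection guard would more likely run in a connection gater or notifiee.

package indexer

import (
	"github.com/libp2p/go-libp2p/core/network"
)

// guardedIndexer is a hypothetical holder for the two guards declared above.
type guardedIndexer struct {
	connGuard *ConnectionRateGuard
	behavior  *NodeBehaviorTracker
}

// wrapHeartbeat shows the intended call order: global connection throttling
// first, then the per-peer ban check, then the per-peer heartbeat rate check.
func (gi *guardedIndexer) wrapHeartbeat(inner network.StreamHandler) network.StreamHandler {
	return func(s network.Stream) {
		pid := s.Conn().RemotePeer()
		if !gi.connGuard.Allow() { // too many new arrivals in the sliding window
			s.Reset()
			return
		}
		if gi.behavior.IsBanned(pid) { // peer accumulated >= strikeThreshold strikes
			s.Reset()
			return
		}
		if err := gi.behavior.RecordHeartbeat(pid); err != nil { // heartbeat flood
			s.Reset()
			return
		}
		inner(s)
	}
}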
@@ -7,7 +7,7 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"oc-discovery/conf"
|
||||
"math/rand"
|
||||
"oc-discovery/daemons/node/common"
|
||||
"strings"
|
||||
"time"
|
||||
@@ -18,7 +18,7 @@ import (
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"github.com/libp2p/go-libp2p/core/crypto"
|
||||
"github.com/libp2p/go-libp2p/core/network"
|
||||
"github.com/libp2p/go-libp2p/core/peer"
|
||||
lpp "github.com/libp2p/go-libp2p/core/peer"
|
||||
)
|
||||
|
||||
type PeerRecordPayload struct {
|
||||
@@ -118,6 +118,7 @@ func (ix *IndexerService) genPIDKey(peerID string) string {
|
||||
func (ix *IndexerService) initNodeHandler() {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Info().Msg("Init Node Handler")
|
||||
|
||||
// Each heartbeat from a node carries a freshly signed PeerRecord.
|
||||
// Republish it to the DHT so the record never expires as long as the node
|
||||
// is alive — no separate publish stream needed from the node side.
|
||||
@@ -177,49 +178,48 @@ func (ix *IndexerService) initNodeHandler() {
|
||||
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat)
|
||||
ix.Host.SetStreamHandler(common.ProtocolPublish, ix.handleNodePublish)
|
||||
ix.Host.SetStreamHandler(common.ProtocolGet, ix.handleNodeGet)
|
||||
ix.Host.SetStreamHandler(common.ProtocolIndexerGetNatives, ix.handleGetNatives)
|
||||
ix.Host.SetStreamHandler(common.ProtocolIndexerConsensus, ix.handleIndexerConsensus)
|
||||
ix.Host.SetStreamHandler(common.ProtocolIndexerCandidates, ix.handleCandidateRequest)
|
||||
ix.initSearchHandlers()
|
||||
}
|
||||
|
||||
// handleIndexerConsensus implements Phase 2 liveness voting (ProtocolIndexerConsensus).
|
||||
// The caller sends a list of candidate multiaddrs; this indexer replies with the
|
||||
// subset it considers currently alive (recent heartbeat in StreamRecords).
|
||||
func (ix *IndexerService) handleIndexerConsensus(stream network.Stream) {
|
||||
defer stream.Reset()
|
||||
|
||||
var req common.IndexerConsensusRequest
|
||||
if err := json.NewDecoder(stream).Decode(&req); err != nil {
|
||||
// handleCandidateRequest responds to a node's consensus candidate request.
|
||||
// Returns a random sample of indexers from the local DHT cache.
|
||||
func (ix *IndexerService) handleCandidateRequest(s network.Stream) {
|
||||
defer s.Close()
|
||||
s.SetDeadline(time.Now().Add(5 * time.Second))
|
||||
var req common.IndexerCandidatesRequest
|
||||
if err := json.NewDecoder(s).Decode(&req); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
ix.StreamMU.RLock()
|
||||
streams := ix.StreamRecords[common.ProtocolHeartbeat]
|
||||
ix.StreamMU.RUnlock()
|
||||
|
||||
alive := make([]string, 0, len(req.Candidates))
|
||||
for _, addr := range req.Candidates {
|
||||
ad, err := peer.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
ix.StreamMU.RLock()
|
||||
rec, ok := streams[ad.ID]
|
||||
ix.StreamMU.RUnlock()
|
||||
if !ok || rec.HeartbeatStream == nil || rec.HeartbeatStream.UptimeTracker == nil {
|
||||
continue
|
||||
}
|
||||
// D: consider alive only if recent heartbeat AND score above minimum quality bar.
|
||||
if time.Since(rec.HeartbeatStream.UptimeTracker.LastSeen) <= 2*common.RecommendedHeartbeatInterval &&
|
||||
rec.LastScore >= 30.0 {
|
||||
alive = append(alive, addr)
|
||||
}
|
||||
if req.Count <= 0 || req.Count > 10 {
|
||||
req.Count = 3
|
||||
}
|
||||
json.NewEncoder(stream).Encode(common.IndexerConsensusResponse{Alive: alive})
|
||||
ix.dhtCacheMu.RLock()
|
||||
cache := make([]dhtCacheEntry, len(ix.dhtCache))
|
||||
copy(cache, ix.dhtCache)
|
||||
ix.dhtCacheMu.RUnlock()
|
||||
|
||||
// Shuffle for randomness: each voter offers a different subset.
|
||||
rand.Shuffle(len(cache), func(i, j int) { cache[i], cache[j] = cache[j], cache[i] })
|
||||
candidates := make([]lpp.AddrInfo, 0, req.Count)
|
||||
for _, e := range cache {
|
||||
if len(candidates) >= req.Count {
|
||||
break
|
||||
}
|
||||
candidates = append(candidates, e.AI)
|
||||
}
|
||||
json.NewEncoder(s).Encode(common.IndexerCandidatesResponse{Candidates: candidates})
|
||||
}
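For context, a node-side sketch of the opposite end of ProtocolIndexerCandidates might look like the following. The helper name and error handling are assumptions; only the protocol ID and the IndexerCandidatesRequest/Response types (Count, Candidates) come from the code above.

package node

import (
	"context"
	"encoding/json"

	"oc-discovery/daemons/node/common"

	"github.com/libp2p/go-libp2p/core/host"
	"github.com/libp2p/go-libp2p/core/peer"
)

// requestIndexerCandidates opens a ProtocolIndexerCandidates stream to one
// indexer, asks for up to count candidates, and decodes the random sample
// the indexer returns from its DHT cache.
func requestIndexerCandidates(ctx context.Context, h host.Host, indexer peer.ID, count int) ([]peer.AddrInfo, error) {
	s, err := h.NewStream(ctx, indexer, common.ProtocolIndexerCandidates)
	if err != nil {
		return nil, err
	}
	defer s.Close()
	if err := json.NewEncoder(s).Encode(common.IndexerCandidatesRequest{Count: count}); err != nil {
		return nil, err
	}
	var resp common.IndexerCandidatesResponse
	if err := json.NewDecoder(s).Decode(&resp); err != nil {
		return nil, err
	}
	return resp.Candidates, nil
}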
|
||||
|
||||
func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
remotePeer := s.Conn().RemotePeer()
|
||||
if err := ix.behavior.RecordPublish(remotePeer); err != nil {
|
||||
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("publish refused")
|
||||
s.Reset()
|
||||
return
|
||||
}
|
||||
for {
|
||||
var rec PeerRecord
|
||||
if err := json.NewDecoder(s).Decode(&rec); err != nil {
|
||||
@@ -233,14 +233,20 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
continue
|
||||
}
|
||||
if _, err := rec.Verify(); err != nil {
|
||||
logger.Err(err)
|
||||
ix.behavior.RecordBadSignature(remotePeer)
|
||||
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("bad signature on publish")
|
||||
return
|
||||
}
|
||||
if err := ix.behavior.CheckIdentity(remotePeer, rec.DID); err != nil {
|
||||
logger.Warn().Err(err).Msg("identity mismatch on publish")
|
||||
s.Reset()
|
||||
return
|
||||
}
|
||||
if rec.PeerID == "" || rec.ExpiryDate.Before(time.Now().UTC()) {
|
||||
logger.Err(errors.New(rec.PeerID + " is expired."))
|
||||
return
|
||||
}
|
||||
pid, err := peer.Decode(rec.PeerID)
|
||||
pid, err := lpp.Decode(rec.PeerID)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -248,7 +254,7 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
ix.StreamMU.Lock()
|
||||
defer ix.StreamMU.Unlock()
|
||||
if ix.StreamRecords[common.ProtocolHeartbeat] == nil {
|
||||
ix.StreamRecords[common.ProtocolHeartbeat] = map[peer.ID]*common.StreamRecord[PeerRecord]{}
|
||||
ix.StreamRecords[common.ProtocolHeartbeat] = map[lpp.ID]*common.StreamRecord[PeerRecord]{}
|
||||
}
|
||||
streams := ix.StreamRecords[common.ProtocolHeartbeat]
|
||||
if srec, ok := streams[pid]; ok {
|
||||
@@ -297,6 +303,12 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
|
||||
func (ix *IndexerService) handleNodeGet(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
remotePeer := s.Conn().RemotePeer()
|
||||
if err := ix.behavior.RecordGet(remotePeer); err != nil {
|
||||
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("get refused")
|
||||
s.Reset()
|
||||
return
|
||||
}
|
||||
for {
|
||||
var req GetValue
|
||||
if err := json.NewDecoder(s).Decode(&req); err != nil {
|
||||
@@ -367,43 +379,3 @@ func (ix *IndexerService) handleNodeGet(s network.Stream) {
|
||||
}
|
||||
}
|
||||
|
||||
// handleGetNatives returns this indexer's configured native addresses,
|
||||
// excluding any in the request's Exclude list.
|
||||
func (ix *IndexerService) handleGetNatives(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
for {
|
||||
var req common.GetIndexerNativesRequest
|
||||
if err := json.NewDecoder(s).Decode(&req); err != nil {
|
||||
logger.Err(err).Msg("indexer get natives: decode")
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
|
||||
strings.Contains(err.Error(), "reset") ||
|
||||
strings.Contains(err.Error(), "closed") ||
|
||||
strings.Contains(err.Error(), "too many connections") {
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
excludeSet := make(map[string]struct{}, len(req.Exclude))
|
||||
for _, e := range req.Exclude {
|
||||
excludeSet[e] = struct{}{}
|
||||
}
|
||||
|
||||
resp := common.GetIndexerNativesResponse{}
|
||||
for _, addr := range strings.Split(conf.GetConfig().NativeIndexerAddresses, ",") {
|
||||
addr = strings.TrimSpace(addr)
|
||||
if addr == "" {
|
||||
continue
|
||||
}
|
||||
if _, excluded := excludeSet[addr]; !excluded {
|
||||
resp.Natives = append(resp.Natives, addr)
|
||||
}
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(s).Encode(resp); err != nil {
|
||||
logger.Err(err).Msg("indexer get natives: encode response")
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ package indexer
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -43,19 +42,69 @@ type NameIndexEvent struct {
|
||||
|
||||
// nameIndexState holds the local in-memory name index and the sender-side
|
||||
// deduplication tracker.
|
||||
//
|
||||
// Search strategy: trigram inverted index.
|
||||
// - byName: lowercased name → peerID → DID (for delete and exact resolution)
|
||||
// - byPeer: peerID → lowercased name (to recompute trigrams on delete)
|
||||
// - trigrams: 3-char substring → set of peerIDs (for O(1) substring lookup)
|
||||
//
|
||||
// For needles shorter than 3 chars the trigram index cannot help; a linear
|
||||
// scan of byName is used as fallback (rare and fast enough at small N).
|
||||
type nameIndexState struct {
|
||||
// index: name → peerID → DID, built from events received from all indexers.
|
||||
index map[string]map[string]string
|
||||
indexMu sync.RWMutex
|
||||
byName map[string]map[string]string // name → peerID → DID
|
||||
byPeer map[string]string // peerID → name
|
||||
trigrams map[string]map[string]struct{} // trigram → peerID set
|
||||
indexMu sync.RWMutex
|
||||
|
||||
// emitted tracks the last emission time for each (action, name, peerID) key
|
||||
// to suppress duplicates within nameIndexDedupWindow.
|
||||
// emitted deduplicates GossipSub emissions within nameIndexDedupWindow.
|
||||
// Purged periodically to prevent unbounded growth.
|
||||
emitted map[string]time.Time
|
||||
emittedMu sync.Mutex
|
||||
}
|
||||
|
||||
// trigramsOf returns all overlapping 3-char substrings of s (already lowercased).
// If s is shorter than 3 chars the string itself is returned as the sole token.
func trigramsOf(s string) []string {
	if len(s) < 3 {
		return []string{s}
	}
	out := make([]string, 0, len(s)-2)
	for i := 0; i <= len(s)-3; i++ {
		out = append(out, s[i:i+3])
	}
	return out
}
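A quick worked example of the tokenisation (illustrative only, assumes the usual fmt import):

func ExampleTrigramsOf() {
	fmt.Println(trigramsOf("alice")) // [ali lic ice]
	fmt.Println(trigramsOf("al"))    // [al] (short input returned as the sole token)
	// A name can only match a needle if it contains every trigram of the needle,
	// which is why LookupNameIndex still runs a full-string verify pass afterwards.
}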
|
||||
|
||||
// addTrigrams inserts peerID into every trigram bucket for name.
|
||||
func (s *nameIndexState) addTrigrams(name, peerID string) {
|
||||
for _, tg := range trigramsOf(name) {
|
||||
if s.trigrams[tg] == nil {
|
||||
s.trigrams[tg] = map[string]struct{}{}
|
||||
}
|
||||
s.trigrams[tg][peerID] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
// removeTrigrams deletes peerID from every trigram bucket for name,
|
||||
// cleaning up empty buckets to keep memory tight.
|
||||
func (s *nameIndexState) removeTrigrams(name, peerID string) {
|
||||
for _, tg := range trigramsOf(name) {
|
||||
if m := s.trigrams[tg]; m != nil {
|
||||
delete(m, peerID)
|
||||
if len(m) == 0 {
|
||||
delete(s.trigrams, tg)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// shouldEmit returns true if the (action, name, peerID) tuple has not been
|
||||
// emitted within nameIndexDedupWindow, updating the tracker if so.
|
||||
//
|
||||
// On DELETE: the ADD entry for the same peer is immediately removed — the peer
|
||||
// is gone, keeping it would cause the map to grow with departed peers forever.
|
||||
// The DELETE entry itself is kept for the dedup window to absorb duplicate
|
||||
// delete events, then cleaned by the purgeEmitted ticker.
|
||||
func (s *nameIndexState) shouldEmit(action NameIndexAction, name, peerID string) bool {
|
||||
key := string(action) + ":" + name + ":" + peerID
|
||||
s.emittedMu.Lock()
|
||||
@@ -64,9 +113,27 @@ func (s *nameIndexState) shouldEmit(action NameIndexAction, name, peerID string)
|
||||
return false
|
||||
}
|
||||
s.emitted[key] = time.Now()
|
||||
if action == NameIndexDelete {
|
||||
// Peer is leaving: drop its ADD entry — no longer needed.
|
||||
delete(s.emitted, string(NameIndexAdd)+":"+name+":"+peerID)
|
||||
}
|
||||
return true
|
||||
}
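A small sketch of the intended dedup behaviour (the peer ID is invented; nameIndexDedupWindow is defined elsewhere in this file):

func demoShouldEmit(s *nameIndexState) {
	s.shouldEmit(NameIndexAdd, "alice", "peerA")    // true: first emission
	s.shouldEmit(NameIndexAdd, "alice", "peerA")    // false: duplicate inside the window
	s.shouldEmit(NameIndexDelete, "alice", "peerA") // true, and the ADD entry is dropped eagerly
}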
|
||||
|
||||
// purgeEmitted removes stale DELETE entries from the emitted dedup map.
|
||||
// ADD entries are cleaned eagerly on DELETE, so only short-lived DELETE
|
||||
// entries remain here; the ticker just trims those stragglers.
|
||||
func (s *nameIndexState) purgeEmitted() {
|
||||
now := time.Now()
|
||||
s.emittedMu.Lock()
|
||||
defer s.emittedMu.Unlock()
|
||||
for k, t := range s.emitted {
|
||||
if now.Sub(t) >= nameIndexDedupWindow {
|
||||
delete(s.emitted, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// onEvent applies a received NameIndexEvent to the local index.
|
||||
// "add" inserts/updates the mapping; "delete" removes it.
|
||||
// Operations are idempotent — duplicate events from multiple indexers are harmless.
|
||||
@@ -74,19 +141,40 @@ func (s *nameIndexState) onEvent(evt NameIndexEvent) {
|
||||
if evt.Name == "" || evt.PeerID == "" {
|
||||
return
|
||||
}
|
||||
nameLow := strings.ToLower(evt.Name)
|
||||
s.indexMu.Lock()
|
||||
defer s.indexMu.Unlock()
|
||||
switch evt.Action {
|
||||
case NameIndexAdd:
|
||||
if s.index[evt.Name] == nil {
|
||||
s.index[evt.Name] = map[string]string{}
|
||||
// If the peer previously had a different name, clean up old trigrams.
|
||||
if old, ok := s.byPeer[evt.PeerID]; ok && old != nameLow {
|
||||
s.removeTrigrams(old, evt.PeerID)
|
||||
if s.byName[old] != nil {
|
||||
delete(s.byName[old], evt.PeerID)
|
||||
if len(s.byName[old]) == 0 {
|
||||
delete(s.byName, old)
|
||||
}
|
||||
}
|
||||
}
|
||||
s.index[evt.Name][evt.PeerID] = evt.DID
|
||||
if s.byName[nameLow] == nil {
|
||||
s.byName[nameLow] = map[string]string{}
|
||||
}
|
||||
s.byName[nameLow][evt.PeerID] = evt.DID
|
||||
s.byPeer[evt.PeerID] = nameLow
|
||||
s.addTrigrams(nameLow, evt.PeerID)
|
||||
|
||||
case NameIndexDelete:
|
||||
if s.index[evt.Name] != nil {
|
||||
delete(s.index[evt.Name], evt.PeerID)
|
||||
if len(s.index[evt.Name]) == 0 {
|
||||
delete(s.index, evt.Name)
|
||||
// Use stored name so trigrams match exactly what was indexed.
|
||||
name := nameLow
|
||||
if stored, ok := s.byPeer[evt.PeerID]; ok {
|
||||
name = stored
|
||||
}
|
||||
s.removeTrigrams(name, evt.PeerID)
|
||||
delete(s.byPeer, evt.PeerID)
|
||||
if s.byName[name] != nil {
|
||||
delete(s.byName[name], evt.PeerID)
|
||||
if len(s.byName[name]) == 0 {
|
||||
delete(s.byName, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -96,10 +184,22 @@ func (s *nameIndexState) onEvent(evt NameIndexEvent) {
|
||||
// Must be called after ix.PS is ready.
|
||||
func (ix *IndexerService) initNameIndex(ps *pubsub.PubSub) {
|
||||
logger := oclib.GetLogger()
|
||||
ix.nameIndex = &nameIndexState{
|
||||
index: map[string]map[string]string{},
|
||||
emitted: map[string]time.Time{},
|
||||
state := &nameIndexState{
|
||||
byName: map[string]map[string]string{},
|
||||
byPeer: map[string]string{},
|
||||
trigrams: map[string]map[string]struct{}{},
|
||||
emitted: map[string]time.Time{},
|
||||
}
|
||||
ix.nameIndex = state
|
||||
|
||||
// Periodically purge the emitted dedup map so it doesn't grow forever.
|
||||
go func() {
|
||||
t := time.NewTicker(nameIndexDedupWindow)
|
||||
defer t.Stop()
|
||||
for range t.C {
|
||||
state.purgeEmitted()
|
||||
}
|
||||
}()
|
||||
|
||||
ps.RegisterTopicValidator(TopicNameIndex, func(_ context.Context, _ pp.ID, _ *pubsub.Message) bool {
|
||||
return true
|
||||
@@ -149,23 +249,72 @@ func (ix *IndexerService) publishNameEvent(action NameIndexAction, name, peerID,
|
||||
|
||||
// LookupNameIndex searches the distributed name index for peers whose name
|
||||
// contains needle (case-insensitive). Returns peerID → DID for matched peers.
|
||||
// Returns nil if the name index is not initialised (e.g. native indexers).
|
||||
// Returns nil if the name index is not initialised.
|
||||
//
|
||||
// Algorithm:
|
||||
// - needle ≥ 3 chars: trigram intersection → O(|candidates|) verify pass.
|
||||
// The trigram index immediately narrows the candidate set; false positives
|
||||
// are eliminated by the full-string contains check.
|
||||
// - needle < 3 chars: linear scan of byName (rare, still fast at small N).
|
||||
func (ix *IndexerService) LookupNameIndex(needle string) map[string]string {
|
||||
if ix.nameIndex == nil {
|
||||
return nil
|
||||
}
|
||||
result := map[string]string{}
|
||||
needleLow := strings.ToLower(needle)
|
||||
result := map[string]string{}
|
||||
|
||||
ix.nameIndex.indexMu.RLock()
|
||||
defer ix.nameIndex.indexMu.RUnlock()
|
||||
for name, peers := range ix.nameIndex.index {
|
||||
fmt.Println(strings.Contains(strings.ToLower(name), needleLow), needleLow, strings.ToLower(name))
|
||||
if strings.Contains(strings.ToLower(name), needleLow) {
|
||||
for peerID, did := range peers {
|
||||
result[peerID] = did
|
||||
|
||||
if len(needleLow) < 3 {
|
||||
// Short needle: linear scan fallback.
|
||||
for name, peers := range ix.nameIndex.byName {
|
||||
if strings.Contains(name, needleLow) {
|
||||
for peerID, did := range peers {
|
||||
result[peerID] = did
|
||||
}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// Trigram intersection: start with the first trigram's set, then
|
||||
// progressively intersect with each subsequent trigram's set.
|
||||
tgs := trigramsOf(needleLow)
|
||||
var candidates map[string]struct{}
|
||||
for _, tg := range tgs {
|
||||
set := ix.nameIndex.trigrams[tg]
|
||||
if len(set) == 0 {
|
||||
return result // any empty trigram set → no possible match
|
||||
}
|
||||
if candidates == nil {
|
||||
candidates = make(map[string]struct{}, len(set))
|
||||
for pid := range set {
|
||||
candidates[pid] = struct{}{}
|
||||
}
|
||||
} else {
|
||||
for pid := range candidates {
|
||||
if _, ok := set[pid]; !ok {
|
||||
delete(candidates, pid)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(candidates) == 0 {
|
||||
return result
|
||||
}
|
||||
}
|
||||
|
||||
		// Full-string verification pass: trigrams admit false positives
		// (a name can contain every trigram of the needle without containing the
		// needle itself as a contiguous substring, e.g. needle "abcd" vs name "abcxbcd").
|
||||
for peerID := range candidates {
|
||||
name := ix.nameIndex.byPeer[peerID]
|
||||
if strings.Contains(name, needleLow) {
|
||||
did := ""
|
||||
if m := ix.nameIndex.byName[name]; m != nil {
|
||||
did = m[peerID]
|
||||
}
|
||||
result[peerID] = did
|
||||
}
|
||||
}
|
||||
fmt.Println("RESULT", result)
|
||||
return result
|
||||
}
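A small usage sketch of the index populated via onEvent and queried via LookupNameIndex. The names, peer IDs and DIDs are invented, and constructing the IndexerService is elided; only the types and methods shown above are assumed.

func demoNameLookup(ix *IndexerService) {
	// Simulate two gossip events arriving from other indexers.
	ix.nameIndex.onEvent(NameIndexEvent{Action: NameIndexAdd, Name: "Alice-Lab", PeerID: "peerA", DID: "did:ex:a"})
	ix.nameIndex.onEvent(NameIndexEvent{Action: NameIndexAdd, Name: "malice-node", PeerID: "peerB", DID: "did:ex:b"})

	// Substring search is case-insensitive; needles of 3+ chars go through the
	// trigram index, then the verify pass confirms the contiguous match.
	hits := ix.LookupNameIndex("lice") // both stored names contain "lice"
	_ = hits                           // map[peerA:did:ex:a peerB:did:ex:b]
}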
|
||||
|
||||
@@ -1,783 +0,0 @@
|
||||
package indexer
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"math/rand"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"oc-discovery/daemons/node/common"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
pubsub "github.com/libp2p/go-libp2p-pubsub"
|
||||
"github.com/libp2p/go-libp2p/core/crypto"
|
||||
"github.com/libp2p/go-libp2p/core/network"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
)
|
||||
|
||||
const (
|
||||
// IndexerTTL is the lifetime of a live-indexer cache entry. Set to 50% above
|
||||
// the recommended 60s heartbeat interval so a single delayed renewal does not
|
||||
// evict a healthy indexer from the native's cache.
|
||||
IndexerTTL = 90 * time.Second
|
||||
// offloadInterval is how often the native checks if it can release responsible peers.
|
||||
offloadInterval = 30 * time.Second
|
||||
// dhtRefreshInterval is how often the background goroutine queries the DHT for
|
||||
// known-but-expired indexer entries (written by neighbouring natives).
|
||||
dhtRefreshInterval = 30 * time.Second
|
||||
// maxFallbackPeers caps how many peers the native will accept in self-delegation
|
||||
// mode. Beyond this limit the native refuses to act as a fallback indexer so it
|
||||
// is not overwhelmed during prolonged indexer outages.
|
||||
maxFallbackPeers = 50
|
||||
)
|
||||
|
||||
// liveIndexerEntry tracks a registered indexer in the native's in-memory cache and DHT.
|
||||
// PubKey and Signature are forwarded from the IndexerRegistration so the DHT validator
|
||||
// can verify that the entry was produced by the peer owning the declared PeerID.
|
||||
// FillRate is the fraction of capacity used (0=empty, 1=full) at last registration.
|
||||
type liveIndexerEntry struct {
|
||||
PeerID string `json:"peer_id"`
|
||||
Addr string `json:"addr"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
RegTimestamp int64 `json:"reg_ts,omitempty"` // Timestamp from the original IndexerRegistration
|
||||
PubKey []byte `json:"pub_key,omitempty"`
|
||||
Signature []byte `json:"sig,omitempty"`
|
||||
FillRate float64 `json:"fill_rate,omitempty"`
|
||||
}
|
||||
|
||||
// NativeState holds runtime state specific to native indexer operation.
|
||||
type NativeState struct {
|
||||
liveIndexers map[string]*liveIndexerEntry // keyed by PeerID, local cache with TTL
|
||||
liveIndexersMu sync.RWMutex
|
||||
responsiblePeers map[pp.ID]struct{} // peers for which the native is fallback indexer
|
||||
responsibleMu sync.RWMutex
|
||||
// knownPeerIDs accumulates all indexer PeerIDs ever seen (local stream or gossip).
|
||||
// Used by refreshIndexersFromDHT to re-hydrate expired entries from the shared DHT,
|
||||
// including entries written by other natives.
|
||||
knownPeerIDs map[string]string
|
||||
knownMu sync.RWMutex
|
||||
|
||||
// cancel stops background goroutines (runOffloadLoop, refreshIndexersFromDHT)
|
||||
// when the native shuts down.
|
||||
cancel context.CancelFunc
|
||||
}
|
||||
|
||||
func newNativeState(cancel context.CancelFunc) *NativeState {
|
||||
return &NativeState{
|
||||
liveIndexers: map[string]*liveIndexerEntry{},
|
||||
responsiblePeers: map[pp.ID]struct{}{},
|
||||
knownPeerIDs: map[string]string{},
|
||||
cancel: cancel,
|
||||
}
|
||||
}
|
||||
|
||||
// IndexerRecordValidator validates indexer DHT entries under the "indexer" namespace.
|
||||
type IndexerRecordValidator struct{}
|
||||
|
||||
func (v IndexerRecordValidator) Validate(_ string, value []byte) error {
|
||||
var e liveIndexerEntry
|
||||
if err := json.Unmarshal(value, &e); err != nil {
|
||||
return err
|
||||
}
|
||||
if e.Addr == "" {
|
||||
return errors.New("missing addr")
|
||||
}
|
||||
if e.ExpiresAt.Before(time.Now().UTC()) {
|
||||
return errors.New("expired indexer record")
|
||||
}
|
||||
// Verify self-signature when present — rejects entries forged by a
|
||||
// compromised native that does not control the declared PeerID.
|
||||
if len(e.Signature) > 0 && len(e.PubKey) > 0 {
|
||||
pub, err := crypto.UnmarshalPublicKey(e.PubKey)
|
||||
if err != nil {
|
||||
return fmt.Errorf("indexer entry: invalid public key: %w", err)
|
||||
}
|
||||
payload := []byte(fmt.Sprintf("%s|%s|%d", e.PeerID, e.Addr, e.RegTimestamp))
|
||||
if ok, err := pub.Verify(payload, e.Signature); err != nil || !ok {
|
||||
return errors.New("indexer entry: invalid signature")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (v IndexerRecordValidator) Select(_ string, values [][]byte) (int, error) {
|
||||
var newest time.Time
|
||||
index := 0
|
||||
for i, val := range values {
|
||||
var e liveIndexerEntry
|
||||
if err := json.Unmarshal(val, &e); err != nil {
|
||||
continue
|
||||
}
|
||||
if e.ExpiresAt.After(newest) {
|
||||
newest = e.ExpiresAt
|
||||
index = i
|
||||
}
|
||||
}
|
||||
return index, nil
|
||||
}
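The validator above expects the signature to cover the "PeerID|Addr|RegTimestamp" string. A sketch of the corresponding signing side could look like the following; the helper name is an assumption, and it relies on the crypto and fmt imports already present in this file.

// signIndexerEntry fills PubKey and Signature so IndexerRecordValidator.Validate
// will accept the entry. priv is the libp2p private key of the declared PeerID.
func signIndexerEntry(e *liveIndexerEntry, priv crypto.PrivKey) error {
	pubBytes, err := crypto.MarshalPublicKey(priv.GetPublic())
	if err != nil {
		return err
	}
	payload := []byte(fmt.Sprintf("%s|%s|%d", e.PeerID, e.Addr, e.RegTimestamp))
	sig, err := priv.Sign(payload)
	if err != nil {
		return err
	}
	e.PubKey = pubBytes
	e.Signature = sig
	return nil
}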
|
||||
|
||||
// InitNative registers native-specific stream handlers and starts background loops.
|
||||
// Must be called after DHT is initialized.
|
||||
func (ix *IndexerService) InitNative() {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
ix.Native = newNativeState(cancel)
|
||||
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat) // specific heartbeat for Indexer.
|
||||
ix.Host.SetStreamHandler(common.ProtocolNativeSubscription, ix.handleNativeSubscription)
|
||||
ix.Host.SetStreamHandler(common.ProtocolNativeUnsubscribe, ix.handleNativeUnsubscribe)
|
||||
ix.Host.SetStreamHandler(common.ProtocolNativeGetIndexers, ix.handleNativeGetIndexers)
|
||||
ix.Host.SetStreamHandler(common.ProtocolNativeConsensus, ix.handleNativeConsensus)
|
||||
ix.Host.SetStreamHandler(common.ProtocolNativeGetPeers, ix.handleNativeGetPeers)
|
||||
ix.Host.SetStreamHandler(common.ProtocolIndexerGetNatives, ix.handleGetNatives)
|
||||
ix.subscribeIndexerRegistry()
|
||||
// Ensure long connections to other configured natives (native-to-native mesh).
|
||||
common.EnsureNativePeers(ix.Host)
|
||||
go ix.runOffloadLoop(ctx)
|
||||
go ix.refreshIndexersFromDHT(ctx)
|
||||
}
|
||||
|
||||
// subscribeIndexerRegistry joins the PubSub topic used by natives to gossip newly
|
||||
// registered indexer PeerIDs to one another, enabling cross-native DHT discovery.
|
||||
func (ix *IndexerService) subscribeIndexerRegistry() {
|
||||
logger := oclib.GetLogger()
|
||||
ix.PS.RegisterTopicValidator(common.TopicIndexerRegistry, func(_ context.Context, _ pp.ID, msg *pubsub.Message) bool {
|
||||
// Parse as a signed IndexerRegistration.
|
||||
var reg common.IndexerRegistration
|
||||
if err := json.Unmarshal(msg.Data, ®); err != nil {
|
||||
return false
|
||||
}
|
||||
if reg.Addr == "" {
|
||||
return false
|
||||
}
|
||||
if _, err := pp.AddrInfoFromString(reg.Addr); err != nil {
|
||||
return false
|
||||
}
|
||||
// Verify the self-signature when present (rejects forged gossip from a
|
||||
// compromised native that does not control the announced PeerID).
|
||||
if ok, _ := reg.Verify(); !ok {
|
||||
return false
|
||||
}
|
||||
// Accept only messages from known native peers or from this host itself.
|
||||
// This prevents external PSK participants from injecting registry entries.
|
||||
from := msg.GetFrom()
|
||||
if from == ix.Host.ID() {
|
||||
return true
|
||||
}
|
||||
common.StreamNativeMu.RLock()
|
||||
_, knownNative := common.StaticNatives[from.String()]
|
||||
if !knownNative {
|
||||
for _, ad := range common.StaticNatives {
|
||||
if ad.ID == from {
|
||||
knownNative = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
common.StreamNativeMu.RUnlock()
|
||||
return knownNative
|
||||
})
|
||||
topic, err := ix.PS.Join(common.TopicIndexerRegistry)
|
||||
if err != nil {
|
||||
logger.Err(err).Msg("native: failed to join indexer registry topic")
|
||||
return
|
||||
}
|
||||
sub, err := topic.Subscribe()
|
||||
if err != nil {
|
||||
logger.Err(err).Msg("native: failed to subscribe to indexer registry topic")
|
||||
return
|
||||
}
|
||||
ix.PubsubMu.Lock()
|
||||
ix.LongLivedPubSubs[common.TopicIndexerRegistry] = topic
|
||||
ix.PubsubMu.Unlock()
|
||||
|
||||
go func() {
|
||||
for {
|
||||
msg, err := sub.Next(context.Background())
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// The gossip payload is a JSON-encoded IndexerRegistration (signed).
|
||||
var gossipReg common.IndexerRegistration
|
||||
if jsonErr := json.Unmarshal(msg.Data, &gossipReg); jsonErr != nil {
|
||||
continue
|
||||
}
|
||||
if gossipReg.Addr == "" || gossipReg.PeerID == "" {
|
||||
continue
|
||||
}
|
||||
// A neighbouring native registered this PeerID; add to known set for DHT refresh.
|
||||
ix.Native.knownMu.Lock()
|
||||
ix.Native.knownPeerIDs[gossipReg.PeerID] = gossipReg.Addr
|
||||
ix.Native.knownMu.Unlock()
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// handleNativeSubscription stores an indexer's alive registration in the local cache
|
||||
// immediately, then persists it to the DHT asynchronously.
|
||||
// The stream is temporary: indexer sends one IndexerRegistration and closes.
|
||||
func (ix *IndexerService) handleNativeSubscription(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
logger.Info().Msg("Subscription")
|
||||
for {
|
||||
var reg common.IndexerRegistration
|
||||
if err := json.NewDecoder(s).Decode(®); err != nil {
|
||||
logger.Err(err).Msg("native subscription: decode")
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
|
||||
strings.Contains(err.Error(), "reset") ||
|
||||
strings.Contains(err.Error(), "closed") ||
|
||||
strings.Contains(err.Error(), "too many connections") {
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
logger.Info().Msg("Subscription " + reg.Addr)
|
||||
|
||||
if reg.Addr == "" {
|
||||
logger.Error().Msg("native subscription: missing addr")
|
||||
return
|
||||
}
|
||||
if reg.PeerID == "" {
|
||||
ad, err := pp.AddrInfoFromString(reg.Addr)
|
||||
if err != nil {
|
||||
logger.Err(err).Msg("native subscription: invalid addr")
|
||||
return
|
||||
}
|
||||
reg.PeerID = ad.ID.String()
|
||||
}
|
||||
|
||||
// Reject registrations with an invalid self-signature.
|
||||
if ok, err := reg.Verify(); !ok {
|
||||
logger.Warn().Str("peer", reg.PeerID).Err(err).Msg("native subscription: invalid signature, rejecting")
|
||||
return
|
||||
}
|
||||
|
||||
// Build entry with a fresh TTL — must happen before the cache write so the
|
||||
// TTL window is not consumed by DHT retries.
|
||||
entry := &liveIndexerEntry{
|
||||
PeerID: reg.PeerID,
|
||||
Addr: reg.Addr,
|
||||
ExpiresAt: time.Now().UTC().Add(IndexerTTL),
|
||||
RegTimestamp: reg.Timestamp,
|
||||
PubKey: reg.PubKey,
|
||||
Signature: reg.Signature,
|
||||
FillRate: reg.FillRate,
|
||||
}
|
||||
|
||||
// Verify that the declared address is actually reachable before admitting
|
||||
// the registration. This async dial runs in the background; the indexer is
|
||||
// tentatively admitted immediately (so heartbeats don't get stuck) but is
|
||||
// evicted from the cache if the dial fails within 5 s.
|
||||
go func(e *liveIndexerEntry) {
|
||||
ad, err := pp.AddrInfoFromString(e.Addr)
|
||||
if err != nil {
|
||||
logger.Warn().Str("addr", e.Addr).Msg("native subscription: invalid addr during validation, rejecting")
|
||||
ix.Native.liveIndexersMu.Lock()
|
||||
if cur := ix.Native.liveIndexers[e.PeerID]; cur == e {
|
||||
delete(ix.Native.liveIndexers, e.PeerID)
|
||||
}
|
||||
ix.Native.liveIndexersMu.Unlock()
|
||||
return
|
||||
}
|
||||
dialCtx, dialCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer dialCancel()
|
||||
if err := ix.Host.Connect(dialCtx, *ad); err != nil {
|
||||
logger.Warn().Str("addr", e.Addr).Err(err).Msg("native subscription: declared address unreachable, rejecting")
|
||||
ix.Native.liveIndexersMu.Lock()
|
||||
if cur := ix.Native.liveIndexers[e.PeerID]; cur == e {
|
||||
delete(ix.Native.liveIndexers, e.PeerID)
|
||||
}
|
||||
ix.Native.liveIndexersMu.Unlock()
|
||||
}
|
||||
}(entry)
|
||||
|
||||
// Update local cache and known set immediately so concurrent GetIndexers calls
|
||||
// can already see this indexer without waiting for the DHT write to complete.
|
||||
ix.Native.liveIndexersMu.Lock()
|
||||
_, isRenewal := ix.Native.liveIndexers[reg.PeerID]
|
||||
ix.Native.liveIndexers[reg.PeerID] = entry
|
||||
ix.Native.liveIndexersMu.Unlock()
|
||||
|
||||
ix.Native.knownMu.Lock()
|
||||
ix.Native.knownPeerIDs[reg.PeerID] = reg.Addr
|
||||
ix.Native.knownMu.Unlock()
|
||||
|
||||
// Gossip the signed registration to neighbouring natives.
|
||||
// The payload is JSON-encoded so the receiver can verify the self-signature.
|
||||
ix.PubsubMu.RLock()
|
||||
topic := ix.LongLivedPubSubs[common.TopicIndexerRegistry]
|
||||
ix.PubsubMu.RUnlock()
|
||||
if topic != nil {
|
||||
if gossipData, marshalErr := json.Marshal(reg); marshalErr == nil {
|
||||
if err := topic.Publish(context.Background(), gossipData); err != nil {
|
||||
logger.Err(err).Msg("native subscription: registry gossip publish")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if isRenewal {
|
||||
// logger.Debug().Str("peer", reg.PeerID).Msg("native: indexer TTL renewed : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
|
||||
} else {
|
||||
logger.Info().Str("peer", reg.PeerID).Msg("native: indexer registered : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
|
||||
}
|
||||
|
||||
// Persist in DHT asynchronously with bounded retry.
|
||||
// Max retry window = IndexerTTL (90 s) — retrying past entry expiry is pointless.
|
||||
// Backoff: 10 s → 20 s → 40 s, then repeats at 40 s until deadline.
|
||||
key := ix.genIndexerKey(reg.PeerID)
|
||||
data, err := json.Marshal(entry)
|
||||
if err != nil {
|
||||
logger.Err(err).Msg("native subscription: marshal entry")
|
||||
return
|
||||
}
|
||||
go func() {
|
||||
deadline := time.Now().Add(IndexerTTL)
|
||||
backoff := 10 * time.Second
|
||||
for {
|
||||
if time.Now().After(deadline) {
|
||||
logger.Warn().Str("key", key).Msg("native subscription: DHT put abandoned, entry TTL exceeded")
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
err := ix.DHT.PutValue(ctx, key, data)
|
||||
cancel()
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
logger.Err(err).Msg("native subscription: DHT put " + key)
|
||||
if !strings.Contains(err.Error(), "failed to find any peer in table") {
|
||||
return // non-retryable error
|
||||
}
|
||||
remaining := time.Until(deadline)
|
||||
if backoff > remaining {
|
||||
backoff = remaining
|
||||
}
|
||||
if backoff <= 0 {
|
||||
return
|
||||
}
|
||||
time.Sleep(backoff)
|
||||
if backoff < 40*time.Second {
|
||||
backoff *= 2
|
||||
}
|
||||
}
|
||||
}()
|
||||
break
|
||||
}
|
||||
}
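The bounded-retry pattern used for the DHT put above, pulled out as a standalone sketch for clarity (the function and its parameters are illustrative, not part of the diff). Called with firstSleep=10s, maxSleep=40s and deadline=now+IndexerTTL it follows the same 10s, 20s, 40s schedule.

// retryWithBackoff retries op with doubling sleeps (capped at maxSleep) until it
// succeeds, reports a non-retryable error, or the overall deadline is exceeded.
func retryWithBackoff(deadline time.Time, firstSleep, maxSleep time.Duration, op func() (retryable bool, err error)) error {
	sleep := firstSleep
	for {
		retryable, err := op()
		if err == nil || !retryable {
			return err
		}
		remaining := time.Until(deadline)
		if remaining <= 0 {
			return err
		}
		if sleep > remaining {
			sleep = remaining
		}
		time.Sleep(sleep)
		if sleep < maxSleep {
			sleep *= 2
		}
	}
}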
|
||||
|
||||
// handleNativeUnsubscribe removes a departing indexer from the local cache and
|
||||
// known set immediately, without waiting for TTL expiry.
|
||||
func (ix *IndexerService) handleNativeUnsubscribe(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
var reg common.IndexerRegistration
|
||||
if err := json.NewDecoder(s).Decode(®); err != nil {
|
||||
logger.Err(err).Msg("native unsubscribe: decode")
|
||||
return
|
||||
}
|
||||
if reg.PeerID == "" {
|
||||
logger.Warn().Msg("native unsubscribe: missing peer_id")
|
||||
return
|
||||
}
|
||||
ix.Native.liveIndexersMu.Lock()
|
||||
delete(ix.Native.liveIndexers, reg.PeerID)
|
||||
ix.Native.liveIndexersMu.Unlock()
|
||||
ix.Native.knownMu.Lock()
|
||||
delete(ix.Native.knownPeerIDs, reg.PeerID)
|
||||
ix.Native.knownMu.Unlock()
|
||||
logger.Info().Str("peer", reg.PeerID).Msg("native: indexer explicitly unregistered")
|
||||
}
|
||||
|
||||
// handleNativeGetIndexers returns this native's own list of reachable indexers.
|
||||
// Self-delegation (native acting as temporary fallback indexer) is only permitted
|
||||
// for nodes — never for peers that are themselves registered indexers in knownPeerIDs.
|
||||
// The consensus across natives is the responsibility of the requesting node/indexer.
|
||||
func (ix *IndexerService) handleNativeGetIndexers(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
for {
|
||||
var req common.GetIndexersRequest
|
||||
if err := json.NewDecoder(s).Decode(&req); err != nil {
|
||||
logger.Err(err).Msg("native get indexers: decode")
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
|
||||
strings.Contains(err.Error(), "reset") ||
|
||||
strings.Contains(err.Error(), "closed") ||
|
||||
strings.Contains(err.Error(), "too many connections") {
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
if req.Count <= 0 {
|
||||
req.Count = 3
|
||||
}
|
||||
callerPeerID := s.Conn().RemotePeer().String()
|
||||
reachable := ix.reachableLiveIndexers(req.Count, callerPeerID)
|
||||
var resp common.GetIndexersResponse
|
||||
|
||||
if len(reachable) == 0 {
|
||||
// No live indexers reachable — try to self-delegate.
|
||||
if ix.selfDelegate(s.Conn().RemotePeer(), &resp) {
|
||||
logger.Info().Str("peer", callerPeerID).Msg("native: no indexers, acting as fallback for node")
|
||||
} else {
|
||||
// Fallback pool saturated: return empty so the caller retries another
|
||||
// native instead of piling more load onto this one.
|
||||
logger.Warn().Str("peer", callerPeerID).Int("pool", maxFallbackPeers).Msg(
|
||||
"native: fallback pool saturated, refusing self-delegation")
|
||||
}
|
||||
} else {
|
||||
// Sort by fill rate ascending so less-full indexers are preferred for routing.
|
||||
ix.Native.liveIndexersMu.RLock()
|
||||
fillRates := make(map[string]float64, len(reachable))
|
||||
for _, addr := range reachable {
|
||||
ad, err := pp.AddrInfoFromString(addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
for _, e := range ix.Native.liveIndexers {
|
||||
if e.PeerID == ad.ID.String() {
|
||||
fillRates[addr] = e.FillRate
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
ix.Native.liveIndexersMu.RUnlock()
|
||||
|
||||
// Sort by routing weight descending: weight = fillRate × (1 − fillRate).
|
||||
// This prefers indexers in the "trust sweet spot" — proven popular (fillRate > 0)
|
||||
// but not saturated (fillRate < 1). Peak at fillRate ≈ 0.5.
|
||||
routingWeight := func(addr string) float64 {
|
||||
f := fillRates[addr]
|
||||
return f * (1 - f)
|
||||
}
|
||||
for i := 1; i < len(reachable); i++ {
|
||||
for j := i; j > 0 && routingWeight(reachable[j]) > routingWeight(reachable[j-1]); j-- {
|
||||
reachable[j], reachable[j-1] = reachable[j-1], reachable[j]
|
||||
}
|
||||
}
|
||||
if req.Count > len(reachable) {
|
||||
req.Count = len(reachable)
|
||||
}
|
||||
resp.Indexers = reachable[:req.Count]
|
||||
resp.FillRates = fillRates
|
||||
}
|
||||
|
||||
if err := json.NewEncoder(s).Encode(resp); err != nil {
|
||||
logger.Err(err).Msg("native get indexers: encode response")
|
||||
}
|
||||
break
|
||||
}
|
||||
}
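A quick numeric check of the fillRate x (1 - fillRate) weighting used above (a sketch; the printed values are just arithmetic, not measurements):

func demoRoutingWeight() {
	for _, f := range []float64{0.0, 0.1, 0.5, 0.9, 1.0} {
		fmt.Printf("fillRate %.1f -> weight %.2f\n", f, f*(1-f))
	}
	// fillRate 0.0 -> 0.00 (unproven), 0.5 -> 0.25 (sweet spot), 1.0 -> 0.00 (saturated)
}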
|
||||
|
||||
// handleNativeConsensus answers a consensus challenge from a node/indexer.
|
||||
// It returns:
|
||||
// - Trusted: which of the candidates it considers alive.
|
||||
// - Suggestions: extras it knows and trusts that were not in the candidate list.
|
||||
func (ix *IndexerService) handleNativeConsensus(s network.Stream) {
|
||||
defer s.Close()
|
||||
logger := oclib.GetLogger()
|
||||
for {
|
||||
var req common.ConsensusRequest
|
||||
if err := json.NewDecoder(s).Decode(&req); err != nil {
|
||||
logger.Err(err).Msg("native consensus: decode")
|
||||
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
|
||||
strings.Contains(err.Error(), "reset") ||
|
||||
strings.Contains(err.Error(), "closed") ||
|
||||
strings.Contains(err.Error(), "too many connections") {
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
myList := ix.reachableLiveIndexers(-1, s.Conn().RemotePeer().String())
|
||||
mySet := make(map[string]struct{}, len(myList))
|
||||
for _, addr := range myList {
|
||||
mySet[addr] = struct{}{}
|
||||
}
|
||||
|
||||
trusted := []string{}
|
||||
candidateSet := make(map[string]struct{}, len(req.Candidates))
|
||||
for _, addr := range req.Candidates {
|
||||
candidateSet[addr] = struct{}{}
|
||||
if _, ok := mySet[addr]; ok {
|
||||
trusted = append(trusted, addr) // candidate we also confirm as reachable
|
||||
}
|
||||
}
|
||||
|
||||
// Extras we trust but that the requester didn't include → suggestions.
|
||||
suggestions := []string{}
|
||||
for _, addr := range myList {
|
||||
if _, inCandidates := candidateSet[addr]; !inCandidates {
|
||||
suggestions = append(suggestions, addr)
|
||||
}
|
||||
}
|
||||
|
||||
resp := common.ConsensusResponse{Trusted: trusted, Suggestions: suggestions}
|
||||
if err := json.NewEncoder(s).Encode(resp); err != nil {
|
||||
logger.Err(err).Msg("native consensus: encode response")
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// selfDelegate marks the caller as a responsible peer and exposes this native's own
|
||||
// address as its temporary indexer. Returns false when the fallback pool is saturated
|
||||
// (maxFallbackPeers reached) — the caller must return an empty response so the node
|
||||
// retries later instead of pinning indefinitely to an overloaded native.
|
||||
func (ix *IndexerService) selfDelegate(remotePeer pp.ID, resp *common.GetIndexersResponse) bool {
|
||||
ix.Native.responsibleMu.Lock()
|
||||
defer ix.Native.responsibleMu.Unlock()
|
||||
if len(ix.Native.responsiblePeers) >= maxFallbackPeers {
|
||||
return false
|
||||
}
|
||||
ix.Native.responsiblePeers[remotePeer] = struct{}{}
|
||||
resp.IsSelfFallback = true
|
||||
resp.Indexers = []string{ix.Host.Addrs()[len(ix.Host.Addrs())-1].String() + "/p2p/" + ix.Host.ID().String()}
|
||||
return true
|
||||
}
|
||||
|
||||
// reachableLiveIndexers returns the multiaddrs of non-expired, pingable indexers
|
||||
// from the local cache (kept fresh by refreshIndexersFromDHT in background).
|
||||
func (ix *IndexerService) reachableLiveIndexers(count int, from ...string) []string {
|
||||
ix.Native.liveIndexersMu.RLock()
|
||||
now := time.Now().UTC()
|
||||
candidates := []*liveIndexerEntry{}
|
||||
for _, e := range ix.Native.liveIndexers {
|
||||
fmt.Println("liveIndexers", slices.Contains(from, e.PeerID), from, e.PeerID)
|
||||
if e.ExpiresAt.After(now) && !slices.Contains(from, e.PeerID) {
|
||||
candidates = append(candidates, e)
|
||||
}
|
||||
}
|
||||
ix.Native.liveIndexersMu.RUnlock()
|
||||
|
||||
fmt.Println("midway...", candidates, from, ix.Native.knownPeerIDs)
|
||||
|
||||
if (count > 0 && len(candidates) < count) || count < 0 {
|
||||
ix.Native.knownMu.RLock()
|
||||
for k, v := range ix.Native.knownPeerIDs {
|
||||
// Include peers whose liveIndexers entry is absent OR expired.
|
||||
// A non-nil but expired entry means the peer was once known but
|
||||
// has since timed out — PeerIsAlive below will decide if it's back.
|
||||
fmt.Println("knownPeerIDs", slices.Contains(from, k), from, k)
|
||||
if !slices.Contains(from, k) {
|
||||
candidates = append(candidates, &liveIndexerEntry{
|
||||
PeerID: k,
|
||||
Addr: v,
|
||||
})
|
||||
}
|
||||
}
|
||||
ix.Native.knownMu.RUnlock()
|
||||
}
|
||||
|
||||
fmt.Println("midway...1", candidates)
|
||||
|
||||
reachable := []string{}
|
||||
for _, e := range candidates {
|
||||
ad, err := pp.AddrInfoFromString(e.Addr)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if common.PeerIsAlive(ix.Host, *ad) {
|
||||
reachable = append(reachable, e.Addr)
|
||||
}
|
||||
}
|
||||
return reachable
|
||||
}
|
||||
|
||||
// refreshIndexersFromDHT runs in background and queries the shared DHT for every known
|
||||
// indexer PeerID whose local cache entry is missing or expired. This supplements the
|
||||
// local cache with entries written by neighbouring natives.
|
||||
func (ix *IndexerService) refreshIndexersFromDHT(ctx context.Context) {
|
||||
t := time.NewTicker(dhtRefreshInterval)
|
||||
defer t.Stop()
|
||||
logger := oclib.GetLogger()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
}
|
||||
ix.Native.knownMu.RLock()
|
||||
peerIDs := make([]string, 0, len(ix.Native.knownPeerIDs))
|
||||
for pid := range ix.Native.knownPeerIDs {
|
||||
peerIDs = append(peerIDs, pid)
|
||||
}
|
||||
ix.Native.knownMu.RUnlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
for _, pid := range peerIDs {
|
||||
ix.Native.liveIndexersMu.RLock()
|
||||
existing := ix.Native.liveIndexers[pid]
|
||||
ix.Native.liveIndexersMu.RUnlock()
|
||||
if existing != nil && existing.ExpiresAt.After(now) {
|
||||
continue // still fresh in local cache
|
||||
}
|
||||
key := ix.genIndexerKey(pid)
|
||||
dhtCtx, dhtCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
ch, err := ix.DHT.SearchValue(dhtCtx, key)
|
||||
if err != nil {
|
||||
dhtCancel()
|
||||
continue
|
||||
}
|
||||
var best *liveIndexerEntry
|
||||
for b := range ch {
|
||||
var e liveIndexerEntry
|
||||
if err := json.Unmarshal(b, &e); err != nil {
|
||||
continue
|
||||
}
|
||||
if e.ExpiresAt.After(time.Now().UTC()) {
|
||||
if best == nil || e.ExpiresAt.After(best.ExpiresAt) {
|
||||
best = &e
|
||||
}
|
||||
}
|
||||
}
|
||||
dhtCancel()
|
||||
if best != nil {
|
||||
ix.Native.liveIndexersMu.Lock()
|
||||
ix.Native.liveIndexers[best.PeerID] = best
|
||||
ix.Native.liveIndexersMu.Unlock()
|
||||
logger.Info().Str("peer", best.PeerID).Msg("native: refreshed indexer from DHT")
|
||||
} else {
|
||||
// DHT has no fresh entry — peer is gone, prune from known set.
|
||||
ix.Native.knownMu.Lock()
|
||||
delete(ix.Native.knownPeerIDs, pid)
|
||||
ix.Native.knownMu.Unlock()
|
||||
logger.Info().Str("peer", pid).Msg("native: pruned stale peer from knownPeerIDs")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ix *IndexerService) genIndexerKey(peerID string) string {
|
||||
return "/indexer/" + peerID
|
||||
}
|
||||
|
||||
// runOffloadLoop periodically checks if real indexers are available and releases
|
||||
// responsible peers so they can reconnect to actual indexers on their next attempt.
|
||||
func (ix *IndexerService) runOffloadLoop(ctx context.Context) {
|
||||
t := time.NewTicker(offloadInterval)
|
||||
defer t.Stop()
|
||||
logger := oclib.GetLogger()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-t.C:
|
||||
}
|
||||
fmt.Println("runOffloadLoop", ix.Native.responsiblePeers)
|
||||
ix.Native.responsibleMu.RLock()
|
||||
count := len(ix.Native.responsiblePeers)
|
||||
ix.Native.responsibleMu.RUnlock()
|
||||
if count == 0 {
|
||||
continue
|
||||
}
|
||||
ix.Native.responsibleMu.RLock()
|
||||
peerIDS := []string{}
|
||||
for p := range ix.Native.responsiblePeers {
|
||||
peerIDS = append(peerIDS, p.String())
|
||||
}
|
||||
fmt.Println("COUNT --> ", count, len(ix.reachableLiveIndexers(-1, peerIDS...)))
|
||||
ix.Native.responsibleMu.RUnlock()
|
||||
if len(ix.reachableLiveIndexers(-1, peerIDS...)) > 0 {
|
||||
ix.Native.responsibleMu.RLock()
|
||||
released := ix.Native.responsiblePeers
|
||||
ix.Native.responsibleMu.RUnlock()
|
||||
|
||||
// Reset (not Close) heartbeat streams of released peers.
|
||||
// Close() only half-closes the native's write direction — the peer's write
|
||||
// direction stays open and sendHeartbeat never sees an error.
|
||||
// Reset() abruptly terminates both directions, making the peer's next
|
||||
// json.Encode return an error which triggers replenishIndexersFromNative.
|
||||
ix.StreamMU.Lock()
|
||||
if streams := ix.StreamRecords[common.ProtocolHeartbeat]; streams != nil {
|
||||
for pid := range released {
|
||||
if rec, ok := streams[pid]; ok {
|
||||
if rec.HeartbeatStream != nil && rec.HeartbeatStream.Stream != nil {
|
||||
rec.HeartbeatStream.Stream.Reset()
|
||||
}
|
||||
ix.Native.responsibleMu.Lock()
|
||||
delete(ix.Native.responsiblePeers, pid)
|
||||
ix.Native.responsibleMu.Unlock()
|
||||
|
||||
delete(streams, pid)
|
||||
logger.Info().Str("peer", pid.String()).Str("proto", string(common.ProtocolHeartbeat)).Msg(
|
||||
"native: offload — stream reset, peer will reconnect to real indexer")
|
||||
} else {
|
||||
// No recorded heartbeat stream for this peer: either it never
|
||||
// passed the score check (new peer, uptime=0 → score<75) or the
|
||||
// stream was GC'd. We cannot send a Reset signal, so close the
|
||||
// whole connection instead — this makes the peer's sendHeartbeat
|
||||
// return an error, which triggers replenishIndexersFromNative and
|
||||
// migrates it to a real indexer.
|
||||
ix.Native.responsibleMu.Lock()
|
||||
delete(ix.Native.responsiblePeers, pid)
|
||||
ix.Native.responsibleMu.Unlock()
|
||||
go ix.Host.Network().ClosePeer(pid)
|
||||
logger.Info().Str("peer", pid.String()).Msg(
|
||||
"native: offload — no heartbeat stream, closing connection so peer re-requests real indexers")
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
ix.StreamMU.Unlock()
|
||||
|
||||
logger.Info().Int("released", count).Msg("native: offloaded responsible peers to real indexers")
|
||||
}
|
||||
}
|
||||
}
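The Reset-versus-Close distinction called out in the offload loop can be summarised with a node-side sketch (the loop is hypothetical; only the libp2p stream semantics are the point):

// sendHeartbeats keeps writing on the stream. If the native only calls Close()
// on its side, writes here keep succeeding (half-close); after Reset() they
// fail, which is what lets the node notice and re-request a real indexer.
func sendHeartbeats(s network.Stream, rec PeerRecord, every time.Duration) error {
	enc := json.NewEncoder(s)
	for {
		if err := enc.Encode(rec); err != nil {
			return err // stream was Reset by the remote, or the connection died
		}
		time.Sleep(every)
	}
}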
|
||||
|
||||
// handleNativeGetPeers returns a random selection of this native's known native
// contacts, excluding any in the request's Exclude list.
func (ix *IndexerService) handleNativeGetPeers(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	for {
		var req common.GetNativePeersRequest
		if err := json.NewDecoder(s).Decode(&req); err != nil {
			logger.Err(err).Msg("native get peers: decode")
			if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
				strings.Contains(err.Error(), "reset") ||
				strings.Contains(err.Error(), "closed") ||
				strings.Contains(err.Error(), "too many connections") {
				return
			}
			continue
		}
		if req.Count <= 0 {
			req.Count = 1
		}

		excludeSet := make(map[string]struct{}, len(req.Exclude))
		for _, e := range req.Exclude {
			excludeSet[e] = struct{}{}
		}

		common.StreamNativeMu.RLock()
		candidates := make([]string, 0, len(common.StaticNatives))
		for addr := range common.StaticNatives {
			if _, excluded := excludeSet[addr]; !excluded {
				candidates = append(candidates, addr)
			}
		}
		common.StreamNativeMu.RUnlock()

		rand.Shuffle(len(candidates), func(i, j int) { candidates[i], candidates[j] = candidates[j], candidates[i] })
		if req.Count > len(candidates) {
			req.Count = len(candidates)
		}

		resp := common.GetNativePeersResponse{Peers: candidates[:req.Count]}
		if err := json.NewEncoder(s).Encode(resp); err != nil {
			logger.Err(err).Msg("native get peers: encode response")
		}
		break
	}
}

// StartNativeRegistration starts a goroutine that periodically registers this
// indexer with all configured native indexers (every RecommendedHeartbeatInterval).
228
daemons/node/indexer/search.go
Normal file
228
daemons/node/indexer/search.go
Normal file
@@ -0,0 +1,228 @@
package indexer

import (
	"context"
	"encoding/json"
	"strings"
	"time"

	"oc-discovery/conf"
	"oc-discovery/daemons/node/common"

	oclib "cloud.o-forge.io/core/oc-lib"
	pp "github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/core/network"
)

const TopicSearchPeer = "oc-search-peer"

// searchTimeout returns the configured search timeout, defaulting to 5s.
func searchTimeout() time.Duration {
	if t := conf.GetConfig().SearchTimeout; t > 0 {
		return time.Duration(t) * time.Second
	}
	return 5 * time.Second
}

// initSearchHandlers registers ProtocolSearchPeer and ProtocolSearchPeerResponse
// and subscribes to TopicSearchPeer on GossipSub.
func (ix *IndexerService) initSearchHandlers() {
	ix.Host.SetStreamHandler(common.ProtocolSearchPeer, ix.handleSearchPeer)
	ix.Host.SetStreamHandler(common.ProtocolSearchPeerResponse, ix.handleSearchPeerResponse)
	ix.initSearchSubscription()
}

// updateReferent is called from HandleHeartbeat when the Referent flag changes.
// If referent=true the node is added to referencedNodes; if false it is removed.
func (ix *IndexerService) updateReferent(pid pp.ID, rec PeerRecord, referent bool) {
	ix.referencedNodesMu.Lock()
	defer ix.referencedNodesMu.Unlock()
	if referent {
		ix.referencedNodes[pid] = rec
	} else {
		delete(ix.referencedNodes, pid)
	}
}

// searchReferenced looks up nodes in referencedNodes matching the query.
// Matches on peerID (exact), DID (exact), or name (case-insensitive contains).
func (ix *IndexerService) searchReferenced(peerID, did, name string) []common.SearchHit {
	ix.referencedNodesMu.RLock()
	defer ix.referencedNodesMu.RUnlock()
	nameLow := strings.ToLower(name)
	var hits []common.SearchHit
	for pid, rec := range ix.referencedNodes {
		pidStr := pid.String()
		matchPeerID := peerID != "" && pidStr == peerID
		matchDID := did != "" && rec.DID == did
		matchName := name != "" && strings.Contains(strings.ToLower(rec.Name), nameLow)
		if matchPeerID || matchDID || matchName {
			hits = append(hits, common.SearchHit{
				PeerID: pidStr,
				DID:    rec.DID,
				Name:   rec.Name,
			})
		}
	}
	return hits
}

// handleSearchPeer is the ProtocolSearchPeer handler.
// The node opens this stream, sends a SearchPeerRequest, and reads results
// as they stream in. The stream stays open until timeout or the node closes it.
func (ix *IndexerService) handleSearchPeer(s network.Stream) {
	logger := oclib.GetLogger()
	defer s.Reset()

	var req common.SearchPeerRequest
	if err := json.NewDecoder(s).Decode(&req); err != nil || req.QueryID == "" {
		return
	}

	// streamCtx is cancelled when the node closes its end of the stream.
	streamCtx, streamCancel := context.WithCancel(context.Background())
	go func() {
		// Block until the stream is reset/closed, then cancel our context.
		buf := make([]byte, 1)
		s.Read(buf) //nolint:errcheck — we only care about EOF/reset
		streamCancel()
	}()
	defer streamCancel()

	resultCh := make(chan []common.SearchHit, 16)
	ix.pendingSearchesMu.Lock()
	ix.pendingSearches[req.QueryID] = resultCh
	ix.pendingSearchesMu.Unlock()
	defer func() {
		ix.pendingSearchesMu.Lock()
		delete(ix.pendingSearches, req.QueryID)
		ix.pendingSearchesMu.Unlock()
	}()

	// Check own referencedNodes immediately.
	if hits := ix.searchReferenced(req.PeerID, req.DID, req.Name); len(hits) > 0 {
		resultCh <- hits
	}

	// Broadcast search on GossipSub so other indexers can respond.
	ix.publishSearchQuery(req.QueryID, req.PeerID, req.DID, req.Name)

	// Stream results back to node as they arrive; reset idle timer on each result.
	enc := json.NewEncoder(s)
	idleTimer := time.NewTimer(searchTimeout())
	defer idleTimer.Stop()
	for {
		select {
		case hits := <-resultCh:
			if err := enc.Encode(common.SearchPeerResult{QueryID: req.QueryID, Records: hits}); err != nil {
				logger.Debug().Err(err).Msg("[search] stream write failed")
				return
			}
			// Reset idle timeout: keep alive as long as results trickle in.
			if !idleTimer.Stop() {
				select {
				case <-idleTimer.C:
				default:
				}
			}
			idleTimer.Reset(searchTimeout())
		case <-idleTimer.C:
			// No new result within timeout — close gracefully.
			return
		case <-streamCtx.Done():
			// Node closed the stream (new search superseded this one).
			return
		}
	}
}

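The matching node-side consumer is not part of this file. A minimal sketch, assuming the request/response types above, a libp2p host import ("github.com/libp2p/go-libp2p/core/host"), and a helper name (searchViaIndexer) invented for illustration:

// searchViaIndexer opens a ProtocolSearchPeer stream, sends one request and
// drains streamed SearchPeerResult frames until the indexer ends the stream
// (idle timeout) or resets it.
func searchViaIndexer(ctx context.Context, h host.Host, indexer pp.ID, req common.SearchPeerRequest) []common.SearchHit {
	s, err := h.NewStream(ctx, indexer, common.ProtocolSearchPeer)
	if err != nil {
		return nil
	}
	defer s.Reset()
	if err := json.NewEncoder(s).Encode(req); err != nil {
		return nil
	}
	var all []common.SearchHit
	dec := json.NewDecoder(s)
	for {
		var res common.SearchPeerResult
		if err := dec.Decode(&res); err != nil {
			return all // EOF or reset: the indexer closed the search
		}
		all = append(all, res.Records...)
	}
}
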
// handleSearchPeerResponse is the ProtocolSearchPeerResponse handler.
// Another indexer opens this stream to deliver hits for a pending queryID.
func (ix *IndexerService) handleSearchPeerResponse(s network.Stream) {
	defer s.Reset()
	var result common.SearchPeerResult
	if err := json.NewDecoder(s).Decode(&result); err != nil || result.QueryID == "" {
		return
	}
	ix.pendingSearchesMu.Lock()
	ch := ix.pendingSearches[result.QueryID]
	ix.pendingSearchesMu.Unlock()
	if ch != nil {
		select {
		case ch <- result.Records:
		default: // channel full, drop — node may be slow
		}
	}
}

// publishSearchQuery broadcasts a SearchQuery on TopicSearchPeer.
func (ix *IndexerService) publishSearchQuery(queryID, peerID, did, name string) {
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.RLock()
	topic := ix.LongLivedStreamRecordedService.LongLivedPubSubService.LongLivedPubSubs[TopicSearchPeer]
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.RUnlock()
	if topic == nil {
		return
	}
	q := common.SearchQuery{
		QueryID:   queryID,
		PeerID:    peerID,
		DID:       did,
		Name:      name,
		EmitterID: ix.Host.ID().String(),
	}
	b, err := json.Marshal(q)
	if err != nil {
		return
	}
	_ = topic.Publish(context.Background(), b)
}

// initSearchSubscription joins TopicSearchPeer and dispatches incoming queries.
func (ix *IndexerService) initSearchSubscription() {
	logger := oclib.GetLogger()
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.Lock()
	topic, err := ix.PS.Join(TopicSearchPeer)
	if err != nil {
		ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.Unlock()
		logger.Err(err).Msg("[search] failed to join search topic")
		return
	}
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.LongLivedPubSubs[TopicSearchPeer] = topic
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.Unlock()

	common.SubscribeEvents(
		ix.LongLivedStreamRecordedService.LongLivedPubSubService,
		context.Background(),
		TopicSearchPeer,
		-1,
		func(_ context.Context, q common.SearchQuery, _ string) {
			ix.onSearchQuery(q)
		},
	)
}

// onSearchQuery handles an incoming GossipSub search broadcast.
// If we have matching referencedNodes, we respond to the emitting indexer.
func (ix *IndexerService) onSearchQuery(q common.SearchQuery) {
	// Don't respond to our own broadcasts.
	if q.EmitterID == ix.Host.ID().String() {
		return
	}
	hits := ix.searchReferenced(q.PeerID, q.DID, q.Name)
	if len(hits) == 0 {
		return
	}
	emitterID, err := pp.Decode(q.EmitterID)
	if err != nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	s, err := ix.Host.NewStream(ctx, emitterID, common.ProtocolSearchPeerResponse)
	if err != nil {
		return
	}
	defer s.Reset()
	s.SetDeadline(time.Now().Add(5 * time.Second))
	json.NewEncoder(s).Encode(common.SearchPeerResult{QueryID: q.QueryID, Records: hits})
}

@@ -2,10 +2,14 @@ package indexer

import (
	"context"
	"encoding/json"
	"errors"
	"math/rand"
	"oc-discovery/conf"
	"oc-discovery/daemons/node/common"
	"strings"
	"sync"
	"time"

	oclib "cloud.o-forge.io/core/oc-lib"
	dht "github.com/libp2p/go-libp2p-kad-dht"
@@ -15,28 +19,70 @@ import (
	pp "github.com/libp2p/go-libp2p/core/peer"
)

// dhtCacheEntry holds one indexer discovered via DHT for use in suggestion responses.
type dhtCacheEntry struct {
	AI       pp.AddrInfo
	LastSeen time.Time
}

// offloadState tracks which nodes we've already proposed migration to.
// When an indexer is overloaded (fill rate > offloadThreshold) it only sends
// SuggestMigrate to a small batch at a time; peers that don't migrate within
// offloadGracePeriod are moved to alreadyTried so a new batch can be picked.
type offloadState struct {
	inBatch      map[pp.ID]time.Time // peer → time added to current batch
	alreadyTried map[pp.ID]struct{}  // peers proposed to that didn't migrate
	mu           sync.Mutex
}

const (
	offloadThreshold   = 0.80 // fill rate above which to start offloading
	offloadBatchSize   = 5    // max concurrent "please migrate" proposals
	offloadGracePeriod = 3 * common.RecommendedHeartbeatInterval
)

// IndexerService manages the indexer node's state: stream records, DHT, pubsub.
type IndexerService struct {
	*common.LongLivedStreamRecordedService[PeerRecord]
	PS              *pubsub.PubSub
	DHT             *dht.IpfsDHT
	isStrictIndexer bool
	mu              sync.RWMutex
	IsNative        bool
	Native          *NativeState // non-nil when IsNative == true
	nameIndex       *nameIndexState
	PS              *pubsub.PubSub
	DHT             *dht.IpfsDHT
	isStrictIndexer bool
	mu              sync.RWMutex
	nameIndex       *nameIndexState
	dhtProvideCancel context.CancelFunc
	bornAt           time.Time
	// Passive DHT cache: refreshed every 2 min in background, used for suggestions.
	dhtCache   []dhtCacheEntry
	dhtCacheMu sync.RWMutex
	// Offload state for overloaded-indexer migration proposals.
	offload offloadState
	// referencedNodes holds nodes that have designated this indexer as their
	// search referent (Heartbeat.Referent=true). Used for distributed search.
	referencedNodes   map[pp.ID]PeerRecord
	referencedNodesMu sync.RWMutex
	// pendingSearches maps queryID → result channel for in-flight searches.
	pendingSearches   map[string]chan []common.SearchHit
	pendingSearchesMu sync.Mutex
	// behavior tracks per-node compliance (heartbeat rate, publish/get volume,
	// identity consistency, signature failures).
	behavior *NodeBehaviorTracker
	// connGuard limits new-connection bursts to protect public indexers.
	connGuard *ConnectionRateGuard
}

// NewIndexerService creates an IndexerService.
// If ps is nil, this is a strict indexer (no pre-existing gossip sub from a node).
func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative bool) *IndexerService {
func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerService {
	logger := oclib.GetLogger()
	logger.Info().Msg("open indexer mode...")
	var err error
	ix := &IndexerService{
		LongLivedStreamRecordedService: common.NewStreamRecordedService[PeerRecord](h, maxNode),
		isStrictIndexer:                ps == nil,
		IsNative:                       isNative,
		referencedNodes:                map[pp.ID]PeerRecord{},
		pendingSearches:                map[string]chan []common.SearchHit{},
		behavior:                       newNodeBehaviorTracker(),
		connGuard:                      newConnectionRateGuard(),
	}
	if ps == nil {
		ps, err = pubsub.NewGossipSub(context.Background(), ix.Host)
@@ -46,25 +92,45 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
	}
	ix.PS = ps

	if ix.isStrictIndexer && !isNative {
	if ix.isStrictIndexer {
		logger.Info().Msg("connect to indexers as strict indexer...")
		common.ConnectToIndexers(h, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer, ix.Host.ID())
		common.ConnectToIndexers(h, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer*2)
		logger.Info().Msg("subscribe to decentralized search flow as strict indexer...")
		go ix.SubscribeToSearch(ix.PS, nil)
	}

	if !isNative {
		logger.Info().Msg("init distributed name index...")
		ix.initNameIndex(ps)
		ix.LongLivedStreamRecordedService.AfterDelete = func(pid pp.ID, name, did string) {
			ix.publishNameEvent(NameIndexDelete, name, pid.String(), did)
		}
	logger.Info().Msg("init distributed name index...")
	ix.initNameIndex(ps)
	ix.LongLivedStreamRecordedService.AfterDelete = func(pid pp.ID, name, did string) {
		ix.publishNameEvent(NameIndexDelete, name, pid.String(), did)
		// Remove behavior state for peers that are no longer connected and
		// have no active ban — keeps memory bounded to the live node set.
		ix.behavior.Cleanup(pid)
	}

	// Parse bootstrap peers from configured native/indexer addresses so that the
	// DHT can find its routing table entries even in a fresh deployment.
	// AllowInbound: fired once per stream open, before any heartbeat is decoded.
	// 1. Reject peers that are currently banned (behavioral strikes).
	// 2. For genuinely new connections, apply the burst guard.
	ix.AllowInbound = func(remotePeer pp.ID, isNew bool) error {
		if ix.behavior.IsBanned(remotePeer) {
			return errors.New("peer is banned")
		}
		if isNew && !ix.connGuard.Allow() {
			return errors.New("connection rate limit exceeded, retry later")
		}
		return nil
	}

	// ValidateHeartbeat: fired on every heartbeat tick for an established stream.
	// Checks heartbeat cadence — rejects if the node is sending too fast.
	ix.ValidateHeartbeat = func(remotePeer pp.ID) error {
		return ix.behavior.RecordHeartbeat(remotePeer)
	}

	// Parse bootstrap peers from configured indexer addresses so the DHT can
	// find its routing table entries even in a fresh deployment.
	var bootstrapPeers []pp.AddrInfo
	for _, addrStr := range strings.Split(conf.GetConfig().NativeIndexerAddresses+","+conf.GetConfig().IndexerAddresses, ",") {
	for _, addrStr := range strings.Split(conf.GetConfig().IndexerAddresses, ",") {
		addrStr = strings.TrimSpace(addrStr)
		if addrStr == "" {
			continue
@@ -75,12 +141,11 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
	}
	dhtOpts := []dht.Option{
		dht.Mode(dht.ModeServer),
		dht.ProtocolPrefix("oc"), // private network
		dht.ProtocolPrefix("oc"),
		dht.Validator(record.NamespacedValidator{
			"node":    PeerRecordValidator{},
			"indexer": IndexerRecordValidator{}, // for native indexer registry
			"name":    DefaultValidator{},
			"pid":     DefaultValidator{},
			"node": PeerRecordValidator{},
			"name": DefaultValidator{},
			"pid":  DefaultValidator{},
		}),
	}
	if len(bootstrapPeers) > 0 {
@@ -91,44 +156,336 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
		return nil
	}

	// InitNative must happen after DHT is ready
	if isNative {
		ix.InitNative()
	} else {
		ix.initNodeHandler()
		// Register with configured natives so this indexer appears in their cache.
		// Pass a fill rate provider so the native can route new nodes to less-loaded indexers.
		if nativeAddrs := conf.GetConfig().NativeIndexerAddresses; nativeAddrs != "" {
			fillRateFn := func() float64 {
				ix.StreamMU.RLock()
				n := len(ix.StreamRecords[common.ProtocolHeartbeat])
				ix.StreamMU.RUnlock()
				maxN := ix.MaxNodesConn()
				if maxN <= 0 {
					return 0
				}
				rate := float64(n) / float64(maxN)
				if rate > 1 {
					rate = 1
				}
				return rate
			}
			common.StartNativeRegistration(ix.Host, nativeAddrs, fillRateFn)
	// Make the DHT available for replenishment from other packages.
	common.SetDiscoveryDHT(ix.DHT)

	ix.bornAt = time.Now().UTC()
	ix.offload.inBatch = make(map[pp.ID]time.Time)
	ix.offload.alreadyTried = make(map[pp.ID]struct{})
	ix.initNodeHandler()

	// Build and send a HeartbeatResponse after each received node heartbeat.
	// Raw metrics only — no pre-cooked score. Node computes the score itself.
	ix.BuildHeartbeatResponse = func(remotePeer pp.ID, need int, challenges []string, challengeDID string, referent bool) *common.HeartbeatResponse {
		ix.StreamMU.RLock()
		peerCount := len(ix.StreamRecords[common.ProtocolHeartbeat])
		// Collect lastSeen per active peer for challenge responses.
		type peerMeta struct {
			found    bool
			lastSeen time.Time
		}
		peerLookup := make(map[string]peerMeta, peerCount)
		var remotePeerRecord PeerRecord
		for pid, rec := range ix.StreamRecords[common.ProtocolHeartbeat] {
			var ls time.Time
			if rec.HeartbeatStream != nil && rec.HeartbeatStream.UptimeTracker != nil {
				ls = rec.HeartbeatStream.UptimeTracker.LastSeen
			}
			peerLookup[pid.String()] = peerMeta{found: true, lastSeen: ls}
			if pid == remotePeer {
				remotePeerRecord = rec.Record
			}
		}
		ix.StreamMU.RUnlock()

		// Update referent designation: node marks its best-scored indexer with Referent=true.
		ix.updateReferent(remotePeer, remotePeerRecord, referent)

		maxN := ix.MaxNodesConn()
		fillRate := 0.0
		if maxN > 0 {
			fillRate = float64(peerCount) / float64(maxN)
			if fillRate > 1 {
				fillRate = 1
			}
		}

		resp := &common.HeartbeatResponse{
			FillRate:  fillRate,
			PeerCount: peerCount,
			MaxNodes:  maxN,
			BornAt:    ix.bornAt,
		}

		// Answer each challenged PeerID with raw found + lastSeen.
		for _, pidStr := range challenges {
			meta := peerLookup[pidStr] // zero value if not found
			entry := common.ChallengeEntry{
				PeerID:   pidStr,
				Found:    meta.found,
				LastSeen: meta.lastSeen,
			}
			resp.Challenges = append(resp.Challenges, entry)
		}

		// DHT challenge: retrieve the node's own DID to prove DHT is functional.
		if challengeDID != "" {
			ctx3, cancel3 := context.WithTimeout(context.Background(), 3*time.Second)
			val, err := ix.DHT.GetValue(ctx3, "/node/"+challengeDID)
			cancel3()
			resp.DHTFound = err == nil
			if err == nil {
				resp.DHTPayload = json.RawMessage(val)
			}
		}

		// Random sample of connected nodes as witnesses (up to 3).
		// Never include the requesting peer itself — asking a node to witness
		// itself is circular and meaningless.
		ix.StreamMU.RLock()
		for pidStr := range peerLookup {
			if len(resp.Witnesses) >= 3 {
				break
			}
			pid, err := pp.Decode(pidStr)
			if err != nil || pid == remotePeer || pid == ix.Host.ID() {
				continue
			}
			addrs := ix.Host.Peerstore().Addrs(pid)
			ai := common.FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
			if len(ai.Addrs) > 0 {
				resp.Witnesses = append(resp.Witnesses, ai)
			}
		}
		ix.StreamMU.RUnlock()

		// Attach suggestions: exactly `need` entries from the DHT cache.
		// If the indexer is overloaded (SuggestMigrate will be set below), always
		// provide at least 1 suggestion even when need == 0, so the node has
		// somewhere to go.
		suggestionsNeeded := need
		if fillRate > offloadThreshold && suggestionsNeeded < 1 {
			suggestionsNeeded = 1
		}
		if suggestionsNeeded > 0 {
			ix.dhtCacheMu.RLock()
			// When offloading, pick from a random offset within the top N of the
			// cache so concurrent migrations spread across multiple targets rather
			// than all rushing to the same least-loaded indexer (thundering herd).
			// For normal need-based suggestions the full sorted order is fine.
			cache := ix.dhtCache
			if fillRate > offloadThreshold && len(cache) > suggestionsNeeded {
				const spreadWindow = 5 // sample from the top-5 least-loaded
				window := spreadWindow
				if window > len(cache) {
					window = len(cache)
				}
				start := rand.Intn(window)
				cache = cache[start:]
			}
			for _, e := range cache {
				if len(resp.Suggestions) >= suggestionsNeeded {
					break
				}
				// Never suggest the requesting peer itself or this indexer.
				if e.AI.ID == remotePeer || e.AI.ID == h.ID() {
					continue
				}
				resp.Suggestions = append(resp.Suggestions, e.AI)
			}
			ix.dhtCacheMu.RUnlock()
		}

		// Offload logic: when fill rate is too high, selectively ask nodes to migrate.
		if fillRate > offloadThreshold && len(resp.Suggestions) > 0 {
			now := time.Now()
			ix.offload.mu.Lock()
			// Expire stale batch entries -> move to alreadyTried.
			for pid, addedAt := range ix.offload.inBatch {
				if now.Sub(addedAt) > offloadGracePeriod {
					ix.offload.alreadyTried[pid] = struct{}{}
					delete(ix.offload.inBatch, pid)
				}
			}
			// Reset alreadyTried if we've exhausted the whole pool.
			if len(ix.offload.alreadyTried) >= peerCount {
				ix.offload.alreadyTried = make(map[pp.ID]struct{})
			}
			_, tried := ix.offload.alreadyTried[remotePeer]
			_, inBatch := ix.offload.inBatch[remotePeer]
			if !tried {
				if inBatch {
					resp.SuggestMigrate = true
				} else if len(ix.offload.inBatch) < offloadBatchSize {
					ix.offload.inBatch[remotePeer] = now
					resp.SuggestMigrate = true
				}
			}
			ix.offload.mu.Unlock()
		} else if fillRate <= offloadThreshold {
			// Fill rate back to normal: reset offload state.
			ix.offload.mu.Lock()
			if len(ix.offload.inBatch) > 0 || len(ix.offload.alreadyTried) > 0 {
				ix.offload.inBatch = make(map[pp.ID]time.Time)
				ix.offload.alreadyTried = make(map[pp.ID]struct{})
			}
			ix.offload.mu.Unlock()
		}

		// Bootstrap: if this indexer has no indexers of its own, probe the
		// connecting peer to check it supports ProtocolHeartbeat (i.e. it is
		// itself an indexer). Plain nodes do not register the handler and the
		// negotiation fails instantly — no wasted heartbeat cycle.
		// Run in a goroutine: the probe is a short blocking stream open.
		if len(common.Indexers.GetAddrs()) == 0 && remotePeer != h.ID() {
			pid := remotePeer
			go func() {
				if !common.SupportsHeartbeat(h, pid) {
					logger.Debug().Str("peer", pid.String()).
						Msg("[bootstrap] inbound peer has no heartbeat handler — not an indexer, skipping")
					return
				}
				addrs := h.Peerstore().Addrs(pid)
				ai := common.FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
				if len(ai.Addrs) == 0 {
					return
				}
				key := pid.String()
				if !common.Indexers.ExistsAddr(key) {
					adCopy := ai
					common.Indexers.SetAddr(key, &adCopy)
					common.Indexers.NudgeIt()
					logger.Info().Str("peer", key).Msg("[bootstrap] no indexers — added inbound indexer peer as candidate")
				}
			}()
		}

		return resp
	}

	// Advertise this indexer in the DHT so nodes can discover it.
	fillRateFn := func() float64 {
		ix.StreamMU.RLock()
		n := len(ix.StreamRecords[common.ProtocolHeartbeat])
		ix.StreamMU.RUnlock()
		maxN := ix.MaxNodesConn()
		if maxN <= 0 {
			return 0
		}
		rate := float64(n) / float64(maxN)
		if rate > 1 {
			rate = 1
		}
		return rate
	}
	ix.startDHTCacheRefresh()
	ix.startDHTProvide(fillRateFn)

	return ix
}

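The HeartbeatResponse built above deliberately carries raw metrics only; how a node turns them into a score is left to the node side, which is not shown in this commit. One possible weighting, purely illustrative (the name scoreIndexer and the weights are invented, only the response fields come from the code above):

// scoreIndexer is a hypothetical node-side heuristic: favour indexers with
// headroom, some age, and correct answers to the challenged peer IDs.
func scoreIndexer(r common.HeartbeatResponse) float64 {
	score := (1 - r.FillRate) * 60 // headroom dominates
	if time.Since(r.BornAt) > 24*time.Hour {
		score += 20 // long-lived indexers are less likely to churn
	}
	ok := 0
	for _, c := range r.Challenges {
		if c.Found {
			ok++
		}
	}
	if n := len(r.Challenges); n > 0 {
		score += 20 * float64(ok) / float64(n)
	}
	return score // 0..100, larger is better
}
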
func (ix *IndexerService) Close() {
	if ix.Native != nil && ix.Native.cancel != nil {
		ix.Native.cancel()
	}
	// Explicitly deregister from natives on clean shutdown so they evict this
	// indexer immediately rather than waiting for TTL expiry (~90 s).
	if !ix.IsNative {
		if nativeAddrs := conf.GetConfig().NativeIndexerAddresses; nativeAddrs != "" {
			common.UnregisterFromNative(ix.Host, nativeAddrs)
// startDHTCacheRefresh periodically queries the DHT for peer indexers and
// refreshes ix.dhtCache. This passive cache is used by BuildHeartbeatResponse
// to suggest better indexers to connected nodes without any per-request cost.
func (ix *IndexerService) startDHTCacheRefresh() {
	ctx, cancel := context.WithCancel(context.Background())
	// Store cancel alongside the provide cancel so Close() stops both.
	prevCancel := ix.dhtProvideCancel
	ix.dhtProvideCancel = func() {
		if prevCancel != nil {
			prevCancel()
		}
		cancel()
	}
	go func() {
		logger := oclib.GetLogger()
		refresh := func() {
			if ix.DHT == nil {
				return
			}
			// Fetch more than needed so SelectByFillRate can filter for diversity.
			raw := common.DiscoverIndexersFromDHT(ix.Host, ix.DHT, 30)
			if len(raw) == 0 {
				return
			}
			// Remove self before selection.
			filtered := raw[:0]
			for _, ai := range raw {
				if ai.ID != ix.Host.ID() {
					filtered = append(filtered, ai)
				}
			}
			// SelectByFillRate applies /24 subnet diversity and fill-rate weighting.
			// Fill rates are unknown at this stage (nil map) so all peers get
			// the neutral prior f=0.5 — diversity filtering still applies.
			selected := common.SelectByFillRate(filtered, nil, 10)
			now := time.Now()
			ix.dhtCacheMu.Lock()
			ix.dhtCache = ix.dhtCache[:0]
			for _, ai := range selected {
				ix.dhtCache = append(ix.dhtCache, dhtCacheEntry{AI: ai, LastSeen: now})
			}
			ix.dhtCacheMu.Unlock()
			logger.Info().Int("cached", len(selected)).Msg("[dht] indexer suggestion cache refreshed")
		}
		// Initial delay: let the DHT routing table warm up first.
		select {
		case <-time.After(30 * time.Second):
		case <-ctx.Done():
			return
		}
		refresh()
		t := time.NewTicker(2 * time.Minute)
		defer t.Stop()
		for {
			select {
			case <-t.C:
				refresh()
			case <-ctx.Done():
				return
			}
		}
	}()
}

// startDHTProvide bootstraps the DHT and starts a goroutine that periodically
// advertises this indexer under the well-known provider key.
func (ix *IndexerService) startDHTProvide(fillRateFn func() float64) {
	ctx, cancel := context.WithCancel(context.Background())
	// Chain with any cancel already registered (startDHTCacheRefresh runs first);
	// assigning directly here would orphan the cache-refresh goroutine on Close().
	prevCancel := ix.dhtProvideCancel
	ix.dhtProvideCancel = func() {
		if prevCancel != nil {
			prevCancel()
		}
		cancel()
	}
	go func() {
		logger := oclib.GetLogger()
		// Wait until a routable (non-loopback) address is available.
		for i := 0; i < 12; i++ {
			addrs := ix.Host.Addrs()
			if len(addrs) > 0 && !strings.Contains(addrs[len(addrs)-1].String(), "127.0.0.1") {
				break
			}
			select {
			case <-ctx.Done():
				return
			case <-time.After(5 * time.Second):
			}
		}
		if err := ix.DHT.Bootstrap(ctx); err != nil {
			logger.Warn().Err(err).Msg("[dht] bootstrap failed")
		}
		provide := func() {
			pCtx, pCancel := context.WithTimeout(ctx, 30*time.Second)
			defer pCancel()
			if err := ix.DHT.Provide(pCtx, common.IndexerCID(), true); err != nil {
				logger.Warn().Err(err).Msg("[dht] Provide failed")
			} else {
				logger.Info().Float64("fill_rate", fillRateFn()).Msg("[dht] indexer advertised in DHT")
			}
		}
		provide()
		t := time.NewTicker(common.RecommendedHeartbeatInterval)
		defer t.Stop()
		for {
			select {
			case <-t.C:
				provide()
			case <-ctx.Done():
				return
			}
		}
	}()
}

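On the consumer side, DiscoverIndexersFromDHT (used by the cache refresh above) presumably walks the same provider key; a minimal sketch of that lookup with the standard kad-dht API, shown only to illustrate the Provide/FindProviders pairing (the helper name findIndexerProviders is invented):

// findIndexerProviders collects up to max indexers advertised under
// common.IndexerCID(), the key written by startDHTProvide above.
func findIndexerProviders(ctx context.Context, d *dht.IpfsDHT, max int) []pp.AddrInfo {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	var out []pp.AddrInfo
	for ai := range d.FindProvidersAsync(ctx, common.IndexerCID(), max) {
		if len(ai.Addrs) > 0 {
			out = append(out, ai)
		}
	}
	return out
}
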
func (ix *IndexerService) Close() {
	if ix.dhtProvideCancel != nil {
		ix.dhtProvideCancel()
	}
	ix.DHT.Close()
	ix.PS.UnregisterTopicValidator(common.TopicPubSubSearch)