Simple Architecture

This commit is contained in:
mr
2026-03-11 16:28:15 +01:00
parent 83cef6e6f6
commit d0af40f4c7
55 changed files with 4037 additions and 3306 deletions

View File

@@ -0,0 +1,331 @@
package common
import (
"errors"
"sync"
"time"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// Score tracks the node-side quality assessment of one indexer, accumulating
// raw metrics across heartbeat ticks; ComputeNodeSideScore folds them into a
// single [0,100] value.
type Score struct {
	FirstContacted time.Time      // when this indexer first entered the pool
	UptimeTracker  *UptimeTracker // gap-aware reachability tracking
	LastFillRate   float64        // most recent fill rate reported by the indexer
	Score          float64        // last computed composite score [0,100]
	// IsSeed marks indexers that came from the IndexerAddresses static config.
	// Seeds are sticky: they are never evicted by the score threshold alone.
	// A seed is only removed when: (a) heartbeat fails, or (b) it sends
	// SuggestMigrate and the node already has MinIndexer non-seed alternatives.
	IsSeed bool
	// challenge bookkeeping (2-3 peers per batch, raw data returned by indexer)
	hbCount          int // heartbeats sent since last challenge batch
	nextChallenge    int // send challenges when hbCount reaches this (rand 1-10)
	challengeTotal   int // number of own-PeerID challenges sent (ground truth)
	challengeCorrect int // own PeerID found AND lastSeen within 2×interval
	// fill rate consistency: cross-check reported fillRate vs peerCount/maxNodes
	fillChecked    int
	fillConsistent int
	// BornAt stability
	LastBornAt    time.Time
	bornAtChanges int
	// DHT challenge
	dhtChecked      int
	dhtSuccess      int
	dhtBatchCounter int
	// Peer witnesses
	witnessChecked    int
	witnessConsistent int
	// WitnessPool: up to 3 witnesses last reported by this indexer.
	// Used for indirect probing when the indexer becomes unreachable.
	// Oldest entry is replaced when the pool is full and a fresher witness arrives.
	WitnessPool []WitnessCacheEntry
}
// maxWitnessPool caps the number of witnesses kept per indexer in Score.WitnessPool.
const maxWitnessPool = 3

// WitnessCacheEntry holds one witness AddrInfo with its last-seen timestamp.
type WitnessCacheEntry struct {
	AI     pp.AddrInfo
	SeenAt time.Time
}
// UpdateWitnessPool inserts or refreshes a witness entry.
// A witness already present is refreshed in place; otherwise it is appended,
// or — when the pool is at capacity — it overwrites the stalest slot.
func (s *Score) UpdateWitnessPool(w pp.AddrInfo) {
	// Refresh in place if we already track this witness.
	for idx := range s.WitnessPool {
		if s.WitnessPool[idx].AI.ID == w.ID {
			s.WitnessPool[idx].AI = w
			s.WitnessPool[idx].SeenAt = time.Now()
			return
		}
	}
	fresh := WitnessCacheEntry{AI: w, SeenAt: time.Now()}
	if len(s.WitnessPool) < maxWitnessPool {
		s.WitnessPool = append(s.WitnessPool, fresh)
		return
	}
	// Pool full: find and replace the entry with the oldest SeenAt.
	victim := 0
	for idx := range s.WitnessPool {
		if s.WitnessPool[idx].SeenAt.Before(s.WitnessPool[victim].SeenAt) {
			victim = idx
		}
	}
	s.WitnessPool[victim] = fresh
}
// ComputeNodeSideScore computes the node's quality assessment of an indexer from raw metrics.
// All ratios are in [0,1]; result is in [0,100].
//   - uptimeRatio      : gap-aware fraction of lifetime the indexer was reachable
//   - challengeAccuracy: own-PeerID challenges answered correctly (found + recent lastSeen)
//   - latencyScore     : 1 - RTT/maxRTT, clamped [0,1]
//   - fillScore        : 1 - fillRate — prefer less-loaded indexers
//   - fillConsistency  : fraction of ticks where peerCount/maxNodes ≈ fillRate (±10%)
//
// Each unexpected BornAt change applies a multiplicative 30% penalty.
func (s *Score) ComputeNodeSideScore(latencyScore float64) float64 {
	// Ratios default to a perfect 1.0 until at least one sample exists.
	ratio := func(good, total int) float64 {
		if total > 0 {
			return float64(good) / float64(total)
		}
		return 1.0
	}
	uptime := s.UptimeTracker.UptimeRatio()
	challengeAccuracy := ratio(s.challengeCorrect, s.challengeTotal)
	fillScore := 1.0 - s.LastFillRate
	fillConsistency := ratio(s.fillConsistent, s.fillChecked)
	witnessConsistency := ratio(s.witnessConsistent, s.witnessChecked)
	dhtSuccessRate := ratio(s.dhtSuccess, s.dhtChecked)
	base := ((0.20 * uptime) +
		(0.20 * challengeAccuracy) +
		(0.15 * latencyScore) +
		(0.10 * fillScore) +
		(0.10 * fillConsistency) +
		(0.15 * witnessConsistency) +
		(0.10 * dhtSuccessRate)) * 100
	// BornAt stability: each unexpected BornAt change penalises by 30%,
	// floored at zero.
	bornAtPenalty := 1.0 - 0.30*float64(s.bornAtChanges)
	if bornAtPenalty < 0 {
		bornAtPenalty = 0
	}
	return base * bornAtPenalty
}
// Directory is the node's view of its indexer pool: peer addresses, per-peer
// scores, and open protocol streams, each guarded by its own RWMutex.
type Directory struct {
	MuAddr   sync.RWMutex // guards Addrs
	MuScore  sync.RWMutex // guards Scores
	MuStream sync.RWMutex // guards Streams
	Addrs    map[string]*pp.AddrInfo
	Scores   map[string]*Score
	Nudge    chan struct{} // buffered(1) signal to trigger an immediate heartbeat tick
	Streams  ProtocolStream
}
// ExistsScore reports whether a non-nil Score entry exists under key a.
func (d *Directory) ExistsScore(a string) bool {
	d.MuScore.RLock()
	defer d.MuScore.RUnlock()
	// a is the map key itself — a direct lookup replaces the previous
	// full-map scan (same result, O(1) instead of O(n)).
	s, ok := d.Scores[a]
	return ok && s != nil
}
// GetScore returns a shallow copy of the Score stored under key a, or nil if
// absent. Callers receive a copy: mutating the result does NOT update the
// directory (use SetScore to persist changes).
func (d *Directory) GetScore(a string) *Score {
	d.MuScore.RLock()
	defer d.MuScore.RUnlock()
	// a is the map key itself — direct lookup instead of a full-map scan.
	if s, ok := d.Scores[a]; ok && s != nil {
		sCopy := *s
		return &sCopy
	}
	return nil
}
// GetScores returns a snapshot copy of the score map. The map itself is
// fresh, but the *Score values are shared with the directory.
func (d *Directory) GetScores() map[string]*Score {
	d.MuScore.RLock()
	defer d.MuScore.RUnlock()
	snapshot := make(map[string]*Score, len(d.Scores))
	for key, val := range d.Scores {
		snapshot[key] = val
	}
	return snapshot
}
// DeleteScore removes the Score entry stored under key a, if any.
func (d *Directory) DeleteScore(a string) {
	// Fix: this mutates d.Scores, so it must take the write lock. The
	// previous version rebuilt the map while holding only RLock, racing
	// with concurrent writers. A plain delete under Lock replaces the
	// copy-everything-but-one rebuild.
	d.MuScore.Lock()
	defer d.MuScore.Unlock()
	delete(d.Scores, a)
}
// SetScore stores score under addr. The *pp.AddrInfo return value is always
// nil; it exists for signature symmetry with SetAddr.
func (d *Directory) SetScore(addr string, score *Score) *pp.AddrInfo {
	d.MuScore.Lock()
	d.Scores[addr] = score
	d.MuScore.Unlock()
	return nil
}
// ExistsAddr reports whether addrOrId matches any known entry, either by
// map key or by the peer's ID string (hence the full scan).
func (d *Directory) ExistsAddr(addrOrId string) bool {
	d.MuAddr.RLock()
	defer d.MuAddr.RUnlock()
	for key, info := range d.Addrs {
		if info == nil {
			continue
		}
		if addrOrId == key || addrOrId == info.ID.String() {
			return true
		}
	}
	return false
}
// GetAddr returns a shallow copy of the AddrInfo matching addrOrId (by map
// key or peer ID string), or nil when no entry matches.
func (d *Directory) GetAddr(addrOrId string) *pp.AddrInfo {
	d.MuAddr.RLock()
	defer d.MuAddr.RUnlock()
	for key, info := range d.Addrs {
		if info == nil {
			continue
		}
		if addrOrId == key || addrOrId == info.ID.String() {
			cpy := *info
			return &cpy
		}
	}
	return nil
}
// DeleteAddr removes the AddrInfo entry stored under key a, if any.
func (d *Directory) DeleteAddr(a string) {
	// Fix: this mutates d.Addrs, so it must take the write lock. The
	// previous version rebuilt the map while holding only RLock, racing
	// with concurrent writers. A plain delete under Lock replaces the
	// copy-everything-but-one rebuild.
	d.MuAddr.Lock()
	defer d.MuAddr.Unlock()
	delete(d.Addrs, a)
}
// SetAddr stores info under addr. The return value is always nil; the
// signature mirrors SetScore.
func (d *Directory) SetAddr(addr string, info *pp.AddrInfo) *pp.AddrInfo {
	d.MuAddr.Lock()
	d.Addrs[addr] = info
	d.MuAddr.Unlock()
	return nil
}
// GetAddrIDs returns the peer IDs of all non-nil entries, in shuffled order.
func (d *Directory) GetAddrIDs() []pp.ID {
	d.MuAddr.RLock()
	defer d.MuAddr.RUnlock()
	ids := make([]pp.ID, 0, len(d.Addrs))
	for _, info := range d.Addrs {
		if info == nil {
			continue
		}
		ids = append(ids, info.ID)
	}
	return Shuffle(ids)
}
// GetAddrsStr returns the address keys of all non-nil entries, shuffled.
func (d *Directory) GetAddrsStr() []string {
	d.MuAddr.RLock()
	defer d.MuAddr.RUnlock()
	keys := make([]string, 0, len(d.Addrs))
	for key, info := range d.Addrs {
		if info == nil {
			continue
		}
		keys = append(keys, key)
	}
	return Shuffle(keys)
}
// Entry pairs a directory address key with its AddrInfo, for snapshot
// iteration via GetAddrs.
type Entry struct {
	Addr string
	Info *pp.AddrInfo
}
// GetAddrs returns a shuffled snapshot of all non-nil entries as Entry pairs.
// Note: the Info pointers are shared with the directory, not copies.
func (d *Directory) GetAddrs() []Entry {
	d.MuAddr.RLock()
	defer d.MuAddr.RUnlock()
	entries := make([]Entry, 0, len(d.Addrs))
	for key, info := range d.Addrs {
		if info == nil {
			continue
		}
		entries = append(entries, Entry{Addr: key, Info: info})
	}
	return Shuffle(entries)
}
// NudgeIt signals the indexer heartbeat goroutine to fire immediately.
// Non-blocking: if a nudge is already pending in the buffered channel,
// the signal is dropped (one pending nudge is enough).
func (d *Directory) NudgeIt() {
	select {
	case d.Nudge <- struct{}{}:
	default: // nudge already pending, skip
	}
}
// ProtocolStream maps protocol IDs to per-peer open streams.
type ProtocolStream map[protocol.ID]map[pp.ID]*Stream

// Get returns the per-peer stream map for protocol, creating it lazily.
func (ps ProtocolStream) Get(protocol protocol.ID) map[pp.ID]*Stream {
	m, ok := ps[protocol]
	if !ok || m == nil {
		m = map[pp.ID]*Stream{}
		ps[protocol] = m
	}
	return m
}
// GetPerID returns the stream registered for peerID under protocol,
// or nil if none exists. The protocol's map is created lazily.
func (ps ProtocolStream) GetPerID(protocol protocol.ID, peerID pp.ID) *Stream {
	return ps.Get(protocol)[peerID]
}
// Add registers stream s for peerID under protocol. A nil peerID is a
// silent no-op (only the lazy map creation happens); a non-nil peerID with
// a nil stream is an error.
func (ps ProtocolStream) Add(protocol protocol.ID, peerID *pp.ID, s *Stream) error {
	m := ps.Get(protocol)
	if peerID == nil {
		return nil
	}
	if s == nil {
		return errors.New("unable to add stream : stream missing")
	}
	m[*peerID] = s
	return nil
}
// Delete closes and removes the stream for peerID under protocol. When
// peerID is nil or no matching live stream exists, every stream registered
// under that protocol is closed and the protocol entry is dropped.
func (ps ProtocolStream) Delete(protocol protocol.ID, peerID *pp.ID) {
	streams, ok := ps[protocol]
	if !ok {
		return
	}
	if peerID != nil && streams[*peerID] != nil && streams[*peerID].Stream != nil {
		streams[*peerID].Stream.Close()
		delete(streams, *peerID)
		return
	}
	// Fix: only close streams belonging to this protocol. The previous
	// fallback iterated the whole ProtocolStream map and closed every
	// stream of every protocol, while deleting only the requested entry.
	for _, v := range streams {
		if v != nil && v.Stream != nil {
			v.Stream.Close()
		}
	}
	delete(ps, protocol)
}
// Indexers is the process-wide directory of indexer peers this node
// heartbeats. All maps start empty; Nudge is buffered(1) so a pending
// nudge coalesces with later ones.
var Indexers = &Directory{
	Addrs:   map[string]*pp.AddrInfo{},
	Scores:  map[string]*Score{},
	Nudge:   make(chan struct{}, 1),
	Streams: ProtocolStream{},
}

View File

@@ -0,0 +1,295 @@
package common
import (
"context"
"encoding/json"
"io"
"time"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
oclib "cloud.o-forge.io/core/oc-lib"
)
// Heartbeat is the message a node (or indexer) sends periodically to each
// indexer in its pool.
type Heartbeat struct {
	Name           string   `json:"name"`
	Stream         *Stream  `json:"stream"`
	DID            string   `json:"did"`
	PeerID         string   `json:"peer_id"`
	Timestamp      int64    `json:"timestamp"`
	IndexersBinded []string `json:"indexers_binded"`
	// Score is the sender-side composite score.
	// NOTE(review): no json tag — serializes as "Score"; confirm intended.
	Score float64
	// Record carries a fresh signed PeerRecord (JSON) so the receiving indexer
	// can republish it to the DHT without an extra round-trip.
	// Only set by nodes (not indexers heartbeating other indexers).
	Record json.RawMessage `json:"record,omitempty"`
	// Need is how many more indexers this node wants (MaxIndexer - current pool size).
	// The receiving indexer uses this to know how many suggestions to return.
	// 0 means the pool is full — no suggestions needed unless SuggestMigrate.
	Need int `json:"need,omitempty"`
	// Challenges is a list of PeerIDs the node asks the indexer to spot-check.
	// Always includes the node's own PeerID (ground truth) + up to 2 additional
	// known peers. Nil means no challenge this tick.
	Challenges []string `json:"challenges,omitempty"`
	// ChallengeDID asks the indexer to retrieve this DID from the DHT (every 5th batch).
	ChallengeDID string `json:"challenge_did,omitempty"`
	// Referent marks this indexer as the node's designated search referent.
	// Only one indexer per node receives Referent=true at a time (the best-scored one).
	// The indexer stores the node in its referencedNodes for distributed search.
	Referent bool `json:"referent,omitempty"`
}
// SearchPeerRequest is sent by a node to an indexer via ProtocolSearchPeer.
// The indexer broadcasts it on the GossipSub search mesh and streams results back.
type SearchPeerRequest struct {
	QueryID string `json:"query_id"` // correlates request and streamed results
	// At least one of PeerID, DID, Name must be set.
	PeerID string `json:"peer_id,omitempty"`
	DID    string `json:"did,omitempty"`
	Name   string `json:"name,omitempty"`
}
// SearchQuery is broadcast on TopicSearchPeer by the receiving indexer.
// EmitterID is the indexer's own PeerID — responding indexers open a
// ProtocolSearchPeerResponse stream back to it.
type SearchQuery struct {
	QueryID   string `json:"query_id"`
	PeerID    string `json:"peer_id,omitempty"`
	DID       string `json:"did,omitempty"`
	Name      string `json:"name,omitempty"`
	EmitterID string `json:"emitter_id"`
}
// SearchPeerResult is sent by a responding indexer to the emitting indexer
// via ProtocolSearchPeerResponse, and forwarded by the emitting indexer to
// the node on the open ProtocolSearchPeer stream.
type SearchPeerResult struct {
	QueryID string      `json:"query_id"` // matches SearchQuery.QueryID
	Records []SearchHit `json:"records"`
}
// SearchHit is a single peer found during distributed search.
type SearchHit struct {
	PeerID string `json:"peer_id"`
	DID    string `json:"did"`
	Name   string `json:"name"`
}
// ChallengeEntry is the indexer's raw answer for one challenged peer.
type ChallengeEntry struct {
	PeerID   string    `json:"peer_id"`
	Found    bool      `json:"found"`
	LastSeen time.Time `json:"last_seen,omitempty"` // zero if not found
}
// HeartbeatResponse carries raw metrics only — no pre-cooked score.
// The node computes its own view (ComputeNodeSideScore) from these values.
type HeartbeatResponse struct {
	FillRate  float64 `json:"fill_rate"`
	PeerCount int     `json:"peer_count"`
	MaxNodes  int     `json:"max_nodes"` // capacity — lets node cross-check fillRate
	BornAt    time.Time `json:"born_at"` // fixed start timestamp; changes imply restart/impersonation
	Challenges []ChallengeEntry `json:"challenges,omitempty"`
	// DHTFound / DHTPayload: response to a ChallengeDID request.
	DHTFound   bool            `json:"dht_found,omitempty"`
	DHTPayload json.RawMessage `json:"dht_payload,omitempty"`
	// Witnesses: random sample of connected nodes so the querying node can cross-check.
	Witnesses []pp.AddrInfo `json:"witnesses,omitempty"`
	// Suggestions: better indexers this indexer knows about via its DHT cache.
	// The node should open heartbeat connections to these (they become StaticIndexers).
	Suggestions []pp.AddrInfo `json:"suggestions,omitempty"`
	// SuggestMigrate: set when this indexer is overloaded (fill rate > threshold)
	// and is actively trying to hand the node off to the Suggestions list.
	// Seeds: node de-stickies this indexer once it has MinIndexer non-seed alternatives.
	// Non-seeds: node removes this indexer immediately if it has enough alternatives.
	SuggestMigrate bool `json:"suggest_migrate,omitempty"`
}
// ComputeIndexerScore computes a composite quality score [0, 100] for the connecting peer.
//   - uptimeRatio:  fraction of tracked lifetime online (gap-aware) — peer reliability
//   - bpms:         bandwidth normalized to MaxExpectedMbps — link capacity
//   - diversity:    indexer's own /24 subnet diversity — network topology quality
//   - latencyScore: 1 - RTT/maxRoundTrip — link responsiveness
//   - fillRate:     fraction of indexer slots used (0=empty, 1=full) — collective trust
//     signal: a fuller indexer has been chosen and retained by many peers.
//
// The result is stored in hb.Score.
func (hb *Heartbeat) ComputeIndexerScore(uptimeRatio float64, bpms float64, diversity float64, latencyScore float64, fillRate float64) {
	weighted := (0.20 * uptimeRatio) +
		(0.20 * bpms) +
		(0.20 * diversity) +
		(0.15 * latencyScore) +
		(0.25 * fillRate)
	hb.Score = weighted * 100
}
// HeartbeatInfo is a batch of opaque, serialized heartbeat payloads.
type HeartbeatInfo []struct {
	Info []byte `json:"info"`
}
// WitnessRequest is sent by a node to a peer to ask its view of a given indexer.
type WitnessRequest struct {
	IndexerPeerID string `json:"indexer_peer_id"`
}
// WitnessReport is returned by a peer in response to a WitnessRequest.
// All fields except Seen are only meaningful when Seen is true.
type WitnessReport struct {
	Seen     bool      `json:"seen"`
	BornAt   time.Time `json:"born_at,omitempty"`
	FillRate float64   `json:"fill_rate,omitempty"`
	Score    float64   `json:"score,omitempty"`
}
// HandleBandwidthProbe echoes back everything written on the stream, then closes.
// It is registered by all participants so the measuring side (the heartbeat
// receiver) can open a dedicated probe stream and read the round-trip
// latency + throughput. The stream is bounded by a 10s deadline.
func HandleBandwidthProbe(s network.Stream) {
	defer s.Close()
	_ = s.SetDeadline(time.Now().Add(10 * time.Second))
	_, _ = io.Copy(s, s) // mirror every byte back to the sender
}
// HandleWitnessQuery answers a witness query: the caller wants to know
// what this node thinks of a given indexer (identified by its PeerID).
// A zero-valued report (Seen=false) is returned when the indexer is unknown.
func HandleWitnessQuery(h host.Host, s network.Stream) {
	defer s.Close()
	_ = s.SetDeadline(time.Now().Add(5 * time.Second))
	var req WitnessRequest
	if err := json.NewDecoder(s).Decode(&req); err != nil {
		return
	}
	var report WitnessReport
	for _, entry := range Indexers.GetAddrs() {
		if entry.Info == nil || entry.Info.ID.String() != req.IndexerPeerID {
			continue
		}
		if sc := Indexers.GetScore(addrKey(*entry.Info)); sc != nil {
			report = WitnessReport{
				Seen:     true,
				BornAt:   sc.LastBornAt,
				FillRate: sc.LastFillRate,
				Score:    sc.Score,
			}
		}
		// First ID match settles the answer either way.
		break
	}
	_ = json.NewEncoder(s).Encode(report)
}
// IndirectProbeIndexer asks each witness in the cache whether it still sees
// the given indexer (by PeerID). Returns true if at least one witness confirms
// it is alive — meaning our direct link is asymmetrically broken, not the indexer.
// All probes run in parallel; the function blocks at most 5 seconds.
func IndirectProbeIndexer(h host.Host, indexerPeerID string, pool []WitnessCacheEntry) bool {
	if len(pool) == 0 {
		return false
	}
	// Buffered to len(pool): goroutines never block on send, so returning
	// early on the first positive answer cannot leak them.
	results := make(chan bool, len(pool))
	for _, e := range pool {
		go func(ai pp.AddrInfo) {
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			s, err := h.NewStream(ctx, ai.ID, ProtocolWitnessQuery)
			if err != nil {
				results <- false
				return
			}
			// Reset (not Close) tears the stream down abruptly; fine for a probe.
			defer s.Reset()
			s.SetDeadline(time.Now().Add(5 * time.Second))
			if err := json.NewEncoder(s).Encode(WitnessRequest{IndexerPeerID: indexerPeerID}); err != nil {
				results <- false
				return
			}
			var rep WitnessReport
			if err := json.NewDecoder(s).Decode(&rep); err != nil {
				results <- false
				return
			}
			results <- rep.Seen
		}(e.AI)
	}
	// Short-circuit on the first confirmation.
	for range pool {
		if <-results {
			return true
		}
	}
	return false
}
// SupportsHeartbeat probes pid with a short-lived stream to verify it has
// a ProtocolHeartbeat handler (i.e. it is an indexer, not a plain node).
// Only protocol negotiation is performed — no data is sent.
// Returns false on any error, including "protocol not supported".
func SupportsHeartbeat(h host.Host, pid pp.ID) bool {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()
	probe, err := h.NewStream(ctx, pid, ProtocolHeartbeat)
	if err != nil {
		return false
	}
	// Negotiation succeeded — tear the stream down immediately.
	probe.Reset()
	return true
}
// queryWitnesses contacts each witness in parallel, collects their view of the
// indexer, and updates score.witnessChecked / score.witnessConsistent.
// Called in a goroutine — must not hold any lock.
// NOTE(review): score fields are mutated here without synchronization while
// the heartbeat loop may also touch the same *Score — confirm this is safe
// (or intended, given GetScore hands out copies).
func queryWitnesses(h host.Host, indexerPeerID string, indexerBornAt time.Time, indexerFillRate float64, witnesses []pp.AddrInfo, score *Score) {
	logger := oclib.GetLogger()
	type result struct{ consistent bool }
	// Buffered to len(witnesses) so no probe goroutine blocks on send.
	results := make(chan result, len(witnesses))
	for _, ai := range witnesses {
		if ai.ID == h.ID() {
			// Never query ourselves — skip and count as inconclusive.
			results <- result{}
			continue
		}
		go func(ai pp.AddrInfo) {
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			s, err := h.NewStream(ctx, ai.ID, ProtocolWitnessQuery)
			if err != nil {
				results <- result{}
				return
			}
			defer s.Close()
			s.SetDeadline(time.Now().Add(5 * time.Second))
			if err := json.NewEncoder(s).Encode(WitnessRequest{IndexerPeerID: indexerPeerID}); err != nil {
				results <- result{}
				return
			}
			var rep WitnessReport
			if err := json.NewDecoder(s).Decode(&rep); err != nil || !rep.Seen {
				results <- result{}
				return
			}
			// BornAt must be identical (fixed timestamp).
			bornAtOK := !rep.BornAt.IsZero() && rep.BornAt.Equal(indexerBornAt)
			// FillRate coherent within ±25% (it fluctuates normally).
			diff := rep.FillRate - indexerFillRate
			if diff < 0 {
				diff = -diff
			}
			fillOK := diff < 0.25
			consistent := bornAtOK && fillOK
			logger.Debug().
				Str("witness", ai.ID.String()).
				Bool("bornAt_ok", bornAtOK).
				Bool("fill_ok", fillOK).
				Msg("witness report")
			results <- result{consistent: consistent}
		}(ai)
	}
	// Drain exactly one result per witness (errors count as inconclusive).
	checked, consistent := 0, 0
	for range witnesses {
		r := <-results
		checked++
		if r.consistent {
			consistent++
		}
	}
	if checked == 0 {
		return
	}
	score.witnessChecked += checked
	score.witnessConsistent += consistent
}

View File

@@ -0,0 +1,588 @@
package common
import (
"context"
"encoding/json"
"fmt"
"math/rand"
"strings"
"sync/atomic"
"time"
"oc-discovery/conf"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// TimeWatcher records when ConnectToIndexers was last invoked (UTC).
var TimeWatcher time.Time

// retryRunning guards against launching multiple retryUntilSeedResponds goroutines.
var retryRunning atomic.Bool
// ConnectToIndexers bootstraps the indexer pool from the configured seed
// addresses, starts the long-lived heartbeat loop, registers a notifee that
// promotes inbound indexer peers into the pool, and launches proactive DHT
// discovery. recordFn, when given, supplies a fresh signed PeerRecord that
// is embedded in each heartbeat.
// Returns an error when fewer than minIndexer seed addresses could be parsed.
func ConnectToIndexers(h host.Host, minIndexer int, maxIndexer int, recordFn ...func() json.RawMessage) error {
	TimeWatcher = time.Now().UTC()
	logger := oclib.GetLogger()
	// Bootstrap from IndexerAddresses seed set.
	addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
	if len(addresses) > maxIndexer {
		addresses = addresses[0:maxIndexer]
	}
	for _, indexerAddr := range addresses {
		indexerAddr = strings.TrimSpace(indexerAddr)
		if indexerAddr == "" {
			continue
		}
		ad, err := pp.AddrInfoFromString(indexerAddr)
		if err != nil {
			// Fix: a bare logger.Err(err) builds an event but never emits it
			// (zerolog-style loggers require Msg/Send to fire).
			logger.Err(err).Str("addr", indexerAddr).Msg("invalid indexer address, skipping")
			continue
		}
		key := ad.ID.String()
		Indexers.SetAddr(key, ad)
		// Pre-create score entry with IsSeed=true so the sticky flag is set before
		// the first heartbeat tick (lazy creation in doTick would lose the flag).
		if !Indexers.ExistsScore(key) {
			Indexers.SetScore(key, &Score{
				FirstContacted: time.Now().UTC(),
				UptimeTracker:  &UptimeTracker{FirstSeen: time.Now().UTC()},
				nextChallenge:  rand.Intn(10) + 1,
				IsSeed:         true,
			})
		}
	}
	seeds := Indexers.GetAddrs()
	indexerCount := len(seeds)
	if indexerCount < minIndexer {
		// Fix: typo ("your gonna") and trailing period removed — Go error
		// strings are unpunctuated (go vet / ST1005).
		return fmt.Errorf("you run a node without indexers: you're gonna be isolated")
	}
	// Start long-lived heartbeat to seed indexers. The single goroutine follows
	// all subsequent StaticIndexers changes.
	SendHeartbeat(context.Background(), ProtocolHeartbeat, conf.GetConfig().Name,
		h, Indexers, 20*time.Second, maxIndexer, recordFn...)
	// Watch for inbound connections: if a peer connects to us and our pool has
	// room, probe it first to confirm it supports ProtocolHeartbeat (i.e. it is
	// an indexer). Plain nodes don't register the handler — the negotiation fails
	// instantly so we never pollute the pool with non-indexer peers.
	h.Network().Notify(&network.NotifyBundle{
		ConnectedF: func(n network.Network, c network.Conn) {
			if c.Stat().Direction != network.DirInbound {
				return
			}
			if len(Indexers.GetAddrs()) >= maxIndexer {
				return
			}
			peerID := c.RemotePeer()
			if Indexers.ExistsAddr(peerID.String()) {
				return
			}
			// Probe in a goroutine — ConnectedF must not block.
			go func(pid pp.ID) {
				if !SupportsHeartbeat(h, pid) {
					return // plain node, skip
				}
				// Re-check after the probe: pool may have filled meanwhile.
				if len(Indexers.GetAddrs()) >= maxIndexer {
					return
				}
				if Indexers.ExistsAddr(pid.String()) {
					return
				}
				addrs := h.Peerstore().Addrs(pid)
				if len(addrs) == 0 {
					return
				}
				ai := FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
				if len(ai.Addrs) == 0 {
					return
				}
				adCopy := ai
				Indexers.SetAddr(pid.String(), &adCopy)
				Indexers.NudgeIt()
				log := oclib.GetLogger()
				log.Info().Str("peer", pid.String()).
					Msg("[pool] inbound indexer peer added as candidate")
			}(peerID)
		},
	})
	// Proactive DHT upgrade: once seeds are connected and the DHT routing table
	// is warm, discover better indexers and add them to the pool alongside the seeds.
	// Seeds stay as guaranteed anchors; scoring will demote poor performers over time.
	go func(seeds []Entry) {
		// Let seed connections establish and the DHT routing table warm up.
		time.Sleep(5 * time.Second)
		// For pure nodes (no IndexerService), spin up a lightweight DHT client.
		if discoveryDHT == nil {
			if len(seeds) == 0 {
				return
			}
			initNodeDHT(h, seeds)
		}
		if discoveryDHT == nil {
			return
		}
		current := len(Indexers.GetAddrs())
		need := maxIndexer - current
		if need <= 0 {
			need = maxIndexer / 2 // diversify even when pool is already at capacity
		}
		logger.Info().Int("need", need).Msg("[dht] proactive indexer discovery from DHT")
		replenishIndexersFromDHT(h, need)
	}(seeds)
	return nil
}
// reconnectToSeeds re-adds the configured seed indexers to StaticIndexers as
// sticky fallback entries. Called when the pool drops to zero so the node
// never becomes completely isolated.
func reconnectToSeeds() {
	logger := oclib.GetLogger()
	logger.Warn().Msg("[pool] all indexers lost, reconnecting to configured seeds")
	addresses := strings.Split(conf.GetConfig().IndexerAddresses, ",")
	for _, addrStr := range addresses {
		addrStr = strings.TrimSpace(addrStr)
		if addrStr == "" {
			continue
		}
		ad, err := pp.AddrInfoFromString(addrStr)
		if err != nil {
			continue
		}
		key := ad.ID.String()
		Indexers.SetAddr(key, ad)
		if score := Indexers.GetScore(key); score == nil {
			Indexers.SetScore(key, &Score{
				FirstContacted: time.Now().UTC(),
				UptimeTracker:  &UptimeTracker{FirstSeen: time.Now().UTC()},
				nextChallenge:  rand.Intn(10) + 1,
				IsSeed:         true,
			})
		} else {
			// Restore sticky flag so the seed is not immediately re-ejected.
			// Fix: GetScore returns a COPY of the stored Score, so mutating
			// it alone is a silent no-op — write the modified copy back.
			score.IsSeed = true
			Indexers.SetScore(key, score)
		}
	}
}
// retryUntilSeedResponds loops with exponential backoff until at least one
// configured seed is reachable again. Once seeds are back in the pool it
// nudges the heartbeat loop and lets the normal DHT upgrade path take over.
// Should be called in a goroutine — it blocks until the situation resolves.
// When no seeds are configured it just waits for an inbound indexer to
// refill the pool (it does NOT panic, despite what an earlier comment said).
func retryUntilSeedResponds() {
	// Singleflight: only one retry loop at a time.
	if !retryRunning.CompareAndSwap(false, true) {
		return // another goroutine is already running the retry loop
	}
	defer retryRunning.Store(false)
	logger := oclib.GetLogger()
	rawAddresses := strings.TrimSpace(conf.GetConfig().IndexerAddresses)
	if rawAddresses == "" {
		// No seeds configured: rely on the inbound-connection notifee to fill
		// the pool. Just wait patiently — the loop below will return as soon
		// as any peer connects and NudgeIt() is called.
		logger.Warn().Msg("[pool] pool empty and no seeds configured — waiting for inbound indexer")
	}
	backoff := 10 * time.Second
	const maxBackoff = 5 * time.Minute
	for {
		time.Sleep(backoff)
		// Double the backoff for the NEXT round, capped at maxBackoff.
		if backoff < maxBackoff {
			backoff *= 2
		}
		// Check whether someone else already refilled the pool.
		if len(Indexers.GetAddrs()) > 0 {
			logger.Info().Msg("[pool] pool refilled externally, stopping seed retry")
			return
		}
		logger.Warn().Dur("backoff", backoff).Msg("[pool] still isolated, retrying seeds")
		reconnectToSeeds()
		if len(Indexers.GetAddrs()) > 0 {
			Indexers.NudgeIt()
			// Re-bootstrap DHT now that we have at least one connection candidate.
			if discoveryDHT != nil {
				ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
				discoveryDHT.Bootstrap(ctx) //nolint:errcheck
				cancel()
			}
			return
		}
	}
}
// ensureScore returns the Score for addr, creating it if absent.
// NOTE(review): GetScore returns a COPY of the stored Score, so mutations
// made by the caller on the returned value do not persist to the directory
// unless written back via SetScore — confirm this is intended.
func ensureScore(d *Directory, addr string) *Score {
	if !d.ExistsScore(addr) {
		d.SetScore(addr, &Score{
			FirstContacted: time.Now().UTC(),
			UptimeTracker:  &UptimeTracker{FirstSeen: time.Now().UTC()},
			nextChallenge:  rand.Intn(10) + 1,
		})
	}
	return d.GetScore(addr)
}
// evictPeer removes addr from the directory and returns a snapshot of the
// remaining AddrInfos (for consensus voter selection). The stream for id
// under proto is closed as part of the eviction.
func evictPeer(d *Directory, addr string, id pp.ID, proto protocol.ID) []pp.AddrInfo {
	d.Streams.Delete(proto, &id)
	d.DeleteAddr(addr)
	// Fix: snapshot first via the locked accessor. The previous version read
	// len(d.Addrs) directly without holding MuAddr — a data race with any
	// concurrent writer.
	entries := d.GetAddrs()
	voters := make([]pp.AddrInfo, 0, len(entries))
	for _, e := range entries {
		if e.Info != nil {
			voters = append(voters, *e.Info)
		}
	}
	d.DeleteScore(addr)
	return voters
}
// handleSuggestions adds unknown suggested indexers to the directory and
// nudges the heartbeat loop if anything new was added. from identifies the
// suggesting indexer (for logging only).
func handleSuggestions(d *Directory, from string, suggestions []pp.AddrInfo) {
	newCount := 0
	for _, suggestion := range suggestions {
		key := addrKey(suggestion)
		if d.ExistsAddr(key) {
			continue
		}
		cpy := suggestion
		d.SetAddr(key, &cpy)
		newCount++
	}
	if newCount == 0 {
		return
	}
	logger := oclib.GetLogger()
	logger.Info().Int("added", newCount).Str("from", from).
		Msg("added suggested indexers from heartbeat response")
	d.NudgeIt()
}
// SendHeartbeat starts a goroutine that sends periodic heartbeats to peers.
// recordFn, when provided, is called on each tick and its output is embedded in
// the heartbeat as a fresh signed PeerRecord so the receiving indexer can
// republish it to the DHT without an extra round-trip.
// Pass no recordFn (or nil) for indexer→indexer / native heartbeats.
// The goroutine runs until ctx is cancelled; directory.Nudge triggers an
// immediate out-of-cycle tick (indexer heartbeats only).
func SendHeartbeat(ctx context.Context, proto protocol.ID, name string, h host.Host, directory *Directory, interval time.Duration, maxPool int, recordFn ...func() json.RawMessage) {
	logger := oclib.GetLogger()
	// Scoring / challenges / eviction only apply when heartbeating the
	// global Indexers directory.
	isIndexerHB := directory == Indexers
	var recFn func() json.RawMessage
	if len(recordFn) > 0 {
		recFn = recordFn[0]
	}
	go func() {
		logger.Info().Str("proto", string(proto)).Int("peers", len(directory.Addrs)).Msg("heartbeat started")
		t := time.NewTicker(interval)
		defer t.Stop()
		// peerEntry pairs addr key with AddrInfo so doTick can update score maps directly.
		// NOTE(review): peerEntry appears unused below — candidate for removal.
		type peerEntry struct {
			addr string
			ai   *pp.AddrInfo
		}
		doTick := func() {
			addrs := directory.GetAddrsStr()
			// Need = remaining pool capacity, clamped to >= 0.
			need := maxPool - len(addrs)
			if need < 0 {
				need = 0
			}
			baseHB := Heartbeat{
				Name:           name,
				PeerID:         h.ID().String(),
				Timestamp:      time.Now().UTC().Unix(),
				IndexersBinded: addrs,
				Need:           need,
			}
			if recFn != nil {
				baseHB.Record = recFn()
			}
			// Determine the referent indexer: highest-scored one receives Referent=true
			// so it stores us in its referencedNodes for distributed search.
			var referentAddr string
			if isIndexerHB {
				var bestScore float64 = -1
				for _, ai2 := range directory.GetAddrs() {
					if s := directory.GetScore(ai2.Addr); s != nil && s.Score > bestScore {
						bestScore = s.Score
						referentAddr = ai2.Addr
					}
				}
			}
			for _, ai := range directory.GetAddrs() {
				// Build per-peer heartbeat copy so challenge injection is peer-specific.
				hb := baseHB
				if isIndexerHB && referentAddr != "" && ai.Addr == referentAddr {
					hb.Referent = true
				}
				// Ensure an IndexerScore entry exists for this peer.
				// NOTE(review): ensureScore -> GetScore returns a COPY; the
				// mutations to `score` below may never reach directory.Scores
				// unless something writes it back — verify persistence.
				var score *Score
				if isIndexerHB {
					score = ensureScore(directory, ai.Addr)
					// Inject challenge batch if due (random 1-10 HBs between batches).
					score.hbCount++
					if score.hbCount >= score.nextChallenge {
						// Ground truth: node's own PeerID — indexer MUST have us.
						challenges := []string{h.ID().String()}
						// Add up to 2 more known peers (other indexers) for richer data.
						// Use the already-snapshotted entries to avoid re-locking.
						for _, ai2 := range directory.GetAddrs() {
							if ai2.Addr != ai.Addr && ai2.Info != nil {
								challenges = append(challenges, ai2.Info.ID.String())
								if len(challenges) >= 3 {
									break
								}
							}
						}
						hb.Challenges = challenges
						score.hbCount = 0
						score.nextChallenge = rand.Intn(10) + 1
						score.challengeTotal++ // count own-PeerID challenge (ground truth)
						score.dhtBatchCounter++
						// DHT challenge every 5th batch: ask indexer to retrieve our own DID.
						if score.dhtBatchCounter%5 == 0 {
							var selfDID string
							if len(baseHB.Record) > 0 {
								var partial struct {
									DID string `json:"did"`
								}
								if json.Unmarshal(baseHB.Record, &partial) == nil {
									selfDID = partial.DID
								}
							}
							if selfDID != "" {
								hb.ChallengeDID = selfDID
							}
						}
					}
				}
				// NOTE(review): interval*time.Second multiplies Duration by
				// Duration (20s × 1e9 ns) — this overflows int64; the intended
				// timeout was probably just `interval`.
				resp, rtt, err := sendHeartbeat(ctx, h, proto, ai.Info, hb, directory.Streams, interval*time.Second)
				if err != nil { // Heartbeat fails
					// NOTE(review): debug leftover — should go through logger.
					fmt.Println("EERR", err)
					HeartbeatFailure(h, proto, directory, ai.Addr, ai.Info, isIndexerHB, maxPool, err)
					continue
				}
				// Update IndexerScore — uptime recorded on any successful send,
				// even if the indexer does not support bidirectional heartbeat (Fix 1).
				if isIndexerHB && score != nil {
					score.UptimeTracker.RecordHeartbeat()
					// Latency score: 1 - rtt/maxRTT, clamped to [0,1].
					maxRTT := BaseRoundTrip * 10
					latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
					if latencyScore < 0 {
						latencyScore = 0
					}
					if latencyScore > 1 {
						latencyScore = 1
					}
					// Update fill / challenge fields only when the indexer responded.
					if resp != nil {
						// BornAt stability check.
						if score.LastBornAt.IsZero() {
							score.LastBornAt = resp.BornAt
						} else if !resp.BornAt.IsZero() && !resp.BornAt.Equal(score.LastBornAt) {
							score.bornAtChanges++
							score.LastBornAt = resp.BornAt
							logger.Warn().Str("peer", ai.Info.ID.String()).
								Int("changes", score.bornAtChanges).
								Msg("indexer BornAt changed — possible restart or impersonation")
						}
						score.LastFillRate = resp.FillRate
						// Fill rate consistency: cross-check peerCount/maxNodes vs reported fillRate.
						if resp.MaxNodes > 0 {
							expected := float64(resp.PeerCount) / float64(resp.MaxNodes)
							diff := expected - resp.FillRate
							if diff < 0 {
								diff = -diff
							}
							score.fillChecked++
							if diff < 0.1 {
								score.fillConsistent++
							}
						}
						// Validate challenge responses. Only own-PeerID counts as ground truth.
						if len(hb.Challenges) > 0 && len(resp.Challenges) > 0 {
							ownID := h.ID().String()
							for _, ce := range resp.Challenges {
								if ce.PeerID != ownID {
									continue // informational only
								}
								recentEnough := !ce.LastSeen.IsZero() &&
									time.Since(ce.LastSeen) < 2*RecommendedHeartbeatInterval
								if ce.Found && recentEnough {
									score.challengeCorrect++
								}
								logger.Info().Str("peer", ai.Info.ID.String()).
									Bool("found", ce.Found).
									Bool("recent", recentEnough).
									Msg("own-PeerID challenge result")
								break
							}
						}
						// DHT challenge result.
						if hb.ChallengeDID != "" {
							score.dhtChecked++
							if resp.DHTFound {
								score.dhtSuccess++
							}
						}
						// Refresh local witness cache for indirect probing on future failure.
						for _, w := range resp.Witnesses {
							score.UpdateWitnessPool(w)
						}
						// Launch witness cross-check asynchronously (must not hold lock).
						if len(resp.Witnesses) > 0 {
							go queryWitnesses(h, ai.Info.ID.String(), resp.BornAt, resp.FillRate, resp.Witnesses, score)
						} else if resp.MaxNodes > 0 {
							// No witnesses offered. Valid if indexer only has us (PeerCount==1).
							// Cross-check: FillRate should equal 1/MaxNodes within ±10%.
							expected := 1.0 / float64(resp.MaxNodes)
							diff := resp.FillRate - expected
							if diff < 0 {
								diff = -diff
							}
							score.witnessChecked++
							if resp.PeerCount == 1 && diff < 0.1 {
								score.witnessConsistent++
							}
						}
					}
					score.Score = score.ComputeNodeSideScore(latencyScore)
					age := score.UptimeTracker.Uptime()
					minScore := dynamicMinScore(age)
					// Fix 4: grace period — at least 2 full heartbeat cycles before ejecting.
					isSeed := score.IsSeed
					// Seeds are sticky: never evicted by score alone (SuggestMigrate handles it).
					// Never eject the last indexer by score alone — we would lose all connectivity.
					// NOTE(review): len(directory.Addrs) is read without MuAddr here.
					belowThreshold := score.Score < minScore &&
						score.UptimeTracker.TotalOnline >= 2*RecommendedHeartbeatInterval &&
						!isSeed &&
						len(directory.Addrs) > 1
					if belowThreshold {
						logger.Info().Str("peer", ai.Info.ID.String()).
							Float64("score", score.Score).Float64("min", minScore).
							Msg("indexer score below threshold, removing from pool")
						voters := evictPeer(directory, ai.Addr, ai.Info.ID, proto)
						need := max(maxPool-len(voters), 1)
						if len(voters) > 0 {
							go TriggerConsensus(h, voters, need)
						} else {
							go replenishIndexersFromDHT(h, need)
						}
					}
					// Accept suggestions from this indexer — add unknown ones to the directory.
					if resp != nil && len(resp.Suggestions) > 0 {
						handleSuggestions(directory, ai.Info.ID.String(), resp.Suggestions)
					}
					// Handle SuggestMigrate: indexer is overloaded and wants us to move.
					if resp != nil && resp.SuggestMigrate && isIndexerHB {
						nonSeedCount := 0
						for _, sc := range directory.GetScores() {
							if !sc.IsSeed {
								nonSeedCount++
							}
						}
						if nonSeedCount >= conf.GetConfig().MinIndexer {
							if isSeed {
								// Seed has offloaded us: clear sticky flag, score eviction takes over.
								// NOTE(review): score is a copy — this flag change may not persist.
								score.IsSeed = false
								logger.Info().Str("peer", ai.Info.ID.String()).
									Msg("seed discharged via SuggestMigrate, de-stickied")
							} else {
								evictPeer(directory, ai.Addr, ai.Info.ID, proto)
								logger.Info().Str("peer", ai.Info.ID.String()).Msg("accepted migration from overloaded indexer")
							}
						}
					}
				}
			}
		}
		for {
			select {
			case <-t.C:
				doTick()
			case <-directory.Nudge:
				// Nudges only matter for the indexer pool heartbeat.
				if isIndexerHB {
					logger.Info().Msg("nudge received, heartbeating new indexers immediately")
					doTick()
				}
			case <-ctx.Done():
				return
			}
		}
	}()
}
// HeartbeatFailure handles one failed heartbeat toward addr/info.
// Seeds are kept in the pool (the regular ticker retries them); other indexers
// are first probed indirectly through cached witnesses and, only if still
// unreachable, evicted. For indexer heartbeats the pool is then replenished
// via consensus voting, DHT discovery, or — when the pool is empty — the
// seed retry loop.
func HeartbeatFailure(h host.Host, proto protocol.ID, directory *Directory,
	addr string, info *pp.AddrInfo, isIndexerHB bool, maxPool int, err error) {
	logger := oclib.GetLogger()
	// Bug fix: a zerolog event is a no-op until a Msg* terminator runs, so the
	// original bare `logger.Err(err)` never emitted anything.
	logger.Err(err).Msg("heartbeat failure")
	// Seeds are never evicted on heartbeat failure.
	// Keeping them in the pool lets the regular 60-second ticker retry them
	// at a natural cadence — no reconnect storm, no libp2p dial-backoff accumulation.
	// A seed will self-heal once it comes back; DHT and inbound peers fill the gap.
	if isIndexerHB {
		if score := directory.GetScore(addr); score != nil {
			if score.IsSeed {
				logger.Warn().Str("peer", info.ID.String()).
					Msg("[pool] seed heartbeat failed — keeping in pool, ticker will retry " + err.Error())
				return
			}
			// Indirect probe: query cached witnesses before declaring the indexer dead.
			// If a witness confirms it is alive, the failure is a local asymmetric
			// link — not the indexer. Skip eviction; next tick will retry directly.
			if len(score.WitnessPool) > 0 {
				pool := append([]WitnessCacheEntry(nil), score.WitnessPool...)
				if IndirectProbeIndexer(h, info.ID.String(), pool) {
					logger.Warn().Str("peer", info.ID.String()).
						Msg("[indirect] witness confirms indexer alive — asymmetric link, skipping eviction " + err.Error())
					return
				}
			}
		}
	}
	logger.Info().Str("peer", info.ID.String()).Str("proto", string(proto)).
		Msg("heartbeat failed, removing peer from pool : " + err.Error())
	consensusVoters := evictPeer(directory, addr, info.ID, proto)
	if !isIndexerHB {
		return
	}
	// Always request at least one replacement (consistent with the score-eviction
	// path, which uses the same max() expression).
	need := max(maxPool-len(consensusVoters), 1)
	logger.Info().Int("remaining", len(consensusVoters)).Int("need", need).Msg("pool state after removal")
	poolSize := len(directory.GetAddrs())
	switch {
	case poolSize == 0:
		// Pool is truly empty (no seeds configured or no seeds in pool).
		// Start the backoff retry loop — it will re-add seeds and nudge
		// only once a seed actually responds.
		go retryUntilSeedResponds()
	case len(consensusVoters) > 0:
		go TriggerConsensus(h, consensusVoters, need)
	default:
		go replenishIndexersFromDHT(h, need)
	}
}

View File

@@ -0,0 +1,182 @@
package common
import (
"context"
cr "crypto/rand"
"io"
"net"
"slices"
"time"
"github.com/libp2p/go-libp2p/core/host"
pp "github.com/libp2p/go-libp2p/core/peer"
)
// MaxExpectedMbps normalises a measured throughput into a [0,1] bandwidth score.
const MaxExpectedMbps = 100.0
// MinPayloadChallenge and MaxPayloadChallenge bound the random probe payload size, in bytes.
const MinPayloadChallenge = 512
const MaxPayloadChallenge = 2048
// BaseRoundTrip is the latency floor used to derive the maximum acceptable round trip.
const BaseRoundTrip = 400 * time.Millisecond
// UptimeTracker accumulates gap-aware online time for a single peer across
// successive heartbeats.
type UptimeTracker struct {
	FirstSeen   time.Time     // when tracking started for this peer
	LastSeen    time.Time     // timestamp of the most recent recorded heartbeat
	TotalOnline time.Duration // sum of heartbeat gaps short enough to count as continuous uptime
}
// RecordHeartbeat credits the time elapsed since the previous heartbeat to
// TotalOnline, but only when that gap is at most twice the recommended
// heartbeat interval — a longer gap means the peer was offline in between.
// Call once per successfully processed heartbeat.
func (u *UptimeTracker) RecordHeartbeat() {
	now := time.Now().UTC()
	if last := u.LastSeen; !last.IsZero() {
		if gap := now.Sub(last); gap <= 2*RecommendedHeartbeatInterval {
			u.TotalOnline += gap
		}
	}
	u.LastSeen = now
}
// Uptime returns how long the peer has been tracked, measured from FirstSeen
// to now (wall-clock lifetime — not the accumulated online time in TotalOnline).
func (u *UptimeTracker) Uptime() time.Duration {
	return time.Since(u.FirstSeen)
}
// UptimeRatio returns the fraction of the tracked lifetime spent online
// (heartbeat gaps ≤ 2×RecommendedHeartbeatInterval), clamped to [0,1].
// Yields 0 while no time has elapsed since FirstSeen.
func (u *UptimeTracker) UptimeRatio() float64 {
	lifetime := time.Since(u.FirstSeen)
	if lifetime <= 0 {
		return 0
	}
	return min(float64(u.TotalOnline)/float64(lifetime), 1)
}
// IsEligible reports whether the peer has been tracked for at least min.
// Based on wall-clock lifetime (Uptime), not accumulated online time.
func (u *UptimeTracker) IsEligible(min time.Duration) bool {
	return u.Uptime() >= min
}
// getBandwidthChallengeRate opens a dedicated ProtocolBandwidthProbe stream to
// remotePeer, sends a random payload, reads the echo, and computes throughput
// and a latency score. Returns (ok, bandwidthScore, latencyScore, error) where
// bandwidthScore is mbps normalised by MaxExpectedMbps and latencyScore is 1.0
// for a near-instant round trip down to 0.0 at/beyond maxRoundTrip.
// Using a separate stream avoids mixing binary data on the JSON heartbeat
// stream and ensures the echo handler is actually running on the remote side.
func getBandwidthChallengeRate(h host.Host, remotePeer pp.ID, payloadSize int) (bool, float64, float64, error) {
	payload := make([]byte, payloadSize)
	if _, err := cr.Read(payload); err != nil {
		return false, 0, 0, err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	s, err := h.NewStream(ctx, remotePeer, ProtocolBandwidthProbe)
	if err != nil {
		return false, 0, 0, err
	}
	// Reset (not Close) so an aborted probe never leaves the remote blocked.
	defer s.Reset()
	if err := s.SetDeadline(time.Now().Add(10 * time.Second)); err != nil {
		return false, 0, 0, err
	}
	start := time.Now()
	if _, err = s.Write(payload); err != nil {
		return false, 0, 0, err
	}
	// Half-close the write side so the handler's io.Copy sees EOF and stops.
	s.CloseWrite()
	// Read the echo back in full.
	response := make([]byte, payloadSize)
	if _, err = io.ReadFull(s, response); err != nil {
		return false, 0, 0, err
	}
	duration := time.Since(start)
	// NOTE(review): 100ms *per byte* makes maxRoundTrip enormous (≈51s at the
	// minimum payload of 512 bytes), so the duration check below can in practice
	// only be tripped via the 10s stream deadline — confirm whether a per-KB
	// factor was intended.
	maxRoundTrip := BaseRoundTrip + time.Duration(payloadSize)*(100*time.Millisecond)
	mbps := float64(payloadSize*8) / duration.Seconds() / 1e6
	// latencyScore: 1.0 = instant, 0.0 = at maxRoundTrip or beyond.
	latencyScore := 1.0 - float64(duration)/float64(maxRoundTrip)
	if latencyScore < 0 {
		latencyScore = 0
	} else if latencyScore > 1 {
		latencyScore = 1
	}
	// mbps is already a float64; the original wrapped the division in a
	// redundant float64() conversion.
	bandwidthScore := mbps / MaxExpectedMbps
	ok := duration <= maxRoundTrip && mbps >= 5.0
	return ok, bandwidthScore, latencyScore, nil
}
// getDiversityRate returns the fraction of alive peers that occupy distinct
// /24 subnets. Returns 1 when there is nothing to measure (no alive peers or
// no extractable IPs).
func getDiversityRate(h host.Host, peers []string) float64 {
	alive, _ := checkPeers(h, peers)
	subnets := []string{}
	for _, addr := range alive {
		ip, err := ExtractIP(addr)
		if err != nil {
			continue
		}
		key := ip.Mask(net.CIDRMask(24, 32)).String()
		if !slices.Contains(subnets, key) {
			subnets = append(subnets, key)
		}
	}
	if len(subnets) == 0 || len(alive) == 0 {
		return 1
	}
	return float64(len(subnets)) / float64(len(alive))
}
// getOwnDiversityRate measures subnet /24 diversity of the indexer's own connected peers.
// This evaluates the indexer's network position rather than the connecting node's topology.
// Returns distinct-/24-count divided by total known addresses, or 1 when no
// addresses are known at all.
func getOwnDiversityRate(h host.Host) float64 {
	diverse := map[string]struct{}{}
	total := 0
	for _, pid := range h.Network().Peers() {
		for _, maddr := range h.Peerstore().Addrs(pid) {
			// NOTE(review): total is incremented before ExtractIP, so addresses
			// whose IP cannot be extracted still inflate the denominator and
			// lower the ratio — confirm whether they should be skipped instead.
			total++
			ip, err := ExtractIP(maddr.String())
			if err != nil {
				continue
			}
			diverse[ip.Mask(net.CIDRMask(24, 32)).String()] = struct{}{}
		}
	}
	if total == 0 {
		return 1
	}
	return float64(len(diverse)) / float64(total)
}
// checkPeers filters peers down to those that answer a liveness probe.
// It returns the alive addresses together with the /24 subnet of each alive
// peer that has an extractable IP.
func checkPeers(h host.Host, peers []string) ([]string, []string) {
	alive := []string{}
	subnets := []string{}
	for _, raw := range peers {
		info, err := pp.AddrInfoFromString(raw)
		if err != nil {
			continue
		}
		if !PeerIsAlive(h, *info) {
			continue
		}
		alive = append(alive, raw)
		if ip, err := ExtractIP(raw); err == nil {
			subnets = append(subnets, ip.Mask(net.CIDRMask(24, 32)).String())
		}
	}
	return alive, subnets
}
// dynamicMinScore returns the minimum acceptable score for a peer, starting
// permissive (20%) for brand-new peers and hardening linearly to 80% over 24h.
// This prevents ejecting newcomers in fresh networks while filtering parasites.
func dynamicMinScore(age time.Duration) float64 {
hours := age.Hours()
score := 20.0 + 60.0*(hours/24.0)
if score > 80.0 {
score = 80.0
}
return score
}

View File

@@ -0,0 +1,302 @@
package common
import (
"encoding/json"
"errors"
"fmt"
"io"
"math/rand"
"strings"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// LongLivedStreamRecordedService extends the pub/sub service with one live
// StreamRecord per connected peer per protocol, plus optional policy hooks
// invoked at well-defined points of the heartbeat lifecycle.
type LongLivedStreamRecordedService[T interface{}] struct {
	*LongLivedPubSubService
	// StreamRecords maps protocol → peer → live stream record. Guarded by StreamMU.
	StreamRecords map[protocol.ID]map[pp.ID]*StreamRecord[T]
	StreamMU      sync.RWMutex
	// maxNodesConn caps how many peers may be connected at once.
	maxNodesConn int
	// AllowInbound, when set, is called once at stream open before any heartbeat
	// is decoded. remotePeer is the connecting peer; isNew is true when no
	// StreamRecord exists yet (first-ever connection). Return a non-nil error
	// to immediately reset the stream and refuse the peer.
	AllowInbound func(remotePeer pp.ID, isNew bool) error
	// ValidateHeartbeat, when set, is called inside the heartbeat loop after
	// each successful CheckHeartbeat decode. Return a non-nil error to reset
	// the stream and terminate the session.
	ValidateHeartbeat func(remotePeer pp.ID) error
	// AfterHeartbeat is called after each successful heartbeat with the full
	// decoded Heartbeat so the hook can use the fresh embedded PeerRecord.
	AfterHeartbeat func(hb *Heartbeat)
	// AfterDelete is called after gc() evicts an expired peer, outside the lock.
	// name and did may be empty if the HeartbeatStream had no metadata.
	AfterDelete func(pid pp.ID, name string, did string)
	// BuildHeartbeatResponse, when set, is called after each successfully decoded
	// heartbeat to build the response sent back to the node.
	// remotePeer is the peer that sent the heartbeat (used for offload routing).
	// need is how many more indexers the node wants (from hb.Need).
	// referent is true when the node designated this indexer as its search referent.
	BuildHeartbeatResponse func(remotePeer pp.ID, need int, challenges []string, challengeDID string, referent bool) *HeartbeatResponse
}
// MaxNodesConn returns the configured cap on concurrently connected nodes.
func (ix *LongLivedStreamRecordedService[T]) MaxNodesConn() int {
	return ix.maxNodesConn
}
// NewStreamRecordedService builds a stream-recorded service on top of the
// long-lived pub/sub service and starts its background maintenance loops:
// a GC sweep every 30 seconds and an hourly snapshot log.
func NewStreamRecordedService[T interface{}](h host.Host, maxNodesConn int) *LongLivedStreamRecordedService[T] {
	service := &LongLivedStreamRecordedService[T]{
		LongLivedPubSubService: NewLongLivedPubSubService(h),
		StreamRecords:          map[protocol.ID]map[pp.ID]*StreamRecord[T]{},
		maxNodesConn:           maxNodesConn,
	}
	// StartGC and Snapshot each spawn their own goroutine internally; the
	// original launched them with an extra `go`, leaking a goroutine that
	// exited immediately.
	// Garbage collection is needed on every map of long-lived streams — it may
	// be a candidate for a top-level redesign.
	service.StartGC(30 * time.Second)
	service.Snapshot(1 * time.Hour)
	return service
}
// StartGC launches a background goroutine that runs a gc() sweep every
// interval for the lifetime of the process.
func (ix *LongLivedStreamRecordedService[T]) StartGC(interval time.Duration) {
	go func() {
		t := time.NewTicker(interval)
		defer t.Stop()
		for range t.C {
			// Leftover debug fmt.Println of the indexer pool removed here.
			ix.gc()
		}
	}()
}
// gc sweeps the heartbeat records and evicts every stale peer: one whose
// stream is missing, past its Expiry, or silent for more than twice the time
// remaining before expiry. Eviction removes the peer from ALL protocol maps;
// AfterDelete hooks run after the lock is released.
func (ix *LongLivedStreamRecordedService[T]) gc() {
	ix.StreamMU.Lock()
	now := time.Now().UTC()
	if ix.StreamRecords[ProtocolHeartbeat] == nil {
		ix.StreamRecords[ProtocolHeartbeat] = map[pp.ID]*StreamRecord[T]{}
		ix.StreamMU.Unlock()
		return
	}
	streams := ix.StreamRecords[ProtocolHeartbeat]
	// gcEntry carries the metadata AfterDelete needs, captured under the lock.
	type gcEntry struct {
		pid  pp.ID
		name string
		did  string
	}
	var evicted []gcEntry
	for pid, rec := range streams {
		hs := rec.HeartbeatStream
		// Bug fix: the original dereferenced rec.HeartbeatStream in the
		// staleness condition BEFORE nil-checking it a few lines later — a nil
		// stream would have panicked. Guard first; a record without a stream
		// carries no liveness information, so treat it as stale.
		stale := hs == nil
		if !stale {
			stale = now.After(hs.Expiry)
		}
		if !stale && hs.UptimeTracker != nil {
			// NOTE(review): the silence window 2*(Expiry-now) shrinks as expiry
			// approaches (and is negative past it) — confirm this is intended.
			stale = now.Sub(hs.UptimeTracker.LastSeen) > 2*hs.Expiry.Sub(now)
		}
		if !stale {
			continue
		}
		name, did := "", ""
		if hs != nil {
			name = hs.Name
			did = hs.DID
		}
		evicted = append(evicted, gcEntry{pid, name, did})
		// Remove the peer from every protocol map, not just heartbeat.
		// (delete on a missing key is a no-op, so no presence check needed.)
		for _, sstreams := range ix.StreamRecords {
			delete(sstreams, pid)
		}
	}
	ix.StreamMU.Unlock()
	if ix.AfterDelete != nil {
		for _, e := range evicted {
			ix.AfterDelete(e.pid, e.name, e.did)
		}
	}
}
// Snapshot starts a background goroutine that periodically logs the DID of
// every recorded stream, for operational visibility.
func (ix *LongLivedStreamRecordedService[T]) Snapshot(interval time.Duration) {
	go func() {
		logger := oclib.GetLogger()
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for range ticker.C {
			for _, rec := range ix.snapshot() {
				logger.Info().Msg(" -> " + rec.DID)
			}
		}
	}()
}
// -------- Snapshot / Query --------
// snapshot returns a flat copy of all stream records across every protocol.
// The walk is read-only, so a read lock suffices (the original took the write
// lock and also pre-sized the slice with the protocol count rather than the
// record count — both fixed here).
func (ix *LongLivedStreamRecordedService[T]) snapshot() []*StreamRecord[T] {
	ix.StreamMU.RLock()
	defer ix.StreamMU.RUnlock()
	var out []*StreamRecord[T]
	for _, streams := range ix.StreamRecords {
		for _, rec := range streams {
			out = append(out, rec)
		}
	}
	return out
}
// HandleHeartbeat owns the full lifetime of one inbound heartbeat stream:
// gate the peer, then loop — decode a heartbeat, score it, update the stream
// record, and answer with a HeartbeatResponse — until the stream dies or a
// policy hook rejects the peer.
func (ix *LongLivedStreamRecordedService[T]) HandleHeartbeat(s network.Stream) {
	logger := oclib.GetLogger()
	defer s.Close()
	// AllowInbound: burst guard + ban check before the first byte is read.
	if ix.AllowInbound != nil {
		remotePeer := s.Conn().RemotePeer()
		ix.StreamMU.RLock()
		_, exists := ix.StreamRecords[ProtocolHeartbeat][remotePeer]
		ix.StreamMU.RUnlock()
		if err := ix.AllowInbound(remotePeer, !exists); err != nil {
			logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("inbound connection refused")
			s.Reset()
			return
		}
	}
	dec := json.NewDecoder(s)
	for {
		// Snapshot the records behind the HeartBeatStreamed interface so
		// CheckHeartbeat does not need the concrete generic map type.
		ix.StreamMU.Lock()
		if ix.StreamRecords[ProtocolHeartbeat] == nil {
			ix.StreamRecords[ProtocolHeartbeat] = map[pp.ID]*StreamRecord[T]{}
		}
		streams := ix.StreamRecords[ProtocolHeartbeat]
		streamsAnonym := map[pp.ID]HeartBeatStreamed{}
		for k, v := range streams {
			streamsAnonym[k] = v
		}
		ix.StreamMU.Unlock()
		pid, hb, err := CheckHeartbeat(ix.Host, s, dec, streamsAnonym, &ix.StreamMU, ix.maxNodesConn)
		if err != nil {
			// Stream-level errors (EOF, reset, closed) mean the connection is gone
			// — exit so the goroutine doesn't spin forever on a dead stream.
			// Metric/policy errors (score too low, too many connections) are transient
			// — those are also stream-terminal since the stream carries one session.
			if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
				strings.Contains(err.Error(), "reset") ||
				strings.Contains(err.Error(), "closed") ||
				strings.Contains(err.Error(), "too many connections") {
				logger.Info().Err(err).Msg("heartbeat stream terminated, closing handler")
				return
			}
			logger.Warn().Err(err).Msg("heartbeat check failed, retrying on same stream")
			continue
		}
		// ValidateHeartbeat: per-tick behavioral check (rate limiting, bans).
		if ix.ValidateHeartbeat != nil {
			if err := ix.ValidateHeartbeat(*pid); err != nil {
				logger.Warn().Err(err).Str("peer", pid.String()).Msg("heartbeat rejected, closing stream")
				s.Reset()
				return
			}
		}
		ix.StreamMU.Lock()
		// if record already seen update last seen
		if rec, ok := streams[*pid]; ok {
			rec.DID = hb.DID
			// Preserve the existing UptimeTracker so TotalOnline accumulates correctly.
			// hb.Stream is a fresh Stream with no UptimeTracker; carry the old one over.
			oldTracker := rec.GetUptimeTracker()
			rec.HeartbeatStream = hb.Stream
			if oldTracker != nil {
				rec.HeartbeatStream.UptimeTracker = oldTracker
			} else {
				rec.HeartbeatStream.UptimeTracker = &UptimeTracker{FirstSeen: time.Now().UTC()}
			}
			rec.HeartbeatStream.UptimeTracker.RecordHeartbeat()
			rec.LastScore = hb.Score
			logger.Info().Msg("A new node is updated : " + pid.String())
		} else {
			tracker := &UptimeTracker{FirstSeen: time.Now().UTC()}
			tracker.RecordHeartbeat()
			hb.Stream.UptimeTracker = tracker
			streams[*pid] = &StreamRecord[T]{
				DID:             hb.DID,
				HeartbeatStream: hb.Stream,
				LastScore:       hb.Score,
			}
			logger.Info().Msg("A new node is subscribed : " + pid.String())
		}
		ix.StreamMU.Unlock()
		// Enrich hb.DID before calling the hook: nodes never set hb.DID directly;
		// extract it from the embedded signed PeerRecord if available, then fall
		// back to the DID stored by handleNodePublish in the stream record.
		if hb.DID == "" && len(hb.Record) > 0 {
			var partial struct {
				DID string `json:"did"`
			}
			if json.Unmarshal(hb.Record, &partial) == nil && partial.DID != "" {
				hb.DID = partial.DID
			}
		}
		if hb.DID == "" {
			// `streams` still references the live map, so this read sees any
			// concurrent updates made since it was fetched above.
			ix.StreamMU.RLock()
			if rec, ok := streams[*pid]; ok {
				hb.DID = rec.DID
			}
			ix.StreamMU.RUnlock()
		}
		if ix.AfterHeartbeat != nil && hb.DID != "" {
			ix.AfterHeartbeat(hb)
		}
		// Send response back to the node (bidirectional heartbeat).
		if ix.BuildHeartbeatResponse != nil {
			if resp := ix.BuildHeartbeatResponse(s.Conn().RemotePeer(), hb.Need, hb.Challenges, hb.ChallengeDID, hb.Referent); resp != nil {
				// NOTE(review): the Encode error is discarded — a failed write is
				// only noticed on the next decode; confirm that is acceptable.
				s.SetWriteDeadline(time.Now().Add(3 * time.Second))
				json.NewEncoder(s).Encode(resp)
				s.SetWriteDeadline(time.Time{})
			}
		}
	}
}
// CheckHeartbeat decodes one heartbeat from the stream, runs a bandwidth and
// latency probe against the sender, recomputes its indexer-side score from
// uptime, bandwidth, subnet diversity, latency, and fill rate, and rejects
// the peer when the score falls below the age-dependent minimum.
// Returns the sender's decoded peer ID and the heartbeat on success.
func CheckHeartbeat(h host.Host, s network.Stream, dec *json.Decoder, streams map[pp.ID]HeartBeatStreamed, lock *sync.RWMutex, maxNodes int) (*pp.ID, *Heartbeat, error) {
	if len(h.Network().Peers()) >= maxNodes {
		return nil, nil, fmt.Errorf("too many connections, try another indexer")
	}
	var hb Heartbeat
	if err := dec.Decode(&hb); err != nil {
		return nil, nil, err
	}
	// Probe with a random payload size. A probe failure is tolerated: bpms and
	// latencyScore stay 0 and simply drag the computed score down.
	payloadSize := MinPayloadChallenge + int(rand.Float64()*(MaxPayloadChallenge-MinPayloadChallenge))
	_, bpms, latencyScore, _ := getBandwidthChallengeRate(h, s.Conn().RemotePeer(), payloadSize)
	pid, err := pp.Decode(hb.PeerID)
	if err != nil {
		return nil, nil, err
	}
	// Pull prior uptime history for this peer, if any. Read-only, so a read
	// lock suffices (the original took the write lock here).
	uptimeRatio := 0.0
	age := time.Duration(0)
	lock.RLock()
	if rec, ok := streams[pid]; ok && rec.GetUptimeTracker() != nil {
		uptimeRatio = rec.GetUptimeTracker().UptimeRatio()
		age = rec.GetUptimeTracker().Uptime()
	}
	lock.RUnlock()
	// E: measure the indexer's own subnet diversity, not the node's view.
	diversity := getOwnDiversityRate(h)
	// fillRate: fraction of indexer capacity used — higher = more peers trust this indexer.
	fillRate := 0.0
	if maxNodes > 0 {
		fillRate = float64(len(h.Network().Peers())) / float64(maxNodes)
		if fillRate > 1 {
			fillRate = 1
		}
	}
	hb.ComputeIndexerScore(uptimeRatio, bpms, diversity, latencyScore, fillRate)
	// B: dynamic minScore — starts at 20% for brand-new peers, ramps to 80% at 24h.
	minScore := dynamicMinScore(age)
	if hb.Score < minScore {
		return nil, nil, errors.New("not enough trusting value")
	}
	hb.Stream = &Stream{
		Name:   hb.Name,
		DID:    hb.DID,
		Stream: s,
		Expiry: time.Now().UTC().Add(2 * time.Minute),
	} // here is the long-lived bidirectional heartbeat.
	// The original returned the stale (always-nil) `err` variable; be explicit.
	return &pid, &hb, nil
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,199 @@
package common
import (
"context"
"encoding/json"
"sort"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
// ProtocolIndexerCandidates is opened by a node toward its remaining indexers
// to request candidate replacement indexers after an ejection event.
const ProtocolIndexerCandidates = "/opencloud/indexer/candidates/1.0"
// IndexerCandidatesRequest is sent by a node to one of its indexers.
// Count is how many candidates are needed.
type IndexerCandidatesRequest struct {
	Count int `json:"count"` // number of candidate indexers requested
}
// IndexerCandidatesResponse carries a random sample of known indexers from
// the responding indexer's DHT cache.
type IndexerCandidatesResponse struct {
	Candidates []pp.AddrInfo `json:"candidates"`
}
// TriggerConsensus asks each remaining indexer for a random pool of candidates,
// scores them asynchronously via a one-shot probe heartbeat, and admits the
// best ones to StaticIndexers. Falls back to DHT replenishment for any gap.
//
// Must be called in a goroutine — it blocks until all probes have returned
// (or timed out), which can take up to ~10s.
func TriggerConsensus(h host.Host, remaining []pp.AddrInfo, need int) {
	if need <= 0 || len(remaining) == 0 {
		return
	}
	logger := oclib.GetLogger()
	logger.Info().Int("voters", len(remaining)).Int("need", need).
		Msg("[consensus] starting indexer candidate consensus")
	// Phase 1 — collect candidates from all remaining indexers in parallel.
	// Every goroutine sends exactly one result (possibly empty), so the merge
	// loop below can safely receive len(remaining) times without hanging.
	type collectResult struct{ candidates []pp.AddrInfo }
	collectCh := make(chan collectResult, len(remaining))
	for _, ai := range remaining {
		go func(ai pp.AddrInfo) {
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()
			s, err := h.NewStream(ctx, ai.ID, ProtocolIndexerCandidates)
			if err != nil {
				collectCh <- collectResult{}
				return
			}
			defer s.Close()
			s.SetDeadline(time.Now().Add(5 * time.Second))
			// Ask for slightly more than needed so scoring has some slack.
			if err := json.NewEncoder(s).Encode(IndexerCandidatesRequest{Count: need + 2}); err != nil {
				collectCh <- collectResult{}
				return
			}
			var resp IndexerCandidatesResponse
			if err := json.NewDecoder(s).Decode(&resp); err != nil {
				collectCh <- collectResult{}
				return
			}
			collectCh <- collectResult{candidates: resp.Candidates}
		}(ai)
	}
	// Merge and deduplicate, excluding indexers already in the pool.
	seen := map[pp.ID]struct{}{}
	for _, ai := range Indexers.GetAddrIDs() {
		seen[ai] = struct{}{}
	}
	var candidates []pp.AddrInfo
	for range remaining {
		r := <-collectCh
		for _, ai := range r.candidates {
			if _, dup := seen[ai.ID]; !dup {
				seen[ai.ID] = struct{}{}
				candidates = append(candidates, ai)
			}
		}
	}
	if len(candidates) == 0 {
		logger.Info().Msg("[consensus] no candidates from voters, falling back to DHT")
		replenishIndexersFromDHT(h, need)
		return
	}
	logger.Info().Int("candidates", len(candidates)).Msg("[consensus] scoring candidates")
	// Phase 2 — score all candidates in parallel via a one-shot probe heartbeat.
	type scoreResult struct {
		ai    pp.AddrInfo
		score float64
	}
	scoreCh := make(chan scoreResult, len(candidates))
	for _, ai := range candidates {
		go func(ai pp.AddrInfo) {
			resp, rtt, err := probeIndexer(h, ai)
			if err != nil {
				// Unreachable candidate: score 0 keeps it below any admission bar.
				scoreCh <- scoreResult{ai: ai, score: 0}
				return
			}
			scoreCh <- scoreResult{ai: ai, score: quickScore(resp, rtt)}
		}(ai)
	}
	results := make([]scoreResult, 0, len(candidates))
	for range candidates {
		results = append(results, <-scoreCh)
	}
	// Sort descending by quick score, admit top `need` above the minimum bar.
	sort.Slice(results, func(i, j int) bool { return results[i].score > results[j].score })
	minQ := dynamicMinScore(0) // fresh peer: threshold starts at 20
	admitted := 0
	for _, res := range results {
		if admitted >= need {
			break
		}
		if res.score < minQ {
			break // sorted desc: everything after is worse
		}
		key := addrKey(res.ai)
		if Indexers.ExistsAddr(key) {
			continue // already in pool (race with heartbeat path)
		}
		cpy := res.ai
		Indexers.SetAddr(key, &cpy)
		admitted++
	}
	if admitted > 0 {
		logger.Info().Int("admitted", admitted).Msg("[consensus] candidates admitted to pool")
		Indexers.NudgeIt()
	}
	// Fill any remaining gap with DHT discovery.
	if gap := need - admitted; gap > 0 {
		logger.Info().Int("gap", gap).Msg("[consensus] gap after consensus, falling back to DHT")
		replenishIndexersFromDHT(h, gap)
	}
}
// probeIndexer dials the candidate, sends one lightweight heartbeat, and
// returns the HeartbeatResponse (nil when the indexer does not answer it)
// together with the observed round-trip time.
func probeIndexer(h host.Host, ai pp.AddrInfo) (*HeartbeatResponse, time.Duration, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
	defer cancel()
	if h.Network().Connectedness(ai.ID) != network.Connected {
		if err := h.Connect(ctx, ai); err != nil {
			return nil, 0, err
		}
	}
	stream, err := h.NewStream(ctx, ai.ID, ProtocolHeartbeat)
	if err != nil {
		return nil, 0, err
	}
	defer stream.Close()
	probe := Heartbeat{PeerID: h.ID().String(), Timestamp: time.Now().UTC().Unix()}
	stream.SetWriteDeadline(time.Now().Add(3 * time.Second))
	if err := json.NewEncoder(stream).Encode(probe); err != nil {
		return nil, 0, err
	}
	stream.SetWriteDeadline(time.Time{})
	sentAt := time.Now()
	stream.SetReadDeadline(time.Now().Add(5 * time.Second))
	resp := &HeartbeatResponse{}
	if err := json.NewDecoder(stream).Decode(resp); err != nil {
		// Indexer connected but no response: the connection itself is the signal.
		return nil, time.Since(sentAt), nil
	}
	return resp, time.Since(sentAt), nil
}
// quickScore computes a lightweight score in [0,100] from a probe result,
// using only fill rate (inverted) and latency — the two signals available
// without a full heartbeat history.
func quickScore(resp *HeartbeatResponse, rtt time.Duration) float64 {
	latency := 1.0 - float64(rtt)/float64(BaseRoundTrip*10)
	if latency < 0 {
		latency = 0
	}
	if resp == nil {
		// Connection worked but no response (old indexer): moderate score.
		return latency * 50
	}
	// Prefer less-loaded indexers by inverting the fill rate.
	return (0.5*latency + 0.5*(1.0-resp.FillRate)) * 100
}

View File

@@ -0,0 +1,219 @@
package common
import (
"context"
"math/rand"
"strings"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"github.com/ipfs/go-cid"
dht "github.com/libp2p/go-libp2p-kad-dht"
"github.com/libp2p/go-libp2p/core/host"
pp "github.com/libp2p/go-libp2p/core/peer"
ma "github.com/multiformats/go-multiaddr"
mh "github.com/multiformats/go-multihash"
)
// FilterLoopbackAddrs strips loopback (127.x, ::1) and unspecified addresses
// from an AddrInfo so we never hand peers an address they cannot dial externally.
func FilterLoopbackAddrs(ai pp.AddrInfo) pp.AddrInfo {
	keep := make([]ma.Multiaddr, 0, len(ai.Addrs))
	for _, candidate := range ai.Addrs {
		ip, err := ExtractIP(candidate.String())
		if err != nil {
			continue
		}
		if ip.IsLoopback() || ip.IsUnspecified() {
			continue
		}
		keep = append(keep, candidate)
	}
	return pp.AddrInfo{ID: ai.ID, Addrs: keep}
}
// RecommendedHeartbeatInterval is the target period between heartbeat ticks.
// Indexers use this as the DHT Provide refresh interval.
const RecommendedHeartbeatInterval = 60 * time.Second
// discoveryDHT is the DHT instance used for indexer discovery.
// Set by SetDiscoveryDHT once the indexer service initialises its DHT.
// NOTE(review): plain pointer with no lock — assumed to be assigned once
// before concurrent readers start; confirm there is no racing re-assignment.
var discoveryDHT *dht.IpfsDHT
// SetDiscoveryDHT stores the DHT instance used by replenishIndexersFromDHT.
// Called by NewIndexerService once the DHT is ready.
func SetDiscoveryDHT(d *dht.IpfsDHT) {
	discoveryDHT = d
}
// initNodeDHT creates a lightweight DHT client for pure nodes (no IndexerService).
// Uses the seed indexers as bootstrap peers. Called lazily by ConnectToIndexers
// when discoveryDHT is still nil after the initial warm-up delay.
// Errors are logged but not returned; on a client-creation failure the global
// discoveryDHT simply stays nil.
func initNodeDHT(h host.Host, seeds []Entry) {
	logger := oclib.GetLogger()
	bootstrapPeers := []pp.AddrInfo{}
	for _, s := range seeds {
		bootstrapPeers = append(bootstrapPeers, *s.Info)
	}
	d, err := dht.New(context.Background(), h,
		dht.Mode(dht.ModeClient),
		dht.ProtocolPrefix("oc"),
		dht.BootstrapPeers(bootstrapPeers...),
	)
	if err != nil {
		logger.Warn().Err(err).Msg("[dht] node DHT client init failed")
		return
	}
	// Publish the client immediately so replenishIndexersFromDHT can use it
	// even while bootstrap is still in flight.
	SetDiscoveryDHT(d)
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := d.Bootstrap(ctx); err != nil {
		logger.Warn().Err(err).Msg("[dht] node DHT client bootstrap failed")
	}
	// NOTE(review): "ready" is logged even when Bootstrap just failed above —
	// confirm whether a partially-bootstrapped client should be reported ready.
	logger.Info().Msg("[dht] node DHT client ready")
}
// IndexerCID returns the well-known CID under which all indexers advertise.
func IndexerCID() cid.Cid {
	// mh.Sum with a fixed, supported algorithm and default length cannot fail,
	// so the error is deliberately discarded.
	digest, _ := mh.Sum([]byte("/opencloud/indexers"), mh.SHA2_256, -1)
	return cid.NewCidV1(cid.Raw, digest)
}
// DiscoverIndexersFromDHT uses the DHT to find up to count indexers advertising
// under the well-known key. Excludes self and duplicates, resolves addresses
// when the provider record carries none, and drops loopback-only entries.
func DiscoverIndexersFromDHT(h host.Host, d *dht.IpfsDHT, count int) []pp.AddrInfo {
	logger := oclib.GetLogger()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	// Over-fetch so filtering below still leaves enough usable providers.
	providers := d.FindProvidersAsync(ctx, IndexerCID(), count*2)
	known := map[pp.ID]struct{}{}
	var found []pp.AddrInfo
	for provider := range providers {
		if provider.ID == h.ID() {
			continue
		}
		if _, dup := known[provider.ID]; dup {
			continue
		}
		known[provider.ID] = struct{}{}
		if len(provider.Addrs) == 0 {
			resolved, err := d.FindPeer(ctx, provider.ID)
			if err != nil {
				logger.Warn().Str("peer", provider.ID.String()).Msg("[dht] no addrs and FindPeer failed, skipping")
				continue
			}
			provider = resolved
		}
		provider = FilterLoopbackAddrs(provider)
		if len(provider.Addrs) == 0 {
			continue
		}
		found = append(found, provider)
		if len(found) >= count {
			break
		}
	}
	logger.Info().Int("found", len(found)).Msg("[dht] indexer discovery complete")
	return found
}
// SelectByFillRate picks up to want providers using fill-rate weighted random
// selection w(F) = F*(1-F) — peaks at F=0.5, prefers less-loaded indexers.
// Providers with unknown fill rate receive F=0.5 (neutral prior).
// Enforces subnet /24 diversity: at most one indexer per /24.
func SelectByFillRate(providers []pp.AddrInfo, fillRates map[pp.ID]float64, want int) []pp.AddrInfo {
	if len(providers) == 0 || want <= 0 {
		return nil
	}
	type weighted struct {
		ai     pp.AddrInfo
		weight float64
	}
	ws := make([]weighted, 0, len(providers))
	for _, ai := range providers {
		f, ok := fillRates[ai.ID]
		if !ok {
			// Neutral prior — also covers a nil fillRates map.
			f = 0.5
		}
		ws = append(ws, weighted{ai: ai, weight: f * (1 - f)})
	}
	// Shuffle first for fairness among equal-weight peers.
	rand.Shuffle(len(ws), func(i, j int) { ws[i], ws[j] = ws[j], ws[i] })
	// Sort descending by weight (simple insertion sort — small N).
	// The strict `>` comparison keeps the sort stable, preserving the random
	// order among equal weights established by the shuffle above.
	for i := 1; i < len(ws); i++ {
		for j := i; j > 0 && ws[j].weight > ws[j-1].weight; j-- {
			ws[j], ws[j-1] = ws[j-1], ws[j]
		}
	}
	subnets := map[string]struct{}{}
	var selected []pp.AddrInfo
	for _, w := range ws {
		if len(selected) >= want {
			break
		}
		// Candidates with no extractable subnet ("" from subnetOf) are never
		// deduplicated against each other.
		subnet := subnetOf(w.ai)
		if subnet != "" {
			if _, dup := subnets[subnet]; dup {
				continue
			}
			subnets[subnet] = struct{}{}
		}
		selected = append(selected, w.ai)
	}
	return selected
}
// subnetOf returns the /24 subnet string ("a.b.c") for the first address of ai
// that is non-loopback and yields a dotted-quad IP. Returns "" when no address
// qualifies (e.g. addresses whose IP string has fewer than three dot-separated
// parts, such as IPv6).
func subnetOf(ai pp.AddrInfo) string {
	// Fix: the original loop variable was named `ma`, shadowing the multiaddr
	// package alias `ma` imported by this file.
	for _, addr := range ai.Addrs {
		ip, err := ExtractIP(addr.String())
		if err != nil || ip.IsLoopback() {
			continue
		}
		parts := strings.Split(ip.String(), ".")
		if len(parts) >= 3 {
			return parts[0] + "." + parts[1] + "." + parts[2]
		}
	}
	return ""
}
// replenishIndexersFromDHT is called when an indexer heartbeat fails and more
// indexers are needed. Queries the DHT and adds fresh entries to StaticIndexers.
func replenishIndexersFromDHT(h host.Host, need int) {
	if need <= 0 || discoveryDHT == nil {
		return
	}
	logger := oclib.GetLogger()
	logger.Info().Int("need", need).Msg("[dht] replenishing indexer pool from DHT")
	// Over-fetch to leave room for the fill-rate/diversity selection.
	candidates := DiscoverIndexersFromDHT(h, discoveryDHT, need*3)
	picked := SelectByFillRate(candidates, nil, need)
	if len(picked) == 0 {
		logger.Warn().Msg("[dht] no indexers found in DHT for replenishment")
		return
	}
	added := 0
	for _, candidate := range picked {
		key := addrKey(candidate)
		if Indexers.ExistsAddr(key) {
			continue
		}
		entry := candidate
		Indexers.SetAddr(key, &entry)
		added++
	}
	if added == 0 {
		return
	}
	logger.Info().Int("added", added).Msg("[dht] indexers added from DHT")
	Indexers.NudgeIt()
}
// addrKey returns the canonical map key for an AddrInfo.
// The PeerID is used as key so the same peer is never stored twice regardless
// of which of its addresses was seen first. Counterpart of GetIndexer lookups
// keyed by peer ID.
func addrKey(ai pp.AddrInfo) string {
	return ai.ID.String()
}

File diff suppressed because it is too large Load Diff

View File

@@ -3,6 +3,7 @@ package common
import (
"context"
"fmt"
"math/rand"
"net"
"time"
@@ -37,3 +38,31 @@ func ExtractIP(addr string) (net.IP, error) {
}
return ip, nil
}
// GetIndexer looks up a single indexer in the global Indexers directory by its
// address-or-peer-ID key; nil semantics follow Directory.GetAddr.
func GetIndexer(addrOrId string) *pp.AddrInfo {
	return Indexers.GetAddr(addrOrId)
}
// GetIndexersIDs returns the peer IDs of every indexer currently in the pool.
func GetIndexersIDs() []pp.ID {
	return Indexers.GetAddrIDs()
}
// GetIndexersStr returns the string form of every indexer address in the pool.
func GetIndexersStr() []string {
	return Indexers.GetAddrsStr()
}
// GetIndexers returns the AddrInfo of every indexer entry currently in the pool.
func GetIndexers() []*pp.AddrInfo {
	entries := Indexers.GetAddrs()
	infos := make([]*pp.AddrInfo, 0, len(entries))
	for _, entry := range entries {
		infos = append(infos, entry.Info)
	}
	return infos
}
// Shuffle randomly permutes slice in place and returns the same slice for
// call chaining.
func Shuffle[T any](slice []T) []T {
	swap := func(i, j int) {
		slice[i], slice[j] = slice[j], slice[i]
	}
	rand.Shuffle(len(slice), swap)
	return slice
}

View File

@@ -0,0 +1,137 @@
package node
import (
"context"
"encoding/json"
"time"
"oc-discovery/daemons/node/common"
"oc-discovery/daemons/node/indexer"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/peer"
"github.com/libp2p/go-libp2p/core/control"
"github.com/libp2p/go-libp2p/core/host"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
ma "github.com/multiformats/go-multiaddr"
)
// OCConnectionGater enforces two rules on every inbound connection:
//  1. If the peer is known locally and blacklisted → reject.
//  2. If the peer is unknown locally → ask indexers one by one whether it
//     exists in the DHT. Accept as soon as one confirms it; reject if none do
//     (or if no indexers are reachable yet, allow optimistically).
//
// Outbound connections are always allowed — we chose to dial them.
type OCConnectionGater struct {
	host host.Host // local host, used to open verification streams to indexers
}

// newOCConnectionGater wires the gater to the host whose connections it guards.
func newOCConnectionGater(h host.Host) *OCConnectionGater {
	return &OCConnectionGater{host: h}
}
// InterceptPeerDial — outbound dials are always allowed (we initiated them).
func (g *OCConnectionGater) InterceptPeerDial(_ pp.ID) bool { return true }

// InterceptAddrDial — outbound dials are always allowed, per address too.
func (g *OCConnectionGater) InterceptAddrDial(_ pp.ID, _ ma.Multiaddr) bool { return true }

// InterceptAccept — allow at transport level; the remote PeerID is not yet known here.
func (g *OCConnectionGater) InterceptAccept(_ network.ConnMultiaddrs) bool { return true }

// InterceptUpgraded — final gate; always allow, the real decision was already
// made in InterceptSecured once the PeerID became known.
func (g *OCConnectionGater) InterceptUpgraded(_ network.Conn) (bool, control.DisconnectReason) {
	return true, 0
}
// InterceptSecured is called after the cryptographic handshake — the remote
// PeerID is now authenticated. Only inbound connections are verified;
// outbound ones are trusted because this node initiated them.
func (g *OCConnectionGater) InterceptSecured(dir network.Direction, pid pp.ID, _ network.ConnMultiaddrs) bool {
	if dir == network.DirOutbound {
		return true
	}
	logger := oclib.GetLogger()
	// 1. Local DB lookup by PeerID.
	access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
	results := access.Search(&dbs.Filters{
		And: map[string][]dbs.Filter{ // match on the stored libp2p peer_id field
			"peer_id": {{Operator: dbs.EQUAL.String(), Value: pid.String()}},
		},
	}, pid.String(), false)
	for _, item := range results.Data {
		p, ok := item.(*peer.Peer)
		if !ok || p.PeerID != pid.String() {
			continue
		}
		if p.Relation == peer.BLACKLIST {
			logger.Warn().Str("peer", pid.String()).Msg("[gater] rejected blacklisted peer")
			return false
		}
		// Known, not blacklisted — accept without querying indexers.
		return true
	}
	// 2. Unknown locally — verify existence via the indexer network.
	indexers := common.Indexers.GetAddrs()
	if len(indexers) == 0 {
		// No indexers reachable yet — allow optimistically (bootstrap phase).
		logger.Warn().Str("peer", pid.String()).Msg("[gater] no indexers available, allowing unverified inbound")
		return true
	}
	req := indexer.GetValue{PeerID: pid.String()}
	// A single DHT GetValue already traverses the entire DHT network, so asking
	// a second indexer would yield the same result. We only fall through to the
	// next indexer if the current one is unreachable (transport error), not if
	// it returns found=false (that answer is already DHT-wide authoritative).
	for _, ai := range indexers {
		found, reachable := queryIndexerPeerExists(g.host, *ai.Info, req)
		if !reachable {
			continue // indexer down — try next
		}
		if !found {
			logger.Warn().Str("peer", pid.String()).Msg("[gater] peer not found in DHT, rejecting inbound")
		}
		return found // definitive DHT answer
	}
	// All indexers unreachable — allow optimistically rather than blocking indefinitely.
	logger.Warn().Str("peer", pid.String()).Msg("[gater] all indexers unreachable, allowing unverified inbound")
	return true
}
// queryIndexerPeerExists opens a fresh one-shot stream to ai, sends a GetValue
// request, and returns (found, reachable).
//   - reachable=false: transport failure (connect/stream/encode/decode) —
//     the caller should try another indexer.
//   - reachable=true: the indexer answered; found is the DHT-wide
//     authoritative existence result for the queried PeerID.
func queryIndexerPeerExists(h host.Host, ai pp.AddrInfo, req indexer.GetValue) (found, reachable bool) {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()
	// Dial only if no live connection exists; otherwise reuse it.
	if h.Network().Connectedness(ai.ID) != network.Connected {
		if err := h.Connect(ctx, ai); err != nil {
			return false, false
		}
	}
	s, err := h.NewStream(ctx, ai.ID, common.ProtocolGet)
	if err != nil {
		return false, false
	}
	defer s.Close()
	// Bound the whole request/response exchange, not just the dial.
	s.SetDeadline(time.Now().Add(3 * time.Second))
	if err := json.NewEncoder(s).Encode(req); err != nil {
		return false, false
	}
	var resp indexer.GetResponse
	if err := json.NewDecoder(s).Decode(&resp); err != nil {
		return false, false
	}
	return resp.Found, true
}

View File

@@ -0,0 +1,254 @@
package indexer
import (
"errors"
"sync"
"time"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
)
// ── defaults ──────────────────────────────────────────────────────────────────

// Fallback values used when the corresponding conf fields are unset (<= 0);
// see cfgOr.
const (
	defaultMaxConnPerWindow = 20               // new inbound connections per sliding window
	defaultConnWindowSecs   = 30               // connection window length, in seconds
	defaultMaxHBPerMinute   = 5                // heartbeats per behaviour window
	defaultMaxPublishPerMin = 10               // publish requests per behaviour window
	defaultMaxGetPerMin     = 50               // get requests per behaviour window
	strikeThreshold         = 3                // strikes before a ban is applied
	banDuration             = 10 * time.Minute // length of one ban period
	behaviorWindowDur       = 60 * time.Second // sliding window for per-node rate limits
)
// cfgOr returns v when it is a positive (explicitly configured) value,
// otherwise the compiled-in default def.
func cfgOr(v, def int) int {
	if v <= 0 {
		return def
	}
	return v
}
// ── ConnectionRateGuard ───────────────────────────────────────────────────────

// ConnectionRateGuard limits the number of NEW incoming connections accepted
// within a sliding time window. It protects public indexers against coordinated
// registration floods (Sybil bursts).
type ConnectionRateGuard struct {
	mu          sync.Mutex
	window      []time.Time   // accept timestamps inside the current window (ascending)
	maxInWindow int           // maximum accepts per window
	windowDur   time.Duration // window length
}

// newConnectionRateGuard builds a guard with limits from the config,
// falling back to the package defaults for unset (<= 0) values.
func newConnectionRateGuard() *ConnectionRateGuard {
	cfg := conf.GetConfig()
	return &ConnectionRateGuard{
		maxInWindow: cfgOr(cfg.MaxConnPerWindow, defaultMaxConnPerWindow),
		windowDur:   time.Duration(cfgOr(cfg.ConnWindowSecs, defaultConnWindowSecs)) * time.Second,
	}
}
// Allow reports whether one more inbound connection fits in the sliding
// window. Expired timestamps are pruned on every call, so the internal slice
// never holds more than maxInWindow live entries.
func (g *ConnectionRateGuard) Allow() bool {
	g.mu.Lock()
	defer g.mu.Unlock()

	now := time.Now()
	oldest := now.Add(-g.windowDur)

	// Drop timestamps that fell out of the window (slice is append-ordered).
	keepFrom := 0
	for keepFrom < len(g.window) && g.window[keepFrom].Before(oldest) {
		keepFrom++
	}
	g.window = g.window[keepFrom:]

	if len(g.window) >= g.maxInWindow {
		return false // window saturated — refuse this connection
	}
	g.window = append(g.window, now)
	return true
}
// ── per-node state ────────────────────────────────────────────────────────────

// nodeBehavior holds the mutable compliance state for one remote peer.
// All fields are guarded by mu; callers lock before touching any of them.
type nodeBehavior struct {
	mu          sync.Mutex
	knownDID    string      // first DID seen for this peer; later changes are treated as spoofing
	hbTimes     []time.Time // heartbeat timestamps within behaviorWindowDur
	pubTimes    []time.Time // publish timestamps within behaviorWindowDur
	getTimes    []time.Time // get timestamps within behaviorWindowDur
	strikes     int         // accumulated misbehaviour points
	bannedUntil time.Time   // zero value means never banned
}

// isBanned reports whether the peer is inside an active ban period.
// Caller must hold nb.mu.
func (nb *nodeBehavior) isBanned() bool {
	return time.Now().Before(nb.bannedUntil)
}

// strike adds n misbehaviour points and starts a ban once the total reaches
// strikeThreshold. Caller must hold nb.mu.
func (nb *nodeBehavior) strike(n int) {
	nb.strikes += n
	if nb.strikes >= strikeThreshold {
		nb.bannedUntil = time.Now().Add(banDuration)
	}
}
func pruneWindow(ts []time.Time, dur time.Duration) []time.Time {
cutoff := time.Now().Add(-dur)
i := 0
for i < len(ts) && ts[i].Before(cutoff) {
i++
}
return ts[i:]
}
// recordInWindow registers one event in the sliding window *ts.
// Returns false — and adds one strike — when the window already holds max
// events; otherwise appends time.Now() and returns true.
// Caller must hold nb.mu.
func (nb *nodeBehavior) recordInWindow(ts *[]time.Time, max int) bool {
	window := pruneWindow(*ts, behaviorWindowDur)
	if len(window) >= max {
		*ts = window
		nb.strike(1)
		return false
	}
	*ts = append(window, time.Now())
	return true
}
// ── NodeBehaviorTracker ───────────────────────────────────────────────────────

// NodeBehaviorTracker is the indexer-side per-node compliance monitor.
// It is entirely local: no state is shared with other indexers.
type NodeBehaviorTracker struct {
	mu     sync.RWMutex
	nodes  map[pp.ID]*nodeBehavior // per-peer state, created lazily by get()
	maxHB  int                     // heartbeat quota per behaviour window
	maxPub int                     // publish quota per behaviour window
	maxGet int                     // get quota per behaviour window
}

// newNodeBehaviorTracker builds a tracker with quotas from the config,
// falling back to the package defaults for unset (<= 0) values.
func newNodeBehaviorTracker() *NodeBehaviorTracker {
	cfg := conf.GetConfig()
	return &NodeBehaviorTracker{
		nodes:  make(map[pp.ID]*nodeBehavior),
		maxHB:  cfgOr(cfg.MaxHBPerMinute, defaultMaxHBPerMinute),
		maxPub: cfgOr(cfg.MaxPublishPerMinute, defaultMaxPublishPerMin),
		maxGet: cfgOr(cfg.MaxGetPerMinute, defaultMaxGetPerMin),
	}
}
// get returns the behavior entry for pid, creating it on first sight.
func (t *NodeBehaviorTracker) get(pid pp.ID) *nodeBehavior {
	// Fast path: shared read lock for the common "already known" case.
	t.mu.RLock()
	nb := t.nodes[pid]
	t.mu.RUnlock()
	if nb != nil {
		return nb
	}
	// Slow path: take the write lock and re-check, since another goroutine
	// may have inserted the entry between RUnlock and Lock above.
	t.mu.Lock()
	defer t.mu.Unlock()
	if nb = t.nodes[pid]; nb == nil {
		nb = &nodeBehavior{}
		t.nodes[pid] = nb
	}
	return nb
}
// IsBanned reports whether pid is currently inside an active ban period.
func (t *NodeBehaviorTracker) IsBanned(pid pp.ID) bool {
	entry := t.get(pid)
	entry.mu.Lock()
	banned := entry.isBanned()
	entry.mu.Unlock()
	return banned
}
// RecordHeartbeat registers one heartbeat from pid and enforces the cadence
// quota. Returns a non-nil error when the peer is banned or flooding.
func (t *NodeBehaviorTracker) RecordHeartbeat(pid pp.ID) error {
	entry := t.get(pid)
	entry.mu.Lock()
	defer entry.mu.Unlock()
	switch {
	case entry.isBanned():
		return errors.New("peer is banned")
	case !entry.recordInWindow(&entry.hbTimes, t.maxHB):
		return errors.New("heartbeat flood detected")
	default:
		return nil
	}
}
// CheckIdentity pins the first DID seen for pid and rejects any later change.
// An empty did is ignored (nothing to verify). A mismatch costs 2 strikes
// because identity swapping is a strong spoofing signal.
func (t *NodeBehaviorTracker) CheckIdentity(pid pp.ID, did string) error {
	if did == "" {
		return nil
	}
	entry := t.get(pid)
	entry.mu.Lock()
	defer entry.mu.Unlock()
	switch entry.knownDID {
	case "":
		entry.knownDID = did // first sighting: pin it
		return nil
	case did:
		return nil // unchanged — fine
	default:
		entry.strike(2) // identity change is severe
		return errors.New("DID mismatch for peer " + pid.String())
	}
}
// RecordBadSignature penalises pid for a failed cryptographic verification.
// Worth 2 strikes, so one more offence of any kind triggers a ban.
func (t *NodeBehaviorTracker) RecordBadSignature(pid pp.ID) {
	entry := t.get(pid)
	entry.mu.Lock()
	entry.strike(2)
	entry.mu.Unlock()
}
// RecordPublish registers one publish request from pid and enforces the
// publish quota. A non-nil error means the request must be refused.
func (t *NodeBehaviorTracker) RecordPublish(pid pp.ID) error {
	entry := t.get(pid)
	entry.mu.Lock()
	defer entry.mu.Unlock()
	if entry.isBanned() {
		return errors.New("peer is banned")
	}
	if ok := entry.recordInWindow(&entry.pubTimes, t.maxPub); !ok {
		return errors.New("publish volume exceeded")
	}
	return nil
}
// RecordGet registers one get request from pid and enforces the get quota,
// protecting against abnormal DHT enumeration rates.
func (t *NodeBehaviorTracker) RecordGet(pid pp.ID) error {
	entry := t.get(pid)
	entry.mu.Lock()
	defer entry.mu.Unlock()
	if entry.isBanned() {
		return errors.New("peer is banned")
	}
	if ok := entry.recordInWindow(&entry.getTimes, t.maxGet); !ok {
		return errors.New("get volume exceeded")
	}
	return nil
}
// Cleanup removes the behavior entry for a peer if it is not currently banned.
// Called when the peer is evicted from StreamRecords by the GC; banned peers
// are kept so the ban outlives the eviction.
func (t *NodeBehaviorTracker) Cleanup(pid pp.ID) {
	t.mu.RLock()
	nb := t.nodes[pid]
	t.mu.RUnlock()
	if nb == nil {
		return
	}
	nb.mu.Lock()
	banned := nb.isBanned()
	nb.mu.Unlock()
	// NOTE(review): between the ban check above and the delete below, another
	// goroutine could ban this peer; the entry (and the ban) would then be
	// dropped anyway. Presumably acceptable for a GC path — confirm.
	if !banned {
		t.mu.Lock()
		delete(t.nodes, pid)
		t.mu.Unlock()
	}
}

View File

@@ -7,7 +7,7 @@ import (
"errors"
"fmt"
"io"
"oc-discovery/conf"
"math/rand"
"oc-discovery/daemons/node/common"
"strings"
"time"
@@ -18,7 +18,7 @@ import (
"cloud.o-forge.io/core/oc-lib/tools"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/network"
"github.com/libp2p/go-libp2p/core/peer"
lpp "github.com/libp2p/go-libp2p/core/peer"
)
type PeerRecordPayload struct {
@@ -118,6 +118,7 @@ func (ix *IndexerService) genPIDKey(peerID string) string {
func (ix *IndexerService) initNodeHandler() {
logger := oclib.GetLogger()
logger.Info().Msg("Init Node Handler")
// Each heartbeat from a node carries a freshly signed PeerRecord.
// Republish it to the DHT so the record never expires as long as the node
// is alive — no separate publish stream needed from the node side.
@@ -177,49 +178,48 @@ func (ix *IndexerService) initNodeHandler() {
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat)
ix.Host.SetStreamHandler(common.ProtocolPublish, ix.handleNodePublish)
ix.Host.SetStreamHandler(common.ProtocolGet, ix.handleNodeGet)
ix.Host.SetStreamHandler(common.ProtocolIndexerGetNatives, ix.handleGetNatives)
ix.Host.SetStreamHandler(common.ProtocolIndexerConsensus, ix.handleIndexerConsensus)
ix.Host.SetStreamHandler(common.ProtocolIndexerCandidates, ix.handleCandidateRequest)
ix.initSearchHandlers()
}
// handleIndexerConsensus implements Phase 2 liveness voting (ProtocolIndexerConsensus).
// The caller sends a list of candidate multiaddrs; this indexer replies with the
// subset it considers currently alive (recent heartbeat in StreamRecords).
func (ix *IndexerService) handleIndexerConsensus(stream network.Stream) {
defer stream.Reset()
var req common.IndexerConsensusRequest
if err := json.NewDecoder(stream).Decode(&req); err != nil {
// handleCandidateRequest responds to a node's consensus candidate request.
// Returns a random sample of indexers from the local DHT cache.
func (ix *IndexerService) handleCandidateRequest(s network.Stream) {
defer s.Close()
s.SetDeadline(time.Now().Add(5 * time.Second))
var req common.IndexerCandidatesRequest
if err := json.NewDecoder(s).Decode(&req); err != nil {
return
}
ix.StreamMU.RLock()
streams := ix.StreamRecords[common.ProtocolHeartbeat]
ix.StreamMU.RUnlock()
alive := make([]string, 0, len(req.Candidates))
for _, addr := range req.Candidates {
ad, err := peer.AddrInfoFromString(addr)
if err != nil {
continue
}
ix.StreamMU.RLock()
rec, ok := streams[ad.ID]
ix.StreamMU.RUnlock()
if !ok || rec.HeartbeatStream == nil || rec.HeartbeatStream.UptimeTracker == nil {
continue
}
// D: consider alive only if recent heartbeat AND score above minimum quality bar.
if time.Since(rec.HeartbeatStream.UptimeTracker.LastSeen) <= 2*common.RecommendedHeartbeatInterval &&
rec.LastScore >= 30.0 {
alive = append(alive, addr)
}
if req.Count <= 0 || req.Count > 10 {
req.Count = 3
}
json.NewEncoder(stream).Encode(common.IndexerConsensusResponse{Alive: alive})
ix.dhtCacheMu.RLock()
cache := make([]dhtCacheEntry, len(ix.dhtCache))
copy(cache, ix.dhtCache)
ix.dhtCacheMu.RUnlock()
// Shuffle for randomness: each voter offers a different subset.
rand.Shuffle(len(cache), func(i, j int) { cache[i], cache[j] = cache[j], cache[i] })
candidates := make([]lpp.AddrInfo, 0, req.Count)
for _, e := range cache {
if len(candidates) >= req.Count {
break
}
candidates = append(candidates, e.AI)
}
json.NewEncoder(s).Encode(common.IndexerCandidatesResponse{Candidates: candidates})
}
func (ix *IndexerService) handleNodePublish(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
remotePeer := s.Conn().RemotePeer()
if err := ix.behavior.RecordPublish(remotePeer); err != nil {
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("publish refused")
s.Reset()
return
}
for {
var rec PeerRecord
if err := json.NewDecoder(s).Decode(&rec); err != nil {
@@ -233,14 +233,20 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
continue
}
if _, err := rec.Verify(); err != nil {
logger.Err(err)
ix.behavior.RecordBadSignature(remotePeer)
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("bad signature on publish")
return
}
if err := ix.behavior.CheckIdentity(remotePeer, rec.DID); err != nil {
logger.Warn().Err(err).Msg("identity mismatch on publish")
s.Reset()
return
}
if rec.PeerID == "" || rec.ExpiryDate.Before(time.Now().UTC()) {
logger.Err(errors.New(rec.PeerID + " is expired."))
return
}
pid, err := peer.Decode(rec.PeerID)
pid, err := lpp.Decode(rec.PeerID)
if err != nil {
return
}
@@ -248,7 +254,7 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
ix.StreamMU.Lock()
defer ix.StreamMU.Unlock()
if ix.StreamRecords[common.ProtocolHeartbeat] == nil {
ix.StreamRecords[common.ProtocolHeartbeat] = map[peer.ID]*common.StreamRecord[PeerRecord]{}
ix.StreamRecords[common.ProtocolHeartbeat] = map[lpp.ID]*common.StreamRecord[PeerRecord]{}
}
streams := ix.StreamRecords[common.ProtocolHeartbeat]
if srec, ok := streams[pid]; ok {
@@ -297,6 +303,12 @@ func (ix *IndexerService) handleNodePublish(s network.Stream) {
func (ix *IndexerService) handleNodeGet(s network.Stream) {
defer s.Close()
logger := oclib.GetLogger()
remotePeer := s.Conn().RemotePeer()
if err := ix.behavior.RecordGet(remotePeer); err != nil {
logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("get refused")
s.Reset()
return
}
for {
var req GetValue
if err := json.NewDecoder(s).Decode(&req); err != nil {
@@ -367,43 +379,3 @@ func (ix *IndexerService) handleNodeGet(s network.Stream) {
}
}
// handleGetNatives returns this indexer's configured native addresses,
// excluding any in the request's Exclude list. One response per stream:
// the loop only exists to skip transient decode errors before answering once.
func (ix *IndexerService) handleGetNatives(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	for {
		var req common.GetIndexerNativesRequest
		if err := json.NewDecoder(s).Decode(&req); err != nil {
			logger.Err(err).Msg("indexer get natives: decode")
			// Fatal stream conditions end the handler; anything else retries.
			if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
				strings.Contains(err.Error(), "reset") ||
				strings.Contains(err.Error(), "closed") ||
				strings.Contains(err.Error(), "too many connections") {
				return
			}
			continue
		}
		// Addresses the caller already knows and wants filtered out.
		excludeSet := make(map[string]struct{}, len(req.Exclude))
		for _, e := range req.Exclude {
			excludeSet[e] = struct{}{}
		}
		resp := common.GetIndexerNativesResponse{}
		// NativeIndexerAddresses is a comma-separated address list in conf.
		for _, addr := range strings.Split(conf.GetConfig().NativeIndexerAddresses, ",") {
			addr = strings.TrimSpace(addr)
			if addr == "" {
				continue
			}
			if _, excluded := excludeSet[addr]; !excluded {
				resp.Natives = append(resp.Natives, addr)
			}
		}
		if err := json.NewEncoder(s).Encode(resp); err != nil {
			logger.Err(err).Msg("indexer get natives: encode response")
		}
		break
	}
}

View File

@@ -3,7 +3,6 @@ package indexer
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync"
"time"
@@ -43,19 +42,69 @@ type NameIndexEvent struct {
// nameIndexState holds the local in-memory name index and the sender-side
// deduplication tracker.
//
// Search strategy: trigram inverted index.
// - byName: lowercased name → peerID → DID (for delete and exact resolution)
// - byPeer: peerID → lowercased name (to recompute trigrams on delete)
// - trigrams: 3-char substring → set of peerIDs (for O(1) substring lookup)
//
// For needles shorter than 3 chars the trigram index cannot help; a linear
// scan of byName is used as fallback (rare and fast enough at small N).
type nameIndexState struct {
// index: name → peerID → DID, built from events received from all indexers.
index map[string]map[string]string
indexMu sync.RWMutex
byName map[string]map[string]string // name → peerID → DID
byPeer map[string]string // peerID → name
trigrams map[string]map[string]struct{} // trigram → peerID set
indexMu sync.RWMutex
// emitted tracks the last emission time for each (action, name, peerID) key
// to suppress duplicates within nameIndexDedupWindow.
// emitted deduplicates GossipSub emissions within nameIndexDedupWindow.
// Purged periodically to prevent unbounded growth.
emitted map[string]time.Time
emittedMu sync.Mutex
}
// trigramsOf returns every overlapping 3-byte substring of s (expected to be
// lowercased already). Strings shorter than 3 bytes yield themselves as the
// single token so very short names still get an index bucket.
func trigramsOf(s string) []string {
	if len(s) < 3 {
		return []string{s}
	}
	tokens := make([]string, len(s)-2)
	for i := range tokens {
		tokens[i] = s[i : i+3]
	}
	return tokens
}
// addTrigrams registers peerID under every trigram bucket derived from name.
// Missing buckets are created lazily.
func (s *nameIndexState) addTrigrams(name, peerID string) {
	for _, token := range trigramsOf(name) {
		bucket := s.trigrams[token]
		if bucket == nil {
			bucket = map[string]struct{}{}
			s.trigrams[token] = bucket
		}
		bucket[peerID] = struct{}{}
	}
}
// removeTrigrams drops peerID from every trigram bucket of name, deleting
// buckets that become empty so the index does not accumulate dead keys.
func (s *nameIndexState) removeTrigrams(name, peerID string) {
	for _, token := range trigramsOf(name) {
		bucket := s.trigrams[token]
		if bucket == nil {
			continue
		}
		delete(bucket, peerID)
		if len(bucket) == 0 {
			delete(s.trigrams, token)
		}
	}
}
// shouldEmit returns true if the (action, name, peerID) tuple has not been
// emitted within nameIndexDedupWindow, updating the tracker if so.
//
// On DELETE: the ADD entry for the same peer is immediately removed — the peer
// is gone, keeping it would cause the map to grow with departed peers forever.
// The DELETE entry itself is kept for the dedup window to absorb duplicate
// delete events, then cleaned by the purgeEmitted ticker.
func (s *nameIndexState) shouldEmit(action NameIndexAction, name, peerID string) bool {
key := string(action) + ":" + name + ":" + peerID
s.emittedMu.Lock()
@@ -64,9 +113,27 @@ func (s *nameIndexState) shouldEmit(action NameIndexAction, name, peerID string)
return false
}
s.emitted[key] = time.Now()
if action == NameIndexDelete {
// Peer is leaving: drop its ADD entry — no longer needed.
delete(s.emitted, string(NameIndexAdd)+":"+name+":"+peerID)
}
return true
}
// purgeEmitted drops dedup entries that are at least nameIndexDedupWindow old.
// ADD entries are removed eagerly on DELETE, so this mainly trims the
// short-lived DELETE stragglers.
func (s *nameIndexState) purgeEmitted() {
	s.emittedMu.Lock()
	defer s.emittedMu.Unlock()
	now := time.Now()
	for key, stamp := range s.emitted {
		if now.Sub(stamp) >= nameIndexDedupWindow {
			delete(s.emitted, key)
		}
	}
}
// onEvent applies a received NameIndexEvent to the local index.
// "add" inserts/updates the mapping; "delete" removes it.
// Operations are idempotent — duplicate events from multiple indexers are harmless.
@@ -74,19 +141,40 @@ func (s *nameIndexState) onEvent(evt NameIndexEvent) {
if evt.Name == "" || evt.PeerID == "" {
return
}
nameLow := strings.ToLower(evt.Name)
s.indexMu.Lock()
defer s.indexMu.Unlock()
switch evt.Action {
case NameIndexAdd:
if s.index[evt.Name] == nil {
s.index[evt.Name] = map[string]string{}
// If the peer previously had a different name, clean up old trigrams.
if old, ok := s.byPeer[evt.PeerID]; ok && old != nameLow {
s.removeTrigrams(old, evt.PeerID)
if s.byName[old] != nil {
delete(s.byName[old], evt.PeerID)
if len(s.byName[old]) == 0 {
delete(s.byName, old)
}
}
}
s.index[evt.Name][evt.PeerID] = evt.DID
if s.byName[nameLow] == nil {
s.byName[nameLow] = map[string]string{}
}
s.byName[nameLow][evt.PeerID] = evt.DID
s.byPeer[evt.PeerID] = nameLow
s.addTrigrams(nameLow, evt.PeerID)
case NameIndexDelete:
if s.index[evt.Name] != nil {
delete(s.index[evt.Name], evt.PeerID)
if len(s.index[evt.Name]) == 0 {
delete(s.index, evt.Name)
// Use stored name so trigrams match exactly what was indexed.
name := nameLow
if stored, ok := s.byPeer[evt.PeerID]; ok {
name = stored
}
s.removeTrigrams(name, evt.PeerID)
delete(s.byPeer, evt.PeerID)
if s.byName[name] != nil {
delete(s.byName[name], evt.PeerID)
if len(s.byName[name]) == 0 {
delete(s.byName, name)
}
}
}
@@ -96,10 +184,22 @@ func (s *nameIndexState) onEvent(evt NameIndexEvent) {
// Must be called after ix.PS is ready.
func (ix *IndexerService) initNameIndex(ps *pubsub.PubSub) {
logger := oclib.GetLogger()
ix.nameIndex = &nameIndexState{
index: map[string]map[string]string{},
emitted: map[string]time.Time{},
state := &nameIndexState{
byName: map[string]map[string]string{},
byPeer: map[string]string{},
trigrams: map[string]map[string]struct{}{},
emitted: map[string]time.Time{},
}
ix.nameIndex = state
// Periodically purge the emitted dedup map so it doesn't grow forever.
go func() {
t := time.NewTicker(nameIndexDedupWindow)
defer t.Stop()
for range t.C {
state.purgeEmitted()
}
}()
ps.RegisterTopicValidator(TopicNameIndex, func(_ context.Context, _ pp.ID, _ *pubsub.Message) bool {
return true
@@ -149,23 +249,72 @@ func (ix *IndexerService) publishNameEvent(action NameIndexAction, name, peerID,
// LookupNameIndex searches the distributed name index for peers whose name
// contains needle (case-insensitive). Returns peerID → DID for matched peers.
// Returns nil if the name index is not initialised (e.g. native indexers).
// Returns nil if the name index is not initialised.
//
// Algorithm:
// - needle ≥ 3 chars: trigram intersection → O(|candidates|) verify pass.
// The trigram index immediately narrows the candidate set; false positives
// are eliminated by the full-string contains check.
// - needle < 3 chars: linear scan of byName (rare, still fast at small N).
func (ix *IndexerService) LookupNameIndex(needle string) map[string]string {
if ix.nameIndex == nil {
return nil
}
result := map[string]string{}
needleLow := strings.ToLower(needle)
result := map[string]string{}
ix.nameIndex.indexMu.RLock()
defer ix.nameIndex.indexMu.RUnlock()
for name, peers := range ix.nameIndex.index {
fmt.Println(strings.Contains(strings.ToLower(name), needleLow), needleLow, strings.ToLower(name))
if strings.Contains(strings.ToLower(name), needleLow) {
for peerID, did := range peers {
result[peerID] = did
if len(needleLow) < 3 {
// Short needle: linear scan fallback.
for name, peers := range ix.nameIndex.byName {
if strings.Contains(name, needleLow) {
for peerID, did := range peers {
result[peerID] = did
}
}
}
return result
}
// Trigram intersection: start with the first trigram's set, then
// progressively intersect with each subsequent trigram's set.
tgs := trigramsOf(needleLow)
var candidates map[string]struct{}
for _, tg := range tgs {
set := ix.nameIndex.trigrams[tg]
if len(set) == 0 {
return result // any empty trigram set → no possible match
}
if candidates == nil {
candidates = make(map[string]struct{}, len(set))
for pid := range set {
candidates[pid] = struct{}{}
}
} else {
for pid := range candidates {
if _, ok := set[pid]; !ok {
delete(candidates, pid)
}
}
}
if len(candidates) == 0 {
return result
}
}
// Full-string verification pass: trigrams admit false positives
// (e.g. "abc" and "bca" share the trigram "bc_" with a rotated name).
for peerID := range candidates {
name := ix.nameIndex.byPeer[peerID]
if strings.Contains(name, needleLow) {
did := ""
if m := ix.nameIndex.byName[name]; m != nil {
did = m[peerID]
}
result[peerID] = did
}
}
fmt.Println("RESULT", result)
return result
}

View File

@@ -1,783 +0,0 @@
package indexer
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"math/rand"
"slices"
"strings"
"sync"
"time"
"oc-discovery/daemons/node/common"
oclib "cloud.o-forge.io/core/oc-lib"
pubsub "github.com/libp2p/go-libp2p-pubsub"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
const (
	// IndexerTTL is the lifetime of a live-indexer cache entry. Set to 50% above
	// the recommended 60s heartbeat interval so a single delayed renewal does not
	// evict a healthy indexer from the native's cache.
	IndexerTTL = 90 * time.Second
	// offloadInterval is how often the native checks if it can release
	// responsible peers back to regular indexers.
	offloadInterval = 30 * time.Second
	// dhtRefreshInterval is how often the background goroutine queries the DHT for
	// known-but-expired indexer entries (written by neighbouring natives).
	dhtRefreshInterval = 30 * time.Second
	// maxFallbackPeers caps how many peers the native will accept in self-delegation
	// mode. Beyond this limit the native refuses to act as a fallback indexer so it
	// is not overwhelmed during prolonged indexer outages.
	maxFallbackPeers = 50
)
// liveIndexerEntry tracks a registered indexer in the native's in-memory cache and DHT.
// PubKey and Signature are forwarded from the IndexerRegistration so the DHT validator
// can verify that the entry was produced by the peer owning the declared PeerID.
// FillRate is the fraction of capacity used (0=empty, 1=full) at last registration.
type liveIndexerEntry struct {
	PeerID       string    `json:"peer_id"`
	Addr         string    `json:"addr"`
	ExpiresAt    time.Time `json:"expires_at"`
	RegTimestamp int64     `json:"reg_ts,omitempty"` // Timestamp from the original IndexerRegistration (signed payload part)
	PubKey       []byte    `json:"pub_key,omitempty"`
	Signature    []byte    `json:"sig,omitempty"`
	FillRate     float64   `json:"fill_rate,omitempty"`
}
// NativeState holds runtime state specific to native indexer operation.
type NativeState struct {
	liveIndexers   map[string]*liveIndexerEntry // keyed by PeerID, local cache with TTL
	liveIndexersMu sync.RWMutex
	// responsiblePeers lists peers for which the native acts as fallback indexer.
	responsiblePeers map[pp.ID]struct{}
	responsibleMu    sync.RWMutex
	// knownPeerIDs accumulates all indexer PeerIDs ever seen (local stream or gossip).
	// Used by refreshIndexersFromDHT to re-hydrate expired entries from the shared DHT,
	// including entries written by other natives.
	knownPeerIDs map[string]string
	knownMu      sync.RWMutex
	// cancel stops background goroutines (runOffloadLoop, refreshIndexersFromDHT)
	// when the native shuts down.
	cancel context.CancelFunc
}
// newNativeState returns an empty NativeState wired to the given cancel
// function, with all internal maps initialised and ready for use.
func newNativeState(cancel context.CancelFunc) *NativeState {
	state := &NativeState{cancel: cancel}
	state.liveIndexers = make(map[string]*liveIndexerEntry)
	state.responsiblePeers = make(map[pp.ID]struct{})
	state.knownPeerIDs = make(map[string]string)
	return state
}
// IndexerRecordValidator validates indexer DHT entries under the "indexer"
// namespace (see Validate/Select below). Stateless, safe to copy.
type IndexerRecordValidator struct{}
// Validate checks a candidate "indexer" namespace record: it must decode,
// carry an address, not be expired, and — when a signature is attached —
// verify against the embedded public key.
func (v IndexerRecordValidator) Validate(_ string, value []byte) error {
	var entry liveIndexerEntry
	if err := json.Unmarshal(value, &entry); err != nil {
		return err
	}
	if entry.Addr == "" {
		return errors.New("missing addr")
	}
	if entry.ExpiresAt.Before(time.Now().UTC()) {
		return errors.New("expired indexer record")
	}
	// Unsigned entries are tolerated; signed ones must verify, which rejects
	// records forged by a compromised native that does not control the
	// declared PeerID.
	if len(entry.Signature) == 0 || len(entry.PubKey) == 0 {
		return nil
	}
	pub, err := crypto.UnmarshalPublicKey(entry.PubKey)
	if err != nil {
		return fmt.Errorf("indexer entry: invalid public key: %w", err)
	}
	payload := []byte(fmt.Sprintf("%s|%s|%d", entry.PeerID, entry.Addr, entry.RegTimestamp))
	if ok, err := pub.Verify(payload, entry.Signature); err != nil || !ok {
		return errors.New("indexer entry: invalid signature")
	}
	return nil
}
// Select picks the record with the latest ExpiresAt among values; entries
// that fail to decode are skipped. Defaults to index 0 when nothing parses.
func (v IndexerRecordValidator) Select(_ string, values [][]byte) (int, error) {
	best, bestExpiry := 0, time.Time{}
	for i, raw := range values {
		var entry liveIndexerEntry
		if json.Unmarshal(raw, &entry) != nil {
			continue
		}
		if entry.ExpiresAt.After(bestExpiry) {
			best, bestExpiry = i, entry.ExpiresAt
		}
	}
	return best, nil
}
// InitNative registers native-specific stream handlers and starts background loops.
// Must be called after the DHT is initialized.
func (ix *IndexerService) InitNative() {
	// ctx/cancel bound the lifetime of the background goroutines below; cancel
	// is kept in NativeState so shutdown can stop them.
	ctx, cancel := context.WithCancel(context.Background())
	ix.Native = newNativeState(cancel)
	ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat) // specific heartbeat for Indexer.
	ix.Host.SetStreamHandler(common.ProtocolNativeSubscription, ix.handleNativeSubscription)
	ix.Host.SetStreamHandler(common.ProtocolNativeUnsubscribe, ix.handleNativeUnsubscribe)
	ix.Host.SetStreamHandler(common.ProtocolNativeGetIndexers, ix.handleNativeGetIndexers)
	ix.Host.SetStreamHandler(common.ProtocolNativeConsensus, ix.handleNativeConsensus)
	ix.Host.SetStreamHandler(common.ProtocolNativeGetPeers, ix.handleNativeGetPeers)
	ix.Host.SetStreamHandler(common.ProtocolIndexerGetNatives, ix.handleGetNatives)
	ix.subscribeIndexerRegistry()
	// Ensure long connections to other configured natives (native-to-native mesh).
	common.EnsureNativePeers(ix.Host)
	go ix.runOffloadLoop(ctx)
	go ix.refreshIndexersFromDHT(ctx)
}
// subscribeIndexerRegistry joins the PubSub topic used by natives to gossip newly
// registered indexer PeerIDs to one another, enabling cross-native DHT discovery.
// The registered topic validator enforces: well-formed signed registration,
// valid multiaddr, valid self-signature, and a sender that is either this host
// or a configured static native.
func (ix *IndexerService) subscribeIndexerRegistry() {
	logger := oclib.GetLogger()
	ix.PS.RegisterTopicValidator(common.TopicIndexerRegistry, func(_ context.Context, _ pp.ID, msg *pubsub.Message) bool {
		// Parse as a signed IndexerRegistration.
		var reg common.IndexerRegistration
		if err := json.Unmarshal(msg.Data, &reg); err != nil {
			return false
		}
		if reg.Addr == "" {
			return false
		}
		if _, err := pp.AddrInfoFromString(reg.Addr); err != nil {
			return false
		}
		// Verify the self-signature when present (rejects forged gossip from a
		// compromised native that does not control the announced PeerID).
		if ok, _ := reg.Verify(); !ok {
			return false
		}
		// Accept only messages from known native peers or from this host itself.
		// This prevents external PSK participants from injecting registry entries.
		from := msg.GetFrom()
		if from == ix.Host.ID() {
			return true
		}
		common.StreamNativeMu.RLock()
		_, knownNative := common.StaticNatives[from.String()]
		if !knownNative {
			// The sender may be keyed under a different address string; fall
			// back to comparing PeerIDs across all static natives.
			for _, ad := range common.StaticNatives {
				if ad.ID == from {
					knownNative = true
					break
				}
			}
		}
		common.StreamNativeMu.RUnlock()
		return knownNative
	})
	topic, err := ix.PS.Join(common.TopicIndexerRegistry)
	if err != nil {
		logger.Err(err).Msg("native: failed to join indexer registry topic")
		return
	}
	sub, err := topic.Subscribe()
	if err != nil {
		logger.Err(err).Msg("native: failed to subscribe to indexer registry topic")
		return
	}
	ix.PubsubMu.Lock()
	ix.LongLivedPubSubs[common.TopicIndexerRegistry] = topic
	ix.PubsubMu.Unlock()
	// Consumer goroutine: runs until sub.Next fails (e.g. subscription closed).
	go func() {
		for {
			msg, err := sub.Next(context.Background())
			if err != nil {
				return
			}
			// The gossip payload is a JSON-encoded IndexerRegistration (signed).
			var gossipReg common.IndexerRegistration
			if jsonErr := json.Unmarshal(msg.Data, &gossipReg); jsonErr != nil {
				continue
			}
			if gossipReg.Addr == "" || gossipReg.PeerID == "" {
				continue
			}
			// A neighbouring native registered this PeerID; add to known set for DHT refresh.
			ix.Native.knownMu.Lock()
			ix.Native.knownPeerIDs[gossipReg.PeerID] = gossipReg.Addr
			ix.Native.knownMu.Unlock()
		}
	}()
}
// handleNativeSubscription stores an indexer's alive registration in the local cache
// immediately, then persists it to the DHT asynchronously.
// The stream is temporary: indexer sends one IndexerRegistration and closes.
//
// Pipeline per registration:
//  1. decode, sanity-check the address, derive the PeerID if omitted,
//  2. verify the self-signature,
//  3. admit into liveIndexers/knownPeerIDs with a fresh TTL,
//  4. background: dial-validate the declared address (evict on failure),
//  5. gossip the signed registration to neighbouring natives,
//  6. background: persist the entry in the DHT with bounded retry.
func (ix *IndexerService) handleNativeSubscription(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	logger.Info().Msg("Subscription")
	for {
		var reg common.IndexerRegistration
		if err := json.NewDecoder(s).Decode(&reg); err != nil {
			logger.Err(err).Msg("native subscription: decode")
			// Terminal stream conditions end the handler; transient decode noise retries.
			if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
				strings.Contains(err.Error(), "reset") ||
				strings.Contains(err.Error(), "closed") ||
				strings.Contains(err.Error(), "too many connections") {
				return
			}
			continue
		}
		logger.Info().Msg("Subscription " + reg.Addr)
		if reg.Addr == "" {
			logger.Error().Msg("native subscription: missing addr")
			return
		}
		if reg.PeerID == "" {
			// Derive the PeerID from the multiaddr when the sender omitted it.
			ad, err := pp.AddrInfoFromString(reg.Addr)
			if err != nil {
				logger.Err(err).Msg("native subscription: invalid addr")
				return
			}
			reg.PeerID = ad.ID.String()
		}
		// Reject registrations with an invalid self-signature.
		if ok, err := reg.Verify(); !ok {
			logger.Warn().Str("peer", reg.PeerID).Err(err).Msg("native subscription: invalid signature, rejecting")
			return
		}
		// Build entry with a fresh TTL — must happen before the cache write so the
		// TTL window is not consumed by DHT retries.
		entry := &liveIndexerEntry{
			PeerID:       reg.PeerID,
			Addr:         reg.Addr,
			ExpiresAt:    time.Now().UTC().Add(IndexerTTL),
			RegTimestamp: reg.Timestamp,
			PubKey:       reg.PubKey,
			Signature:    reg.Signature,
			FillRate:     reg.FillRate,
		}
		// Update local cache and known set immediately so concurrent GetIndexers calls
		// can already see this indexer without waiting for the DHT write to complete.
		ix.Native.liveIndexersMu.Lock()
		_, isRenewal := ix.Native.liveIndexers[reg.PeerID]
		ix.Native.liveIndexers[reg.PeerID] = entry
		ix.Native.liveIndexersMu.Unlock()
		ix.Native.knownMu.Lock()
		ix.Native.knownPeerIDs[reg.PeerID] = reg.Addr
		ix.Native.knownMu.Unlock()
		// Verify that the declared address is actually reachable. The indexer was
		// tentatively admitted above (so heartbeats don't get stuck) and is evicted
		// if the dial fails within 5 s.
		// BUGFIX: this goroutine now starts AFTER the cache insert. Starting it
		// first (as before) let its eviction check race ahead of the insert, in
		// which case an unreachable indexer stayed admitted permanently.
		go func(e *liveIndexerEntry) {
			// evict removes the entry only if it is still the current one, so a
			// newer registration is never removed by a stale validation.
			evict := func() {
				ix.Native.liveIndexersMu.Lock()
				if cur := ix.Native.liveIndexers[e.PeerID]; cur == e {
					delete(ix.Native.liveIndexers, e.PeerID)
				}
				ix.Native.liveIndexersMu.Unlock()
			}
			ad, err := pp.AddrInfoFromString(e.Addr)
			if err != nil {
				logger.Warn().Str("addr", e.Addr).Msg("native subscription: invalid addr during validation, rejecting")
				evict()
				return
			}
			dialCtx, dialCancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer dialCancel()
			if err := ix.Host.Connect(dialCtx, *ad); err != nil {
				logger.Warn().Str("addr", e.Addr).Err(err).Msg("native subscription: declared address unreachable, rejecting")
				evict()
			}
		}(entry)
		// Gossip the signed registration to neighbouring natives.
		// The payload is JSON-encoded so the receiver can verify the self-signature.
		ix.PubsubMu.RLock()
		topic := ix.LongLivedPubSubs[common.TopicIndexerRegistry]
		ix.PubsubMu.RUnlock()
		if topic != nil {
			if gossipData, marshalErr := json.Marshal(reg); marshalErr == nil {
				if err := topic.Publish(context.Background(), gossipData); err != nil {
					logger.Err(err).Msg("native subscription: registry gossip publish")
				}
			}
		}
		if isRenewal {
			// logger.Debug().Str("peer", reg.PeerID).Msg("native: indexer TTL renewed : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
		} else {
			logger.Info().Str("peer", reg.PeerID).Msg("native: indexer registered : " + fmt.Sprintf("%v", len(ix.Native.liveIndexers)))
		}
		// Persist in DHT asynchronously with bounded retry.
		// Max retry window = IndexerTTL (90 s) — retrying past entry expiry is pointless.
		// Backoff: 10 s -> 20 s -> 40 s, then repeats at 40 s until deadline.
		key := ix.genIndexerKey(reg.PeerID)
		data, err := json.Marshal(entry)
		if err != nil {
			logger.Err(err).Msg("native subscription: marshal entry")
			return
		}
		go func() {
			deadline := time.Now().Add(IndexerTTL)
			backoff := 10 * time.Second
			for {
				if time.Now().After(deadline) {
					logger.Warn().Str("key", key).Msg("native subscription: DHT put abandoned, entry TTL exceeded")
					return
				}
				ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
				err := ix.DHT.PutValue(ctx, key, data)
				cancel()
				if err == nil {
					return
				}
				logger.Err(err).Msg("native subscription: DHT put " + key)
				if !strings.Contains(err.Error(), "failed to find any peer in table") {
					return // non-retryable error
				}
				remaining := time.Until(deadline)
				if backoff > remaining {
					backoff = remaining
				}
				if backoff <= 0 {
					return
				}
				time.Sleep(backoff)
				if backoff < 40*time.Second {
					backoff *= 2
				}
			}
		}()
		break
	}
}
// handleNativeUnsubscribe removes a departing indexer from the local cache and
// known set immediately, without waiting for TTL expiry.
func (ix *IndexerService) handleNativeUnsubscribe(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	var reg common.IndexerRegistration
	if decodeErr := json.NewDecoder(s).Decode(&reg); decodeErr != nil {
		logger.Err(decodeErr).Msg("native unsubscribe: decode")
		return
	}
	pid := reg.PeerID
	if pid == "" {
		logger.Warn().Msg("native unsubscribe: missing peer_id")
		return
	}
	// Drop from both maps so the peer disappears from GetIndexers responses right away.
	ix.Native.liveIndexersMu.Lock()
	delete(ix.Native.liveIndexers, pid)
	ix.Native.liveIndexersMu.Unlock()
	ix.Native.knownMu.Lock()
	delete(ix.Native.knownPeerIDs, pid)
	ix.Native.knownMu.Unlock()
	logger.Info().Str("peer", pid).Msg("native: indexer explicitly unregistered")
}
// handleNativeGetIndexers returns this native's own list of reachable indexers.
// Self-delegation (native acting as temporary fallback indexer) is only permitted
// for nodes — never for peers that are themselves registered indexers in knownPeerIDs.
// The consensus across natives is the responsibility of the requesting node/indexer.
func (ix *IndexerService) handleNativeGetIndexers(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	for {
		var req common.GetIndexersRequest
		if err := json.NewDecoder(s).Decode(&req); err != nil {
			logger.Err(err).Msg("native get indexers: decode")
			// Terminal stream errors end the handler; transient decode noise retries.
			if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) ||
				strings.Contains(err.Error(), "reset") ||
				strings.Contains(err.Error(), "closed") ||
				strings.Contains(err.Error(), "too many connections") {
				return
			}
			continue
		}
		// Default to 3 indexers when the caller did not specify a count.
		if req.Count <= 0 {
			req.Count = 3
		}
		callerPeerID := s.Conn().RemotePeer().String()
		// Exclude the caller itself from the candidate set.
		reachable := ix.reachableLiveIndexers(req.Count, callerPeerID)
		var resp common.GetIndexersResponse
		if len(reachable) == 0 {
			// No live indexers reachable — try to self-delegate.
			if ix.selfDelegate(s.Conn().RemotePeer(), &resp) {
				logger.Info().Str("peer", callerPeerID).Msg("native: no indexers, acting as fallback for node")
			} else {
				// Fallback pool saturated: return empty so the caller retries another
				// native instead of piling more load onto this one.
				logger.Warn().Str("peer", callerPeerID).Int("pool", maxFallbackPeers).Msg(
					"native: fallback pool saturated, refusing self-delegation")
			}
		} else {
			// Collect the last-reported fill rate of each reachable address;
			// returned in the response and used for the weight ordering below.
			ix.Native.liveIndexersMu.RLock()
			fillRates := make(map[string]float64, len(reachable))
			for _, addr := range reachable {
				ad, err := pp.AddrInfoFromString(addr)
				if err != nil {
					continue
				}
				for _, e := range ix.Native.liveIndexers {
					if e.PeerID == ad.ID.String() {
						fillRates[addr] = e.FillRate
						break
					}
				}
			}
			ix.Native.liveIndexersMu.RUnlock()
			// Sort by routing weight descending: weight = fillRate × (1 − fillRate).
			// This prefers indexers in the "trust sweet spot" — proven popular (fillRate > 0)
			// but not saturated (fillRate < 1). Peak at fillRate ≈ 0.5.
			routingWeight := func(addr string) float64 {
				f := fillRates[addr]
				return f * (1 - f)
			}
			// Insertion sort — fine here since the candidate list is small.
			for i := 1; i < len(reachable); i++ {
				for j := i; j > 0 && routingWeight(reachable[j]) > routingWeight(reachable[j-1]); j-- {
					reachable[j], reachable[j-1] = reachable[j-1], reachable[j]
				}
			}
			if req.Count > len(reachable) {
				req.Count = len(reachable)
			}
			resp.Indexers = reachable[:req.Count]
			resp.FillRates = fillRates
		}
		if err := json.NewEncoder(s).Encode(resp); err != nil {
			logger.Err(err).Msg("native get indexers: encode response")
		}
		// One request per stream: respond once, then close.
		break
	}
}
// handleNativeConsensus answers a consensus challenge from a node/indexer.
// It returns:
//   - Trusted: which of the candidates it considers alive.
//   - Suggestions: extras it knows and trusts that were not in the candidate list.
func (ix *IndexerService) handleNativeConsensus(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	for {
		var req common.ConsensusRequest
		if decErr := json.NewDecoder(s).Decode(&req); decErr != nil {
			logger.Err(decErr).Msg("native consensus: decode")
			text := decErr.Error()
			fatal := errors.Is(decErr, io.EOF) ||
				errors.Is(decErr, io.ErrUnexpectedEOF) ||
				strings.Contains(text, "reset") ||
				strings.Contains(text, "closed") ||
				strings.Contains(text, "too many connections")
			if fatal {
				return
			}
			continue
		}
		// Everything this native itself considers reachable (caller excluded).
		mine := ix.reachableLiveIndexers(-1, s.Conn().RemotePeer().String())
		mineSet := make(map[string]struct{}, len(mine))
		for _, a := range mine {
			mineSet[a] = struct{}{}
		}
		// Trusted = intersection of the caller's candidates with our own view,
		// preserving the candidates' order.
		trusted := []string{}
		inCandidates := make(map[string]struct{}, len(req.Candidates))
		for _, a := range req.Candidates {
			inCandidates[a] = struct{}{}
			if _, ok := mineSet[a]; ok {
				trusted = append(trusted, a)
			}
		}
		// Suggestions = our reachable indexers the requester did not mention.
		suggestions := []string{}
		for _, a := range mine {
			if _, seen := inCandidates[a]; !seen {
				suggestions = append(suggestions, a)
			}
		}
		if encErr := json.NewEncoder(s).Encode(common.ConsensusResponse{Trusted: trusted, Suggestions: suggestions}); encErr != nil {
			logger.Err(encErr).Msg("native consensus: encode response")
		}
		break
	}
}
// selfDelegate marks the caller as a responsible peer and exposes this native's own
// address as its temporary indexer. Returns false when the fallback pool is saturated
// (maxFallbackPeers reached) or when this host has no listen address to advertise —
// the caller must return an empty response so the node retries later instead of
// pinning indefinitely to an overloaded native.
func (ix *IndexerService) selfDelegate(remotePeer pp.ID, resp *common.GetIndexersResponse) bool {
	ix.Native.responsibleMu.Lock()
	defer ix.Native.responsibleMu.Unlock()
	if len(ix.Native.responsiblePeers) >= maxFallbackPeers {
		return false
	}
	// BUGFIX: the original indexed Addrs()[len(Addrs())-1] unconditionally and
	// panicked when the host had no listen address. Snapshot once and guard.
	addrs := ix.Host.Addrs()
	if len(addrs) == 0 {
		return false
	}
	ix.Native.responsiblePeers[remotePeer] = struct{}{}
	resp.IsSelfFallback = true
	// Advertise the last listed address, qualified with our own PeerID.
	resp.Indexers = []string{addrs[len(addrs)-1].String() + "/p2p/" + ix.Host.ID().String()}
	return true
}
// reachableLiveIndexers returns the multiaddrs of non-expired, pingable indexers
// from the local cache (kept fresh by refreshIndexersFromDHT in background).
// count < 0 means "as many as possible"; a positive count only triggers the
// knownPeerIDs fallback when the fresh cache alone cannot satisfy it. Peers in
// `from` (typically the caller itself) are always excluded.
func (ix *IndexerService) reachableLiveIndexers(count int, from ...string) []string {
	// Fresh, non-expired entries from the local cache.
	ix.Native.liveIndexersMu.RLock()
	now := time.Now().UTC()
	candidates := []*liveIndexerEntry{}
	for _, e := range ix.Native.liveIndexers {
		if e.ExpiresAt.After(now) && !slices.Contains(from, e.PeerID) {
			candidates = append(candidates, e)
		}
	}
	ix.Native.liveIndexersMu.RUnlock()
	// PeerIDs already covered by a fresh entry — used to keep the fallback loop
	// from adding the same peer twice.
	fresh := make(map[string]struct{}, len(candidates))
	for _, e := range candidates {
		fresh[e.PeerID] = struct{}{}
	}
	if (count > 0 && len(candidates) < count) || count < 0 {
		// BUGFIX: the original read knownPeerIDs in a debug fmt.Println BEFORE
		// taking knownMu (data race) and re-added peers that already had a fresh
		// entry, producing duplicate addresses. Debug prints removed; the skip
		// below implements the "absent OR expired" filter the comment promised.
		ix.Native.knownMu.RLock()
		for k, v := range ix.Native.knownPeerIDs {
			// Include peers whose liveIndexers entry is absent OR expired.
			// A known-but-expired peer may have come back — PeerIsAlive below decides.
			if _, alreadyFresh := fresh[k]; alreadyFresh {
				continue
			}
			if !slices.Contains(from, k) {
				candidates = append(candidates, &liveIndexerEntry{
					PeerID: k,
					Addr:   v,
				})
			}
		}
		ix.Native.knownMu.RUnlock()
	}
	// Keep only candidates that parse and answer a liveness probe right now.
	reachable := []string{}
	for _, e := range candidates {
		ad, err := pp.AddrInfoFromString(e.Addr)
		if err != nil {
			continue
		}
		if common.PeerIsAlive(ix.Host, *ad) {
			reachable = append(reachable, e.Addr)
		}
	}
	return reachable
}
// refreshIndexersFromDHT runs in background and queries the shared DHT for every known
// indexer PeerID whose local cache entry is missing or expired. This supplements the
// local cache with entries written by neighbouring natives. Peers with no fresh DHT
// record are pruned from knownPeerIDs.
func (ix *IndexerService) refreshIndexersFromDHT(ctx context.Context) {
	t := time.NewTicker(dhtRefreshInterval)
	defer t.Stop()
	logger := oclib.GetLogger()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
		}
		// Snapshot the known set so the map is not locked while querying the DHT.
		ix.Native.knownMu.RLock()
		peerIDs := make([]string, 0, len(ix.Native.knownPeerIDs))
		for pid := range ix.Native.knownPeerIDs {
			peerIDs = append(peerIDs, pid)
		}
		ix.Native.knownMu.RUnlock()
		now := time.Now().UTC()
		for _, pid := range peerIDs {
			ix.Native.liveIndexersMu.RLock()
			existing := ix.Native.liveIndexers[pid]
			ix.Native.liveIndexersMu.RUnlock()
			if existing != nil && existing.ExpiresAt.After(now) {
				continue // still fresh in local cache
			}
			key := ix.genIndexerKey(pid)
			// BUGFIX: derive the per-query timeout from the loop ctx (the original
			// used context.Background(), so cancellation during shutdown left DHT
			// queries running for up to 5 s).
			dhtCtx, dhtCancel := context.WithTimeout(ctx, 5*time.Second)
			ch, err := ix.DHT.SearchValue(dhtCtx, key)
			if err != nil {
				dhtCancel()
				continue
			}
			// Among all values returned, keep the record with the latest expiry.
			var best *liveIndexerEntry
			for b := range ch {
				var e liveIndexerEntry
				if err := json.Unmarshal(b, &e); err != nil {
					continue
				}
				if e.ExpiresAt.After(time.Now().UTC()) {
					if best == nil || e.ExpiresAt.After(best.ExpiresAt) {
						best = &e
					}
				}
			}
			dhtCancel()
			if best != nil {
				ix.Native.liveIndexersMu.Lock()
				ix.Native.liveIndexers[best.PeerID] = best
				ix.Native.liveIndexersMu.Unlock()
				logger.Info().Str("peer", best.PeerID).Msg("native: refreshed indexer from DHT")
			} else {
				// DHT has no fresh entry — peer is gone, prune from known set.
				ix.Native.knownMu.Lock()
				delete(ix.Native.knownPeerIDs, pid)
				ix.Native.knownMu.Unlock()
				logger.Info().Str("peer", pid).Msg("native: pruned stale peer from knownPeerIDs")
			}
		}
	}
}
// genIndexerKey builds the namespaced DHT key under which an indexer's
// registration record is stored.
func (ix *IndexerService) genIndexerKey(peerID string) string {
	return fmt.Sprintf("/indexer/%s", peerID)
}
// runOffloadLoop periodically checks if real indexers are available and releases
// responsible peers so they can reconnect to actual indexers on their next attempt.
func (ix *IndexerService) runOffloadLoop(ctx context.Context) {
	t := time.NewTicker(offloadInterval)
	defer t.Stop()
	logger := oclib.GetLogger()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
		}
		// BUGFIX: snapshot the responsible set under the lock. The original
		// printed and iterated the live map outside a consistent lock window,
		// racing with selfDelegate; it also called reachableLiveIndexers twice
		// per tick (it pings peers) and left debug fmt.Println calls in place.
		ix.Native.responsibleMu.RLock()
		released := make([]pp.ID, 0, len(ix.Native.responsiblePeers))
		for p := range ix.Native.responsiblePeers {
			released = append(released, p)
		}
		ix.Native.responsibleMu.RUnlock()
		if len(released) == 0 {
			continue
		}
		peerIDS := make([]string, 0, len(released))
		for _, p := range released {
			peerIDS = append(peerIDS, p.String())
		}
		// Only offload when at least one real indexer (other than the released
		// peers themselves) is currently reachable.
		if len(ix.reachableLiveIndexers(-1, peerIDS...)) == 0 {
			continue
		}
		// Reset (not Close) heartbeat streams of released peers.
		// Close() only half-closes the native's write direction — the peer's write
		// direction stays open and sendHeartbeat never sees an error.
		// Reset() abruptly terminates both directions, making the peer's next
		// json.Encode return an error which triggers replenishIndexersFromNative.
		ix.StreamMU.Lock()
		streams := ix.StreamRecords[common.ProtocolHeartbeat]
		for _, pid := range released {
			rec, ok := streams[pid] // nil-map lookup is safe: ok == false
			if ok && rec.HeartbeatStream != nil && rec.HeartbeatStream.Stream != nil {
				rec.HeartbeatStream.Stream.Reset()
				ix.Native.responsibleMu.Lock()
				delete(ix.Native.responsiblePeers, pid)
				ix.Native.responsibleMu.Unlock()
				delete(streams, pid)
				logger.Info().Str("peer", pid.String()).Str("proto", string(common.ProtocolHeartbeat)).Msg(
					"native: offload — stream reset, peer will reconnect to real indexer")
			} else {
				// No recorded heartbeat stream for this peer: either it never
				// passed the score check (new peer, uptime=0 → score<75) or the
				// stream was GC'd. We cannot send a Reset signal, so close the
				// whole connection instead — this makes the peer's sendHeartbeat
				// return an error, which triggers replenishIndexersFromNative and
				// migrates it to a real indexer.
				// BUGFIX: the original only reached this branch when a record
				// existed with a nil stream; peers with no record at all (or a
				// nil stream table) were never released.
				ix.Native.responsibleMu.Lock()
				delete(ix.Native.responsiblePeers, pid)
				ix.Native.responsibleMu.Unlock()
				go ix.Host.Network().ClosePeer(pid)
				logger.Info().Str("peer", pid.String()).Msg(
					"native: offload — no heartbeat stream, closing connection so peer re-requests real indexers")
			}
		}
		ix.StreamMU.Unlock()
		logger.Info().Int("released", len(released)).Msg("native: offloaded responsible peers to real indexers")
	}
}
// handleNativeGetPeers returns a random selection of this native's known native
// contacts, excluding any in the request's Exclude list.
func (ix *IndexerService) handleNativeGetPeers(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	for {
		var req common.GetNativePeersRequest
		if decErr := json.NewDecoder(s).Decode(&req); decErr != nil {
			logger.Err(decErr).Msg("native get peers: decode")
			text := decErr.Error()
			if errors.Is(decErr, io.EOF) || errors.Is(decErr, io.ErrUnexpectedEOF) ||
				strings.Contains(text, "reset") ||
				strings.Contains(text, "closed") ||
				strings.Contains(text, "too many connections") {
				return
			}
			continue
		}
		want := req.Count
		if want <= 0 {
			want = 1
		}
		excluded := make(map[string]struct{}, len(req.Exclude))
		for _, addr := range req.Exclude {
			excluded[addr] = struct{}{}
		}
		// Gather every static native contact the requester did not exclude.
		common.StreamNativeMu.RLock()
		pool := make([]string, 0, len(common.StaticNatives))
		for addr := range common.StaticNatives {
			if _, skip := excluded[addr]; !skip {
				pool = append(pool, addr)
			}
		}
		common.StreamNativeMu.RUnlock()
		// Randomize, then clamp the requested count to what is available.
		rand.Shuffle(len(pool), func(i, j int) { pool[i], pool[j] = pool[j], pool[i] })
		if want > len(pool) {
			want = len(pool)
		}
		if encErr := json.NewEncoder(s).Encode(common.GetNativePeersResponse{Peers: pool[:want]}); encErr != nil {
			logger.Err(encErr).Msg("native get peers: encode response")
		}
		break
	}
}
// StartNativeRegistration starts a goroutine that periodically registers this
// indexer with all configured native indexers (every RecommendedHeartbeatInterval).

View File

@@ -0,0 +1,228 @@
package indexer
import (
"context"
"encoding/json"
"strings"
"time"
"oc-discovery/conf"
"oc-discovery/daemons/node/common"
oclib "cloud.o-forge.io/core/oc-lib"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/network"
)
// TopicSearchPeer is the GossipSub topic on which search queries are broadcast
// between indexers.
const TopicSearchPeer = "oc-search-peer"

// searchTimeout returns the configured search timeout, defaulting to 5s.
func searchTimeout() time.Duration {
	timeout := time.Duration(conf.GetConfig().SearchTimeout) * time.Second
	if timeout <= 0 {
		return 5 * time.Second
	}
	return timeout
}
// initSearchHandlers registers ProtocolSearchPeer and ProtocolSearchPeerResponse
// and subscribes to TopicSearchPeer on GossipSub.
func (ix *IndexerService) initSearchHandlers() {
	// Node-facing: carries a node's search request and streams results back.
	ix.Host.SetStreamHandler(common.ProtocolSearchPeer, ix.handleSearchPeer)
	// Indexer-facing: receives hits from other indexers for a pending queryID.
	ix.Host.SetStreamHandler(common.ProtocolSearchPeerResponse, ix.handleSearchPeerResponse)
	ix.initSearchSubscription()
}
// updateReferent is called from HandleHeartbeat when Referent flag changes.
// If referent=true the node is added to referencedNodes; if false it is removed.
func (ix *IndexerService) updateReferent(pid pp.ID, rec PeerRecord, referent bool) {
	ix.referencedNodesMu.Lock()
	defer ix.referencedNodesMu.Unlock()
	if !referent {
		delete(ix.referencedNodes, pid)
		return
	}
	ix.referencedNodes[pid] = rec
}
// searchReferenced looks up nodes in referencedNodes matching the query.
// Matches on peerID (exact), DID (exact), or name (case-insensitive contains).
func (ix *IndexerService) searchReferenced(peerID, did, name string) []common.SearchHit {
	ix.referencedNodesMu.RLock()
	defer ix.referencedNodesMu.RUnlock()
	loweredName := strings.ToLower(name)
	var hits []common.SearchHit
	for pid, rec := range ix.referencedNodes {
		id := pid.String()
		// Any one criterion is enough; empty criteria never match.
		switch {
		case peerID != "" && id == peerID,
			did != "" && rec.DID == did,
			name != "" && strings.Contains(strings.ToLower(rec.Name), loweredName):
			hits = append(hits, common.SearchHit{
				PeerID: id,
				DID:    rec.DID,
				Name:   rec.Name,
			})
		}
	}
	return hits
}
// handleSearchPeer is the ProtocolSearchPeer handler.
// The node opens this stream, sends a SearchPeerRequest, and reads results
// as they stream in. The stream stays open until timeout or node closes it.
func (ix *IndexerService) handleSearchPeer(s network.Stream) {
	logger := oclib.GetLogger()
	defer s.Reset()
	var req common.SearchPeerRequest
	if err := json.NewDecoder(s).Decode(&req); err != nil || req.QueryID == "" {
		return
	}
	// streamCtx is cancelled when the node closes its end of the stream.
	streamCtx, streamCancel := context.WithCancel(context.Background())
	go func() {
		// Block until the stream is reset/closed, then cancel our context.
		// The node sends nothing after the request, so any Read return
		// (EOF or reset) means the stream is finished.
		buf := make([]byte, 1)
		s.Read(buf) //nolint:errcheck — we only care about EOF/reset
		streamCancel()
	}()
	defer streamCancel()
	// Buffered so remote responders are not blocked while we write to the node.
	resultCh := make(chan []common.SearchHit, 16)
	ix.pendingSearchesMu.Lock()
	ix.pendingSearches[req.QueryID] = resultCh
	ix.pendingSearchesMu.Unlock()
	defer func() {
		// Unregister on exit so late responses for this queryID are dropped.
		ix.pendingSearchesMu.Lock()
		delete(ix.pendingSearches, req.QueryID)
		ix.pendingSearchesMu.Unlock()
	}()
	// Check own referencedNodes immediately.
	if hits := ix.searchReferenced(req.PeerID, req.DID, req.Name); len(hits) > 0 {
		resultCh <- hits
	}
	// Broadcast search on GossipSub so other indexers can respond.
	ix.publishSearchQuery(req.QueryID, req.PeerID, req.DID, req.Name)
	// Stream results back to node as they arrive; reset idle timer on each result.
	enc := json.NewEncoder(s)
	idleTimer := time.NewTimer(searchTimeout())
	defer idleTimer.Stop()
	for {
		select {
		case hits := <-resultCh:
			if err := enc.Encode(common.SearchPeerResult{QueryID: req.QueryID, Records: hits}); err != nil {
				logger.Debug().Err(err).Msg("[search] stream write failed")
				return
			}
			// Reset idle timeout: keep alive as long as results trickle in.
			// Stop+drain is required before Reset on a possibly-fired timer.
			if !idleTimer.Stop() {
				select {
				case <-idleTimer.C:
				default:
				}
			}
			idleTimer.Reset(searchTimeout())
		case <-idleTimer.C:
			// No new result within timeout — close gracefully.
			return
		case <-streamCtx.Done():
			// Node closed the stream (new search superseded this one).
			return
		}
	}
}
// handleSearchPeerResponse is the ProtocolSearchPeerResponse handler.
// Another indexer opens this stream to deliver hits for a pending queryID.
func (ix *IndexerService) handleSearchPeerResponse(s network.Stream) {
	defer s.Reset()
	var result common.SearchPeerResult
	if err := json.NewDecoder(s).Decode(&result); err != nil {
		return
	}
	if result.QueryID == "" {
		return
	}
	// Look up the in-flight search this result belongs to.
	ix.pendingSearchesMu.Lock()
	ch := ix.pendingSearches[result.QueryID]
	ix.pendingSearchesMu.Unlock()
	if ch == nil {
		return
	}
	// Non-blocking delivery: drop when the channel is full.
	select {
	case ch <- result.Records:
	default: // channel full, drop — node may be slow
	}
}
// publishSearchQuery broadcasts a SearchQuery on TopicSearchPeer.
func (ix *IndexerService) publishSearchQuery(queryID, peerID, did, name string) {
	svc := ix.LongLivedStreamRecordedService.LongLivedPubSubService
	svc.PubsubMu.RLock()
	topic := svc.LongLivedPubSubs[TopicSearchPeer]
	svc.PubsubMu.RUnlock()
	if topic == nil {
		// No registered topic handle (subscription not initialized) — nothing to do.
		return
	}
	payload, err := json.Marshal(common.SearchQuery{
		QueryID:   queryID,
		PeerID:    peerID,
		DID:       did,
		Name:      name,
		EmitterID: ix.Host.ID().String(),
	})
	if err != nil {
		return
	}
	// Best-effort broadcast; publish failures are silently ignored.
	_ = topic.Publish(context.Background(), payload)
}
// initSearchSubscription joins TopicSearchPeer and dispatches incoming queries.
func (ix *IndexerService) initSearchSubscription() {
	logger := oclib.GetLogger()
	// Join and register the topic handle under the pubsub lock, so readers in
	// publishSearchQuery never observe a partially-initialized entry.
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.Lock()
	topic, err := ix.PS.Join(TopicSearchPeer)
	if err != nil {
		ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.Unlock()
		logger.Err(err).Msg("[search] failed to join search topic")
		return
	}
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.LongLivedPubSubs[TopicSearchPeer] = topic
	ix.LongLivedStreamRecordedService.LongLivedPubSubService.PubsubMu.Unlock()
	// Route every decoded SearchQuery to onSearchQuery.
	// NOTE(review): -1 presumably means "no message limit" — confirm against
	// common.SubscribeEvents' contract.
	common.SubscribeEvents(
		ix.LongLivedStreamRecordedService.LongLivedPubSubService,
		context.Background(),
		TopicSearchPeer,
		-1,
		func(_ context.Context, q common.SearchQuery, _ string) {
			ix.onSearchQuery(q)
		},
	)
}
// onSearchQuery handles an incoming GossipSub search broadcast.
// If we have matching referencedNodes, we respond to the emitting indexer.
func (ix *IndexerService) onSearchQuery(q common.SearchQuery) {
	// Don't respond to our own broadcasts.
	if q.EmitterID == ix.Host.ID().String() {
		return
	}
	hits := ix.searchReferenced(q.PeerID, q.DID, q.Name)
	if len(hits) == 0 {
		return
	}
	emitterID, err := pp.Decode(q.EmitterID)
	if err != nil {
		return
	}
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	s, err := ix.Host.NewStream(ctx, emitterID, common.ProtocolSearchPeerResponse)
	if err != nil {
		return
	}
	s.SetDeadline(time.Now().Add(5 * time.Second))
	// BUGFIX: the original `defer s.Reset()` abruptly terminated the stream right
	// after Encode, which can discard the buffered response before the emitter
	// reads it (Reset kills both directions — see runOffloadLoop's rationale).
	// Close (half-close) on success so the payload is flushed; Reset only on error.
	if err := json.NewEncoder(s).Encode(common.SearchPeerResult{QueryID: q.QueryID, Records: hits}); err != nil {
		s.Reset()
		return
	}
	s.Close()
}

View File

@@ -2,10 +2,14 @@ package indexer
import (
"context"
"encoding/json"
"errors"
"math/rand"
"oc-discovery/conf"
"oc-discovery/daemons/node/common"
"strings"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
dht "github.com/libp2p/go-libp2p-kad-dht"
@@ -15,28 +19,70 @@ import (
pp "github.com/libp2p/go-libp2p/core/peer"
)
// dhtCacheEntry holds one indexer discovered via DHT for use in suggestion responses.
type dhtCacheEntry struct {
AI pp.AddrInfo
LastSeen time.Time
}
// offloadState tracks which nodes we've already proposed migration to.
// When an indexer is overloaded (fill rate > offloadThreshold) it only sends
// SuggestMigrate to a small batch at a time; peers that don't migrate within
// offloadGracePeriod are moved to alreadyTried so a new batch can be picked.
type offloadState struct {
inBatch map[pp.ID]time.Time // peer → time added to current batch
alreadyTried map[pp.ID]struct{} // peers proposed to that didn't migrate
mu sync.Mutex
}
const (
offloadThreshold = 0.80 // fill rate above which to start offloading
offloadBatchSize = 5 // max concurrent "please migrate" proposals
offloadGracePeriod = 3 * common.RecommendedHeartbeatInterval
)
// IndexerService manages the indexer node's state: stream records, DHT, pubsub.
type IndexerService struct {
*common.LongLivedStreamRecordedService[PeerRecord]
PS *pubsub.PubSub
DHT *dht.IpfsDHT
isStrictIndexer bool
mu sync.RWMutex
IsNative bool
Native *NativeState // non-nil when IsNative == true
nameIndex *nameIndexState
PS *pubsub.PubSub
DHT *dht.IpfsDHT
isStrictIndexer bool
mu sync.RWMutex
nameIndex *nameIndexState
dhtProvideCancel context.CancelFunc
bornAt time.Time
// Passive DHT cache: refreshed every 2 min in background, used for suggestions.
dhtCache []dhtCacheEntry
dhtCacheMu sync.RWMutex
// Offload state for overloaded-indexer migration proposals.
offload offloadState
// referencedNodes holds nodes that have designated this indexer as their
// search referent (Heartbeat.Referent=true). Used for distributed search.
referencedNodes map[pp.ID]PeerRecord
referencedNodesMu sync.RWMutex
// pendingSearches maps queryID → result channel for in-flight searches.
pendingSearches map[string]chan []common.SearchHit
pendingSearchesMu sync.Mutex
// behavior tracks per-node compliance (heartbeat rate, publish/get volume,
// identity consistency, signature failures).
behavior *NodeBehaviorTracker
// connGuard limits new-connection bursts to protect public indexers.
connGuard *ConnectionRateGuard
}
// NewIndexerService creates an IndexerService.
// If ps is nil, this is a strict indexer (no pre-existing gossip sub from a node).
func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative bool) *IndexerService {
func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerService {
logger := oclib.GetLogger()
logger.Info().Msg("open indexer mode...")
var err error
ix := &IndexerService{
LongLivedStreamRecordedService: common.NewStreamRecordedService[PeerRecord](h, maxNode),
isStrictIndexer: ps == nil,
IsNative: isNative,
referencedNodes: map[pp.ID]PeerRecord{},
pendingSearches: map[string]chan []common.SearchHit{},
behavior: newNodeBehaviorTracker(),
connGuard: newConnectionRateGuard(),
}
if ps == nil {
ps, err = pubsub.NewGossipSub(context.Background(), ix.Host)
@@ -46,25 +92,45 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
}
ix.PS = ps
if ix.isStrictIndexer && !isNative {
if ix.isStrictIndexer {
logger.Info().Msg("connect to indexers as strict indexer...")
common.ConnectToIndexers(h, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer, ix.Host.ID())
common.ConnectToIndexers(h, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer*2)
logger.Info().Msg("subscribe to decentralized search flow as strict indexer...")
go ix.SubscribeToSearch(ix.PS, nil)
}
if !isNative {
logger.Info().Msg("init distributed name index...")
ix.initNameIndex(ps)
ix.LongLivedStreamRecordedService.AfterDelete = func(pid pp.ID, name, did string) {
ix.publishNameEvent(NameIndexDelete, name, pid.String(), did)
}
logger.Info().Msg("init distributed name index...")
ix.initNameIndex(ps)
ix.LongLivedStreamRecordedService.AfterDelete = func(pid pp.ID, name, did string) {
ix.publishNameEvent(NameIndexDelete, name, pid.String(), did)
// Remove behavior state for peers that are no longer connected and
// have no active ban — keeps memory bounded to the live node set.
ix.behavior.Cleanup(pid)
}
// Parse bootstrap peers from configured native/indexer addresses so that the
// DHT can find its routing table entries even in a fresh deployment.
// AllowInbound: fired once per stream open, before any heartbeat is decoded.
// 1. Reject peers that are currently banned (behavioral strikes).
// 2. For genuinely new connections, apply the burst guard.
ix.AllowInbound = func(remotePeer pp.ID, isNew bool) error {
if ix.behavior.IsBanned(remotePeer) {
return errors.New("peer is banned")
}
if isNew && !ix.connGuard.Allow() {
return errors.New("connection rate limit exceeded, retry later")
}
return nil
}
// ValidateHeartbeat: fired on every heartbeat tick for an established stream.
// Checks heartbeat cadence — rejects if the node is sending too fast.
ix.ValidateHeartbeat = func(remotePeer pp.ID) error {
return ix.behavior.RecordHeartbeat(remotePeer)
}
// Parse bootstrap peers from configured indexer addresses so the DHT can
// find its routing table entries even in a fresh deployment.
var bootstrapPeers []pp.AddrInfo
for _, addrStr := range strings.Split(conf.GetConfig().NativeIndexerAddresses+","+conf.GetConfig().IndexerAddresses, ",") {
for _, addrStr := range strings.Split(conf.GetConfig().IndexerAddresses, ",") {
addrStr = strings.TrimSpace(addrStr)
if addrStr == "" {
continue
@@ -75,12 +141,11 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
}
dhtOpts := []dht.Option{
dht.Mode(dht.ModeServer),
dht.ProtocolPrefix("oc"), // 🔥 réseau privé
dht.ProtocolPrefix("oc"),
dht.Validator(record.NamespacedValidator{
"node": PeerRecordValidator{},
"indexer": IndexerRecordValidator{}, // for native indexer registry
"name": DefaultValidator{},
"pid": DefaultValidator{},
"node": PeerRecordValidator{},
"name": DefaultValidator{},
"pid": DefaultValidator{},
}),
}
if len(bootstrapPeers) > 0 {
@@ -91,44 +156,336 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int, isNative boo
return nil
}
// InitNative must happen after DHT is ready
if isNative {
ix.InitNative()
} else {
ix.initNodeHandler()
// Register with configured natives so this indexer appears in their cache.
// Pass a fill rate provider so the native can route new nodes to less-loaded indexers.
if nativeAddrs := conf.GetConfig().NativeIndexerAddresses; nativeAddrs != "" {
fillRateFn := func() float64 {
ix.StreamMU.RLock()
n := len(ix.StreamRecords[common.ProtocolHeartbeat])
ix.StreamMU.RUnlock()
maxN := ix.MaxNodesConn()
if maxN <= 0 {
return 0
}
rate := float64(n) / float64(maxN)
if rate > 1 {
rate = 1
}
return rate
}
common.StartNativeRegistration(ix.Host, nativeAddrs, fillRateFn)
// Make the DHT available for replenishment from other packages.
common.SetDiscoveryDHT(ix.DHT)
ix.bornAt = time.Now().UTC()
ix.offload.inBatch = make(map[pp.ID]time.Time)
ix.offload.alreadyTried = make(map[pp.ID]struct{})
ix.initNodeHandler()
// Build and send a HeartbeatResponse after each received node heartbeat.
// Raw metrics only — no pre-cooked score. Node computes the score itself.
ix.BuildHeartbeatResponse = func(remotePeer pp.ID, need int, challenges []string, challengeDID string, referent bool) *common.HeartbeatResponse {
ix.StreamMU.RLock()
peerCount := len(ix.StreamRecords[common.ProtocolHeartbeat])
// Collect lastSeen per active peer for challenge responses.
type peerMeta struct {
found bool
lastSeen time.Time
}
peerLookup := make(map[string]peerMeta, peerCount)
var remotePeerRecord PeerRecord
for pid, rec := range ix.StreamRecords[common.ProtocolHeartbeat] {
var ls time.Time
if rec.HeartbeatStream != nil && rec.HeartbeatStream.UptimeTracker != nil {
ls = rec.HeartbeatStream.UptimeTracker.LastSeen
}
peerLookup[pid.String()] = peerMeta{found: true, lastSeen: ls}
if pid == remotePeer {
remotePeerRecord = rec.Record
}
}
ix.StreamMU.RUnlock()
// Update referent designation: node marks its best-scored indexer with Referent=true.
ix.updateReferent(remotePeer, remotePeerRecord, referent)
maxN := ix.MaxNodesConn()
fillRate := 0.0
if maxN > 0 {
fillRate = float64(peerCount) / float64(maxN)
if fillRate > 1 {
fillRate = 1
}
}
resp := &common.HeartbeatResponse{
FillRate: fillRate,
PeerCount: peerCount,
MaxNodes: maxN,
BornAt: ix.bornAt,
}
// Answer each challenged PeerID with raw found + lastSeen.
for _, pidStr := range challenges {
meta := peerLookup[pidStr] // zero value if not found
entry := common.ChallengeEntry{
PeerID: pidStr,
Found: meta.found,
LastSeen: meta.lastSeen,
}
resp.Challenges = append(resp.Challenges, entry)
}
// DHT challenge: retrieve the node's own DID to prove DHT is functional.
if challengeDID != "" {
ctx3, cancel3 := context.WithTimeout(context.Background(), 3*time.Second)
val, err := ix.DHT.GetValue(ctx3, "/node/"+challengeDID)
cancel3()
resp.DHTFound = err == nil
if err == nil {
resp.DHTPayload = json.RawMessage(val)
}
}
// Random sample of connected nodes as witnesses (up to 3).
// Never include the requesting peer itself — asking a node to witness
// itself is circular and meaningless.
ix.StreamMU.RLock()
for pidStr := range peerLookup {
if len(resp.Witnesses) >= 3 {
break
}
pid, err := pp.Decode(pidStr)
if err != nil || pid == remotePeer || pid == ix.Host.ID() {
continue
}
addrs := ix.Host.Peerstore().Addrs(pid)
ai := common.FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
if len(ai.Addrs) > 0 {
resp.Witnesses = append(resp.Witnesses, ai)
}
}
ix.StreamMU.RUnlock()
// Attach suggestions: exactly `need` entries from the DHT cache.
// If the indexer is overloaded (SuggestMigrate will be set below), always
// provide at least 1 suggestion even when need == 0, so the node has
// somewhere to go.
suggestionsNeeded := need
if fillRate > offloadThreshold && suggestionsNeeded < 1 {
suggestionsNeeded = 1
}
if suggestionsNeeded > 0 {
ix.dhtCacheMu.RLock()
// When offloading, pick from a random offset within the top N of the
// cache so concurrent migrations spread across multiple targets rather
// than all rushing to the same least-loaded indexer (thundering herd).
// For normal need-based suggestions the full sorted order is fine.
cache := ix.dhtCache
if fillRate > offloadThreshold && len(cache) > suggestionsNeeded {
const spreadWindow = 5 // sample from the top-5 least-loaded
window := spreadWindow
if window > len(cache) {
window = len(cache)
}
start := rand.Intn(window)
cache = cache[start:]
}
for _, e := range cache {
if len(resp.Suggestions) >= suggestionsNeeded {
break
}
// Never suggest the requesting peer itself or this indexer.
if e.AI.ID == remotePeer || e.AI.ID == h.ID() {
continue
}
resp.Suggestions = append(resp.Suggestions, e.AI)
}
ix.dhtCacheMu.RUnlock()
}
// Offload logic: when fill rate is too high, selectively ask nodes to migrate.
if fillRate > offloadThreshold && len(resp.Suggestions) > 0 {
now := time.Now()
ix.offload.mu.Lock()
// Expire stale batch entries -> move to alreadyTried.
for pid, addedAt := range ix.offload.inBatch {
if now.Sub(addedAt) > offloadGracePeriod {
ix.offload.alreadyTried[pid] = struct{}{}
delete(ix.offload.inBatch, pid)
}
}
// Reset alreadyTried if we've exhausted the whole pool.
if len(ix.offload.alreadyTried) >= peerCount {
ix.offload.alreadyTried = make(map[pp.ID]struct{})
}
_, tried := ix.offload.alreadyTried[remotePeer]
_, inBatch := ix.offload.inBatch[remotePeer]
if !tried {
if inBatch {
resp.SuggestMigrate = true
} else if len(ix.offload.inBatch) < offloadBatchSize {
ix.offload.inBatch[remotePeer] = now
resp.SuggestMigrate = true
}
}
ix.offload.mu.Unlock()
} else if fillRate <= offloadThreshold {
// Fill rate back to normal: reset offload state.
ix.offload.mu.Lock()
if len(ix.offload.inBatch) > 0 || len(ix.offload.alreadyTried) > 0 {
ix.offload.inBatch = make(map[pp.ID]time.Time)
ix.offload.alreadyTried = make(map[pp.ID]struct{})
}
ix.offload.mu.Unlock()
}
// Bootstrap: if this indexer has no indexers of its own, probe the
// connecting peer to check it supports ProtocolHeartbeat (i.e. it is
// itself an indexer). Plain nodes do not register the handler and the
// negotiation fails instantly — no wasted heartbeat cycle.
// Run in a goroutine: the probe is a short blocking stream open.
if len(common.Indexers.GetAddrs()) == 0 && remotePeer != h.ID() {
pid := remotePeer
go func() {
if !common.SupportsHeartbeat(h, pid) {
logger.Debug().Str("peer", pid.String()).
Msg("[bootstrap] inbound peer has no heartbeat handler — not an indexer, skipping")
return
}
addrs := h.Peerstore().Addrs(pid)
ai := common.FilterLoopbackAddrs(pp.AddrInfo{ID: pid, Addrs: addrs})
if len(ai.Addrs) == 0 {
return
}
key := pid.String()
if !common.Indexers.ExistsAddr(key) {
adCopy := ai
common.Indexers.SetAddr(key, &adCopy)
common.Indexers.NudgeIt()
logger.Info().Str("peer", key).Msg("[bootstrap] no indexers — added inbound indexer peer as candidate")
}
}()
}
return resp
}
// Advertise this indexer in the DHT so nodes can discover it.
fillRateFn := func() float64 {
ix.StreamMU.RLock()
n := len(ix.StreamRecords[common.ProtocolHeartbeat])
ix.StreamMU.RUnlock()
maxN := ix.MaxNodesConn()
if maxN <= 0 {
return 0
}
rate := float64(n) / float64(maxN)
if rate > 1 {
rate = 1
}
return rate
}
ix.startDHTCacheRefresh()
ix.startDHTProvide(fillRateFn)
return ix
}
func (ix *IndexerService) Close() {
if ix.Native != nil && ix.Native.cancel != nil {
ix.Native.cancel()
}
// Explicitly deregister from natives on clean shutdown so they evict this
// indexer immediately rather than waiting for TTL expiry (~90 s).
if !ix.IsNative {
if nativeAddrs := conf.GetConfig().NativeIndexerAddresses; nativeAddrs != "" {
common.UnregisterFromNative(ix.Host, nativeAddrs)
// startDHTCacheRefresh runs a background loop that periodically queries the
// DHT for peer indexers and rebuilds ix.dhtCache. BuildHeartbeatResponse
// reads this passive cache to suggest better indexers to connected nodes
// without paying any per-request discovery cost.
func (ix *IndexerService) startDHTCacheRefresh() {
	ctx, cancel := context.WithCancel(context.Background())
	// Chain our cancel behind any previously stored one so Close() stops
	// both the provide loop and this refresher.
	prev := ix.dhtProvideCancel
	ix.dhtProvideCancel = func() {
		if prev != nil {
			prev()
		}
		cancel()
	}
	go func() {
		logger := oclib.GetLogger()
		rebuild := func() {
			if ix.DHT == nil {
				return
			}
			// Over-fetch so SelectByFillRate has room to filter for diversity.
			found := common.DiscoverIndexersFromDHT(ix.Host, ix.DHT, 30)
			if len(found) == 0 {
				return
			}
			// Drop ourselves before selection (in-place filter reuses the
			// backing array).
			self := ix.Host.ID()
			others := found[:0]
			for _, ai := range found {
				if ai.ID != self {
					others = append(others, ai)
				}
			}
			// Fill rates are unknown at this stage (nil map) so every peer
			// gets the neutral prior f=0.5 — subnet-diversity filtering in
			// SelectByFillRate still applies.
			picked := common.SelectByFillRate(others, nil, 10)
			stamp := time.Now()
			ix.dhtCacheMu.Lock()
			ix.dhtCache = ix.dhtCache[:0]
			for _, ai := range picked {
				ix.dhtCache = append(ix.dhtCache, dhtCacheEntry{AI: ai, LastSeen: stamp})
			}
			ix.dhtCacheMu.Unlock()
			logger.Info().Int("cached", len(picked)).Msg("[dht] indexer suggestion cache refreshed")
		}
		// Give the DHT routing table time to warm up before the first query.
		select {
		case <-ctx.Done():
			return
		case <-time.After(30 * time.Second):
		}
		rebuild()
		ticker := time.NewTicker(2 * time.Minute)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				rebuild()
			}
		}
	}()
}
// startDHTProvide bootstraps the DHT and starts a goroutine that periodically
// advertises this indexer under the well-known provider key.
//
// The stored cancel func is CHAINED with any previously registered one (e.g.
// the one installed by startDHTCacheRefresh, which runs just before this in
// NewIndexerService) instead of overwriting it. Overwriting would orphan the
// earlier goroutine: Close() would then have no way to stop it, leaking it
// for the lifetime of the process.
func (ix *IndexerService) startDHTProvide(fillRateFn func() float64) {
	ctx, cancel := context.WithCancel(context.Background())
	prevCancel := ix.dhtProvideCancel
	ix.dhtProvideCancel = func() {
		if prevCancel != nil {
			prevCancel()
		}
		cancel()
	}
	go func() {
		logger := oclib.GetLogger()
		// Wait (up to 12 × 5 s) until a routable (non-loopback) address is
		// available. Scan every listen addr — not just the last one — so any
		// routable address satisfies the check as soon as it appears.
		for i := 0; i < 12; i++ {
			routable := false
			for _, a := range ix.Host.Addrs() {
				if !strings.Contains(a.String(), "127.0.0.1") {
					routable = true
					break
				}
			}
			if routable {
				break
			}
			select {
			case <-ctx.Done():
				return
			case <-time.After(5 * time.Second):
			}
		}
		if err := ix.DHT.Bootstrap(ctx); err != nil {
			logger.Warn().Err(err).Msg("[dht] bootstrap failed")
		}
		// provide advertises this indexer once, bounded by a 30 s timeout so
		// a slow DHT cannot stall the ticker loop indefinitely.
		provide := func() {
			pCtx, pCancel := context.WithTimeout(ctx, 30*time.Second)
			defer pCancel()
			if err := ix.DHT.Provide(pCtx, common.IndexerCID(), true); err != nil {
				logger.Warn().Err(err).Msg("[dht] Provide failed")
			} else {
				logger.Info().Float64("fill_rate", fillRateFn()).Msg("[dht] indexer advertised in DHT")
			}
		}
		provide()
		t := time.NewTicker(common.RecommendedHeartbeatInterval)
		defer t.Stop()
		for {
			select {
			case <-t.C:
				provide()
			case <-ctx.Done():
				return
			}
		}
	}()
}
func (ix *IndexerService) Close() {
if ix.dhtProvideCancel != nil {
ix.dhtProvideCancel()
}
ix.DHT.Close()
ix.PS.UnregisterTopicValidator(common.TopicPubSubSearch)

View File

@@ -73,14 +73,12 @@ func ListenNATS(n *Node) {
if err != nil {
return
}
n.StreamService.Mu.Lock()
defer n.StreamService.Mu.Unlock()
if p.Relation == peer.PARTNER {
n.StreamService.ConnectToPartner(p.StreamAddress)
} else {
// Non-partner: close any existing streams for this peer.
if p.Relation != peer.PARTNER {
n.StreamService.Mu.Lock()
defer n.StreamService.Mu.Unlock()
ps := common.ProtocolStream{}
for p, s := range n.StreamService.Streams {
for proto, s := range n.StreamService.Streams {
m := map[pp.ID]*common.Stream{}
for k := range s {
if ad.ID != k {
@@ -89,7 +87,7 @@ func ListenNATS(n *Node) {
s[k].Stream.Close()
}
}
ps[p] = m
ps[proto] = m
}
n.StreamService.Streams = ps
}
@@ -167,7 +165,7 @@ func ListenNATS(n *Node) {
if m["peer_id"] == nil { // send to every active stream
n.StreamService.Mu.Lock()
if n.StreamService.Streams[stream.ProtocolSendPlanner] != nil {
for pid := range n.StreamService.Streams[stream.ProtocolSendPlanner] {
for pid := range n.StreamService.Streams[stream.ProtocolSendPlanner] { // send Planner can be long lived - it's a conn
n.StreamService.PublishCommon(nil, resp.User, pid.String(), stream.ProtocolSendPlanner, b)
}
}
@@ -192,18 +190,18 @@ func ListenNATS(n *Node) {
if propalgation.DataType == int(tools.PEER) {
m := map[string]interface{}{}
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
if peers, err := n.GetPeerRecord(context.Background(), fmt.Sprintf("%v", m["search"]), true); err == nil {
for _, p := range peers {
if b, err := json.Marshal(p); err == nil {
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.DataType(tools.PEER),
Method: int(tools.SEARCH_EVENT),
Payload: b,
})
}
needle := fmt.Sprintf("%v", m["search"])
userKey := resp.User
go n.SearchPeerRecord(userKey, needle, func(hit common.SearchHit) {
if b, err := json.Marshal(hit); err == nil {
tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.DataType(tools.PEER),
Method: int(tools.SEARCH_EVENT),
Payload: b,
})
}
}
})
}
} else {

View File

@@ -21,10 +21,18 @@ import (
"github.com/libp2p/go-libp2p"
pubsubs "github.com/libp2p/go-libp2p-pubsub"
"github.com/libp2p/go-libp2p/core/crypto"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
"github.com/libp2p/go-libp2p/p2p/security/noise"
)
// activeSearch tracks an in-flight distributed peer search for one user.
type activeSearch struct {
queryID string
cancel context.CancelFunc
}
type Node struct {
*common.LongLivedStreamRecordedService[interface{}] // change type of stream
PS *pubsubs.PubSub
@@ -35,10 +43,14 @@ type Node struct {
isIndexer bool
peerRecord *indexer.PeerRecord
// activeSearches: one streaming search per user; new search cancels previous.
activeSearchesMu sync.Mutex
activeSearches map[string]*activeSearch
Mu sync.RWMutex
}
func InitNode(isNode bool, isIndexer bool, isNativeIndexer bool) (*Node, error) {
func InitNode(isNode bool, isIndexer bool) (*Node, error) {
if !isNode && !isIndexer {
return nil, errors.New("wait... what ? your node need to at least something. Retry we can't be friend in that case")
}
@@ -54,13 +66,17 @@ func InitNode(isNode bool, isIndexer bool, isNativeIndexer bool) (*Node, error)
return nil, nil
}
logger.Info().Msg("open a host...")
gater := newOCConnectionGater(nil) // host set below after creation
h, err := libp2p.New(
libp2p.PrivateNetwork(psk),
libp2p.Identity(priv),
libp2p.Security(noise.ID, noise.New),
libp2p.ListenAddrStrings(
fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", conf.GetConfig().NodeEndpointPort),
),
libp2p.ConnectionGater(gater),
)
gater.host = h // wire host back into gater now that it exists
logger.Info().Msg("Host open on " + h.ID().String())
if err != nil {
return nil, errors.New("no host no node")
@@ -69,10 +85,15 @@ func InitNode(isNode bool, isIndexer bool, isNativeIndexer bool) (*Node, error)
PeerID: h.ID(),
isIndexer: isIndexer,
LongLivedStreamRecordedService: common.NewStreamRecordedService[interface{}](h, 1000),
activeSearches: map[string]*activeSearch{},
}
// Register the bandwidth probe handler so any peer measuring this node's
// throughput can open a dedicated probe stream and read the echo.
h.SetStreamHandler(common.ProtocolBandwidthProbe, common.HandleBandwidthProbe)
// Register the witness query handler so peers can ask this node's view of indexers.
h.SetStreamHandler(common.ProtocolWitnessQuery, func(s network.Stream) {
common.HandleWitnessQuery(h, s)
})
var ps *pubsubs.PubSub
if isNode {
logger.Info().Msg("generate opencloud node...")
@@ -104,7 +125,7 @@ func InitNode(isNode bool, isIndexer bool, isNativeIndexer bool) (*Node, error)
return json.RawMessage(b)
}
logger.Info().Msg("connect to indexers...")
common.ConnectToIndexers(node.Host, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer, node.PeerID, buildRecord)
common.ConnectToIndexers(node.Host, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer, buildRecord)
logger.Info().Msg("claims my node...")
if _, err := node.claimInfo(conf.GetConfig().Name, conf.GetConfig().Hostname); err != nil {
panic(err)
@@ -137,7 +158,7 @@ func InitNode(isNode bool, isIndexer bool, isNativeIndexer bool) (*Node, error)
}
if isIndexer {
logger.Info().Msg("generate opencloud indexer...")
node.IndexerService = indexer.NewIndexerService(node.Host, ps, 500, isNativeIndexer)
node.IndexerService = indexer.NewIndexerService(node.Host, ps, 500)
}
return node, nil
}
@@ -158,20 +179,14 @@ func (d *Node) publishPeerRecord(
if err != nil {
return err
}
common.StreamMuIndexes.RLock()
indexerSnapshot := make([]*pp.AddrInfo, 0, len(common.StaticIndexers))
for _, ad := range common.StaticIndexers {
indexerSnapshot = append(indexerSnapshot, ad)
}
common.StreamMuIndexes.RUnlock()
for _, ad := range indexerSnapshot {
for _, ad := range common.Indexers.GetAddrs() {
var err error
if common.StreamIndexers, err = common.TempStream(d.Host, *ad, common.ProtocolPublish, "", common.StreamIndexers, map[protocol.ID]*common.ProtocolInfo{},
&common.StreamMuIndexes); err != nil {
if common.Indexers.Streams, err = common.TempStream(d.Host, *ad.Info, common.ProtocolPublish, "", common.Indexers.Streams, map[protocol.ID]*common.ProtocolInfo{},
&common.Indexers.MuStream); err != nil {
continue
}
stream := common.StreamIndexers[common.ProtocolPublish][ad.ID]
stream := common.Indexers.Streams.GetPerID(common.ProtocolPublish, ad.Info.ID)
base := indexer.PeerRecordPayload{
Name: rec.Name,
DID: rec.DID,
@@ -188,6 +203,75 @@ func (d *Node) publishPeerRecord(
return nil
}
// SearchPeerRecord starts a distributed peer search via ProtocolSearchPeer.
// userKey identifies the requesting user — a new call for the same user
// cancels any previous search. Results are pushed to onResult as they arrive.
// The function returns when the search stream closes (idle timeout, indexer
// unreachable, or cancellation by a newer search for the same user).
func (d *Node) SearchPeerRecord(userKey, needle string, onResult func(common.SearchHit)) {
	logger := oclib.GetLogger()
	ctx, cancel := context.WithCancel(context.Background())
	d.activeSearchesMu.Lock()
	if prev, ok := d.activeSearches[userKey]; ok {
		prev.cancel()
	}
	queryID := uuid.New().String()
	d.activeSearches[userKey] = &activeSearch{queryID: queryID, cancel: cancel}
	d.activeSearchesMu.Unlock()
	defer func() {
		cancel()
		d.activeSearchesMu.Lock()
		// Only remove our own entry — a newer search may have replaced it.
		if cur, ok := d.activeSearches[userKey]; ok && cur.queryID == queryID {
			delete(d.activeSearches, userKey)
		}
		d.activeSearchesMu.Unlock()
	}()
	// Classify the needle: libp2p PeerID, UUID-shaped DID, or plain name.
	req := common.SearchPeerRequest{QueryID: queryID}
	if pid, err := pp.Decode(needle); err == nil {
		req.PeerID = pid.String()
	} else if _, err := uuid.Parse(needle); err == nil {
		req.DID = needle
	} else {
		req.Name = needle
	}
	// Try indexers in pool order until one accepts the stream.
	for _, ad := range common.Indexers.GetAddrs() {
		if ad.Info == nil {
			continue
		}
		dialCtx, dialCancel := context.WithTimeout(ctx, 5*time.Second)
		s, err := d.Host.NewStream(dialCtx, ad.Info.ID, common.ProtocolSearchPeer)
		dialCancel()
		if err != nil {
			continue
		}
		if err := json.NewEncoder(s).Encode(req); err != nil {
			s.Reset()
			continue
		}
		// The decode loop below blocks without observing ctx, so a newer
		// search cancelling this one would otherwise hang here until the
		// indexer closed the stream. Resetting the stream on ctx.Done()
		// makes Decode fail immediately. The deferred cancel() above
		// guarantees this goroutine always exits; a second Reset on an
		// already-reset stream is harmless.
		go func() {
			<-ctx.Done()
			s.Reset()
		}()
		dec := json.NewDecoder(s)
		for {
			var result common.SearchPeerResult
			if err := dec.Decode(&result); err != nil {
				break
			}
			if result.QueryID != queryID {
				continue // stale response from a previous query
			}
			for _, hit := range result.Records {
				onResult(hit)
			}
		}
		s.Reset()
		return
	}
	logger.Warn().Str("user", userKey).Msg("[search] no reachable indexer for peer search")
}
func (d *Node) GetPeerRecord(
ctx context.Context,
pidOrdid string,
@@ -195,13 +279,6 @@ func (d *Node) GetPeerRecord(
) ([]*peer.Peer, error) {
var err error
var info map[string]indexer.PeerRecord
common.StreamMuIndexes.RLock()
indexerSnapshot2 := make([]*pp.AddrInfo, 0, len(common.StaticIndexers))
for _, ad := range common.StaticIndexers {
indexerSnapshot2 = append(indexerSnapshot2, ad)
}
common.StreamMuIndexes.RUnlock()
// Build the GetValue request: if pidOrdid is neither a UUID DID nor a libp2p
// PeerID, treat it as a human-readable name and let the indexer resolve it.
getReq := indexer.GetValue{Key: pidOrdid}
@@ -213,12 +290,12 @@ func (d *Node) GetPeerRecord(
getReq.Key = ""
}
getReq.Search = search
for _, ad := range indexerSnapshot2 {
if common.StreamIndexers, err = common.TempStream(d.Host, *ad, common.ProtocolGet, "",
common.StreamIndexers, map[protocol.ID]*common.ProtocolInfo{}, &common.StreamMuIndexes); err != nil {
for _, ad := range common.Indexers.GetAddrs() {
if common.Indexers.Streams, err = common.TempStream(d.Host, *ad.Info, common.ProtocolGet, "",
common.Indexers.Streams, map[protocol.ID]*common.ProtocolInfo{}, &common.Indexers.MuStream); err != nil {
continue
}
stream := common.StreamIndexers[common.ProtocolGet][ad.ID]
stream := common.Indexers.Streams.GetPerID(common.ProtocolGet, ad.Info.ID)
if err := json.NewEncoder(stream.Stream).Encode(getReq); err != nil {
continue
}

View File

@@ -94,27 +94,21 @@ func (abs *StreamService) sendPlanner(event *common.Event) error { //
return err
}
} else {
m := map[string]interface{}{}
if err := json.Unmarshal(event.Payload, &m); err == nil {
m["peer_id"] = event.From
if pl, err := json.Marshal(m); err == nil {
if b, err := json.Marshal(tools.PropalgationMessage{
DataType: -1,
Action: tools.PB_PLANNER,
Payload: pl,
}); err == nil {
go tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.DataType(oclib.BOOKING),
Method: int(tools.PROPALGATION_EVENT),
Payload: b,
})
}
}
return err
}
} else { // if not empty so it's
m := map[string]interface{}{}
if err := json.Unmarshal(event.Payload, &m); err == nil {
m["peer_id"] = event.From
if pl, err := json.Marshal(m); err == nil {
go tools.NewNATSCaller().SetNATSPub(tools.PLANNER_EXECUTION, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.DataType(oclib.BOOKING),
Method: int(tools.PLANNER_EXECUTION),
Payload: pl,
})
}
}
} else {
}
return nil
}

View File

@@ -70,8 +70,7 @@ func (ps *StreamService) ToPartnerPublishEvent(
if err := json.Unmarshal(payload, &p); err != nil {
return err
}
pid, err := pp.Decode(p.PeerID)
if err != nil {
if _, err := pp.Decode(p.PeerID); err != nil {
return err
}
@@ -86,19 +85,7 @@ func (ps *StreamService) ToPartnerPublishEvent(
if _, err := ps.PublishCommon(dt, user, p.PeerID, ProtocolUpdateResource, b2); err != nil {
return err
}
if p.Relation == peer.PARTNER {
if ps.Streams[ProtocolHeartbeatPartner] == nil {
ps.Streams[ProtocolHeartbeatPartner] = map[pp.ID]*common.Stream{}
}
fmt.Println("SHOULD CONNECT")
ps.ConnectToPartner(p.StreamAddress)
} else if ps.Streams[ProtocolHeartbeatPartner] != nil && ps.Streams[ProtocolHeartbeatPartner][pid] != nil {
for _, pids := range ps.Streams {
if pids[pid] != nil {
delete(pids, pid)
}
}
}
}
}
return nil

View File

@@ -4,7 +4,6 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"oc-discovery/conf"
"oc-discovery/daemons/node/common"
@@ -21,7 +20,6 @@ import (
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
ma "github.com/multiformats/go-multiaddr"
)
const ProtocolConsidersResource = "/opencloud/resource/considers/1.0"
@@ -56,10 +54,9 @@ type StreamService struct {
Key pp.ID
Host host.Host
Node common.DiscoveryPeer
Streams common.ProtocolStream
Streams common.ProtocolStream
maxNodesConn int
Mu sync.RWMutex
// Stream map[protocol.ID]map[pp.ID]*daemons.Stream
}
func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
@@ -68,11 +65,9 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
Key: key,
Node: node,
Host: h,
Streams: common.ProtocolStream{},
Streams: common.ProtocolStream{},
maxNodesConn: maxNode,
}
logger.Info().Msg("handle to partner heartbeat protocol...")
service.Host.SetStreamHandler(ProtocolHeartbeatPartner, service.HandlePartnerHeartbeat)
for proto := range protocols {
service.Host.SetStreamHandler(proto, service.HandleResponse)
}
@@ -106,39 +101,11 @@ func (s *StreamService) HandleResponse(stream network.Stream) {
stream.Protocol(), protocols[stream.Protocol()])
}
// HandlePartnerHeartbeat processes one inbound partner heartbeat stream:
// it validates the heartbeat, refreshes the expiry of an already-known
// partner, and dials back an unknown sender via ConnectToPartner.
func (s *StreamService) HandlePartnerHeartbeat(stream network.Stream) {
	// Snapshot the partner stream map under the lock, then RELEASE it before
	// calling CheckHeartbeat — CheckHeartbeat receives &s.Mu itself, so
	// holding the lock across the call could deadlock.
	s.Mu.Lock()
	if s.Streams[ProtocolHeartbeatPartner] == nil {
		s.Streams[ProtocolHeartbeatPartner] = map[pp.ID]*common.Stream{}
	}
	streams := s.Streams[ProtocolHeartbeatPartner]
	// Copy into the anonymized view type CheckHeartbeat expects.
	streamsAnonym := map[pp.ID]common.HeartBeatStreamed{}
	for k, v := range streams {
		streamsAnonym[k] = v
	}
	s.Mu.Unlock()
	pid, hb, err := common.CheckHeartbeat(s.Host, stream, json.NewDecoder(stream), streamsAnonym, &s.Mu, s.maxNodesConn)
	if err != nil {
		return
	}
	s.Mu.Lock()
	defer s.Mu.Unlock()
	// if record already seen update last seen
	if rec, ok := streams[*pid]; ok {
		// Known partner: refresh DID and push the expiry 10 s into the
		// future; the gc loop evicts entries whose Expiry has passed.
		rec.DID = hb.DID
		rec.Expiry = time.Now().UTC().Add(10 * time.Second)
	} else { // if not in stream ?
		// Unknown sender: dial back so the partnership becomes mutual.
		// NOTE(review): ValueForProtocol(ma.P_IP4) yields only the bare
		// IPv4 component, while ConnectToPartner parses the string with
		// pp.AddrInfoFromString, which expects a full multiaddr — confirm
		// this address form is actually accepted.
		val, err := stream.Conn().RemoteMultiaddr().ValueForProtocol(ma.P_IP4)
		if err == nil {
			s.ConnectToPartner(val)
		}
	}
	// GC is already running via InitStream — starting a new ticker goroutine on
	// every heartbeat would leak an unbounded number of goroutines.
}
func (s *StreamService) connectToPartners() error {
logger := oclib.GetLogger()
// Register handlers for partner resource protocols (create/update/delete).
// Connections to partners happen on-demand via TempStream when needed.
for proto, info := range protocolsPartners {
f := func(ss network.Stream) {
if s.Streams[proto] == nil {
@@ -153,25 +120,9 @@ func (s *StreamService) connectToPartners() error {
logger.Info().Msg("SetStreamHandler " + string(proto))
s.Host.SetStreamHandler(proto, f)
}
peers, err := s.searchPeer(fmt.Sprintf("%v", peer.PARTNER.EnumIndex()))
if err != nil {
logger.Err(err)
return err
}
for _, p := range peers {
s.ConnectToPartner(p.StreamAddress)
}
return nil
}
// ConnectToPartner opens (or refreshes) a partner heartbeat stream to the
// peer at the given multiaddr string. Addresses that fail to parse are
// silently ignored.
func (s *StreamService) ConnectToPartner(address string) {
	logger := oclib.GetLogger()
	ad, err := pp.AddrInfoFromString(address)
	if err != nil {
		return
	}
	logger.Info().Msg("Connect to Partner " + ProtocolHeartbeatPartner + " " + address)
	common.SendHeartbeat(context.Background(), ProtocolHeartbeatPartner, conf.GetConfig().Name,
		s.Host, s.Streams, map[string]*pp.AddrInfo{address: ad}, nil, 20*time.Second)
}
func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
ps := []*peer.Peer{}
@@ -220,11 +171,7 @@ func (s *StreamService) gc() {
defer s.Mu.Unlock()
now := time.Now().UTC()
if s.Streams[ProtocolHeartbeatPartner] == nil {
s.Streams[ProtocolHeartbeatPartner] = map[pp.ID]*common.Stream{}
}
streams := s.Streams[ProtocolHeartbeatPartner]
for pid, rec := range streams {
for pid, rec := range s.Streams[ProtocolHeartbeatPartner] {
if now.After(rec.Expiry) {
for _, sstreams := range s.Streams {
if sstreams[pid] != nil {