package common
|
||
|
|
|
||
|
|
import (
|
||
|
|
"context"
|
||
|
|
"encoding/json"
|
||
|
|
"sort"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
oclib "cloud.o-forge.io/core/oc-lib"
|
||
|
|
"github.com/libp2p/go-libp2p/core/host"
|
||
|
|
"github.com/libp2p/go-libp2p/core/network"
|
||
|
|
pp "github.com/libp2p/go-libp2p/core/peer"
|
||
|
|
)
|
||
|
|
|
||
|
|
// ProtocolIndexerCandidates is opened by a node toward its remaining indexers
// to request candidate replacement indexers after an ejection event.
// Each stream carries one JSON-encoded IndexerCandidatesRequest followed by
// one IndexerCandidatesResponse.
const ProtocolIndexerCandidates = "/opencloud/indexer/candidates/1.0"
|
||
|
|
|
||
|
|
// IndexerCandidatesRequest is sent by a node to one of its indexers.
// Count is how many candidates are needed.
type IndexerCandidatesRequest struct {
	// Count is the number of candidate indexers the requester wants back.
	// Responders may return fewer if their cache is small.
	Count int `json:"count"`
}
|
||
|
|
|
||
|
|
// IndexerCandidatesResponse carries a random sample of known indexers from
// the responding indexer's DHT cache.
type IndexerCandidatesResponse struct {
	// Candidates are addressed peers the responder knows of; they may overlap
	// with the requester's existing pool, so callers must deduplicate.
	Candidates []pp.AddrInfo `json:"candidates"`
}
|
||
|
|
|
||
|
|
// TriggerConsensus asks each remaining indexer for a random pool of candidates,
|
||
|
|
// scores them asynchronously via a one-shot probe heartbeat, and admits the
|
||
|
|
// best ones to StaticIndexers. Falls back to DHT replenishment for any gap.
|
||
|
|
//
|
||
|
|
// Must be called in a goroutine — it blocks until all probes have returned
|
||
|
|
// (or timed out), which can take up to ~10s.
|
||
|
|
func TriggerConsensus(h host.Host, remaining []pp.AddrInfo, need int) {
|
||
|
|
if need <= 0 || len(remaining) == 0 {
|
||
|
|
return
|
||
|
|
}
|
||
|
|
logger := oclib.GetLogger()
|
||
|
|
logger.Info().Int("voters", len(remaining)).Int("need", need).
|
||
|
|
Msg("[consensus] starting indexer candidate consensus")
|
||
|
|
|
||
|
|
// Phase 1 — collect candidates from all remaining indexers in parallel.
|
||
|
|
type collectResult struct{ candidates []pp.AddrInfo }
|
||
|
|
collectCh := make(chan collectResult, len(remaining))
|
||
|
|
for _, ai := range remaining {
|
||
|
|
go func(ai pp.AddrInfo) {
|
||
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||
|
|
defer cancel()
|
||
|
|
s, err := h.NewStream(ctx, ai.ID, ProtocolIndexerCandidates)
|
||
|
|
if err != nil {
|
||
|
|
collectCh <- collectResult{}
|
||
|
|
return
|
||
|
|
}
|
||
|
|
defer s.Close()
|
||
|
|
s.SetDeadline(time.Now().Add(5 * time.Second))
|
||
|
|
if err := json.NewEncoder(s).Encode(IndexerCandidatesRequest{Count: need + 2}); err != nil {
|
||
|
|
collectCh <- collectResult{}
|
||
|
|
return
|
||
|
|
}
|
||
|
|
var resp IndexerCandidatesResponse
|
||
|
|
if err := json.NewDecoder(s).Decode(&resp); err != nil {
|
||
|
|
collectCh <- collectResult{}
|
||
|
|
return
|
||
|
|
}
|
||
|
|
collectCh <- collectResult{candidates: resp.Candidates}
|
||
|
|
}(ai)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Merge and deduplicate, excluding indexers already in the pool.
|
||
|
|
seen := map[pp.ID]struct{}{}
|
||
|
|
for _, ai := range Indexers.GetAddrIDs() {
|
||
|
|
seen[ai] = struct{}{}
|
||
|
|
|
||
|
|
}
|
||
|
|
var candidates []pp.AddrInfo
|
||
|
|
for range remaining {
|
||
|
|
r := <-collectCh
|
||
|
|
for _, ai := range r.candidates {
|
||
|
|
if _, dup := seen[ai.ID]; !dup {
|
||
|
|
seen[ai.ID] = struct{}{}
|
||
|
|
candidates = append(candidates, ai)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
if len(candidates) == 0 {
|
||
|
|
logger.Info().Msg("[consensus] no candidates from voters, falling back to DHT")
|
||
|
|
replenishIndexersFromDHT(h, need)
|
||
|
|
return
|
||
|
|
}
|
||
|
|
logger.Info().Int("candidates", len(candidates)).Msg("[consensus] scoring candidates")
|
||
|
|
|
||
|
|
// Phase 2 — score all candidates in parallel via a one-shot probe heartbeat.
|
||
|
|
type scoreResult struct {
|
||
|
|
ai pp.AddrInfo
|
||
|
|
score float64
|
||
|
|
}
|
||
|
|
scoreCh := make(chan scoreResult, len(candidates))
|
||
|
|
for _, ai := range candidates {
|
||
|
|
go func(ai pp.AddrInfo) {
|
||
|
|
resp, rtt, err := probeIndexer(h, ai)
|
||
|
|
if err != nil {
|
||
|
|
scoreCh <- scoreResult{ai: ai, score: 0}
|
||
|
|
return
|
||
|
|
}
|
||
|
|
scoreCh <- scoreResult{ai: ai, score: quickScore(resp, rtt)}
|
||
|
|
}(ai)
|
||
|
|
}
|
||
|
|
|
||
|
|
results := make([]scoreResult, 0, len(candidates))
|
||
|
|
for range candidates {
|
||
|
|
results = append(results, <-scoreCh)
|
||
|
|
}
|
||
|
|
|
||
|
|
// Sort descending by quick score, admit top `need` above the minimum bar.
|
||
|
|
sort.Slice(results, func(i, j int) bool { return results[i].score > results[j].score })
|
||
|
|
minQ := dynamicMinScore(0) // fresh peer: threshold starts at 20
|
||
|
|
|
||
|
|
admitted := 0
|
||
|
|
for _, res := range results {
|
||
|
|
if admitted >= need {
|
||
|
|
break
|
||
|
|
}
|
||
|
|
if res.score < minQ {
|
||
|
|
break // sorted desc: everything after is worse
|
||
|
|
}
|
||
|
|
key := addrKey(res.ai)
|
||
|
|
if Indexers.ExistsAddr(key) {
|
||
|
|
continue // already in pool (race with heartbeat path)
|
||
|
|
}
|
||
|
|
cpy := res.ai
|
||
|
|
Indexers.SetAddr(key, &cpy)
|
||
|
|
admitted++
|
||
|
|
}
|
||
|
|
|
||
|
|
if admitted > 0 {
|
||
|
|
logger.Info().Int("admitted", admitted).Msg("[consensus] candidates admitted to pool")
|
||
|
|
Indexers.NudgeIt()
|
||
|
|
}
|
||
|
|
|
||
|
|
// Fill any remaining gap with DHT discovery.
|
||
|
|
if gap := need - admitted; gap > 0 {
|
||
|
|
logger.Info().Int("gap", gap).Msg("[consensus] gap after consensus, falling back to DHT")
|
||
|
|
replenishIndexersFromDHT(h, gap)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// probeIndexer dials the candidate, sends one lightweight heartbeat, and
|
||
|
|
// returns the HeartbeatResponse (nil if the indexer doesn't support it) and RTT.
|
||
|
|
func probeIndexer(h host.Host, ai pp.AddrInfo) (*HeartbeatResponse, time.Duration, error) {
|
||
|
|
ctx, cancel := context.WithTimeout(context.Background(), 8*time.Second)
|
||
|
|
defer cancel()
|
||
|
|
if h.Network().Connectedness(ai.ID) != network.Connected {
|
||
|
|
if err := h.Connect(ctx, ai); err != nil {
|
||
|
|
return nil, 0, err
|
||
|
|
}
|
||
|
|
}
|
||
|
|
s, err := h.NewStream(ctx, ai.ID, ProtocolHeartbeat)
|
||
|
|
if err != nil {
|
||
|
|
return nil, 0, err
|
||
|
|
}
|
||
|
|
defer s.Close()
|
||
|
|
|
||
|
|
hb := Heartbeat{PeerID: h.ID().String(), Timestamp: time.Now().UTC().Unix()}
|
||
|
|
s.SetWriteDeadline(time.Now().Add(3 * time.Second))
|
||
|
|
if err := json.NewEncoder(s).Encode(hb); err != nil {
|
||
|
|
return nil, 0, err
|
||
|
|
}
|
||
|
|
s.SetWriteDeadline(time.Time{})
|
||
|
|
|
||
|
|
sentAt := time.Now()
|
||
|
|
s.SetReadDeadline(time.Now().Add(5 * time.Second))
|
||
|
|
var resp HeartbeatResponse
|
||
|
|
if err := json.NewDecoder(s).Decode(&resp); err != nil {
|
||
|
|
// Indexer connected but no response: connection itself is the signal.
|
||
|
|
return nil, time.Since(sentAt), nil
|
||
|
|
}
|
||
|
|
return &resp, time.Since(sentAt), nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// quickScore computes a lightweight score [0,100] from a probe result.
|
||
|
|
// Uses only fill rate (inverse) and latency — the two signals available
|
||
|
|
// without a full heartbeat history.
|
||
|
|
func quickScore(resp *HeartbeatResponse, rtt time.Duration) float64 {
|
||
|
|
maxRTT := BaseRoundTrip * 10
|
||
|
|
latencyScore := 1.0 - float64(rtt)/float64(maxRTT)
|
||
|
|
if latencyScore < 0 {
|
||
|
|
latencyScore = 0
|
||
|
|
}
|
||
|
|
if resp == nil {
|
||
|
|
// Connection worked but no response (old indexer): moderate score.
|
||
|
|
return latencyScore * 50
|
||
|
|
}
|
||
|
|
fillScore := 1.0 - resp.FillRate // prefer less-loaded indexers
|
||
|
|
return (0.5*latencyScore + 0.5*fillScore) * 100
|
||
|
|
}
|