Discovery Nano, the light version.
daemons/node/stream/dnt_cache.go (new file, 362 lines)
@@ -0,0 +1,362 @@
package stream

// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - dntCritical : retry indefinitely (create / update / delete resource).
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.

import (
    "crypto/aes"
    "crypto/cipher"
    "crypto/rand"
    "crypto/sha256"
    "encoding/json"
    "io"
    "os"
    "path/filepath"
    "sync"
    "time"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/tools"
    "golang.org/x/crypto/hkdf"

    "oc-discovery/conf"

    pp "github.com/libp2p/go-libp2p/core/peer"
    "github.com/libp2p/go-libp2p/core/protocol"
)

type dntLevel int

const (
    dntCritical dntLevel = iota // retry until the message is delivered
    dntModerate                 // retry up to dntMaxModerateRetries times
)

const dntMaxModerateRetries = 3
const dntRetryInterval = 15 * time.Second

// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var dntProtocols = map[protocol.ID]dntLevel{
    // Critical — data mutations that must eventually be delivered.
    ProtocolCreateResource: dntCritical,
    ProtocolUpdateResource: dntCritical,
    ProtocolDeleteResource: dntCritical,
    // Moderate — confirmations / config / planner: 3 retries before abandon.
    ProtocolVerifyResource:          dntModerate,
    ProtocolSendPlanner:             dntModerate,
    ProtocolConsidersResource:       dntModerate,
    ProtocolMinioConfigResource:     dntModerate,
    ProtocolAdmiraltyConfigResource: dntModerate,
}
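
// dntLevelFor is an illustrative helper (not part of this commit): it shows the
// lookup contract of dntProtocols. A protocol absent from the map, such as
// ProtocolSearchResource, reports ok == false and is never cached on failure.
func dntLevelFor(proto protocol.ID) (dntLevel, bool) {
    level, ok := dntProtocols[proto]
    return level, ok
}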

// dntEntryJSON is the on-disk representation of a dntEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
type dntEntryJSON struct {
    DID     string          `json:"did"`
    Addr    pp.AddrInfo     `json:"addr"`
    DT      *tools.DataType `json:"dt,omitempty"`
    User    string          `json:"user"`
    Payload []byte          `json:"payload"`
    Proto   protocol.ID     `json:"proto"`
    Retries int             `json:"retries"`
    AddedAt time.Time       `json:"added_at"`
}

type dntEntry struct {
    did     string
    addr    pp.AddrInfo
    dt      *tools.DataType
    user    string
    payload []byte
    proto   protocol.ID
    retries int
    addedAt time.Time
}

func (e *dntEntry) toJSON() dntEntryJSON {
    return dntEntryJSON{
        DID:     e.did,
        Addr:    e.addr,
        DT:      e.dt,
        User:    e.user,
        Payload: e.payload,
        Proto:   e.proto,
        Retries: e.retries,
        AddedAt: e.addedAt,
    }
}

func entryFromJSON(j dntEntryJSON) *dntEntry {
    return &dntEntry{
        did:     j.DID,
        addr:    j.Addr,
        dt:      j.DT,
        user:    j.User,
        payload: j.Payload,
        proto:   j.Proto,
        retries: j.Retries,
        addedAt: j.AddedAt,
    }
}

type dntCache struct {
    mu      sync.Mutex
    entries []*dntEntry
    // aesKey is the derived AES-256 key used for on-disk encryption.
    // Nil when key derivation failed: persistence is disabled but the in-memory
    // cache continues to function normally.
    aesKey []byte
}

// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
func newDNTCache() *dntCache {
    log := oclib.GetLogger()
    c := &dntCache{}
    key, err := deriveDNTKey()
    if err != nil {
        log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
    } else {
        c.aesKey = key
        c.loadFromDisk()
    }
    return c
}

// enqueue adds an entry to the cache and persists critical entries to disk.
func (c *dntCache) enqueue(e *dntEntry) {
    c.mu.Lock()
    c.entries = append(c.entries, e)
    c.mu.Unlock()
    if dntProtocols[e.proto] == dntCritical {
        go c.persistToDisk()
    }
}

// drain atomically removes and returns all current entries.
func (c *dntCache) drain() []*dntEntry {
    c.mu.Lock()
    defer c.mu.Unlock()
    out := c.entries
    c.entries = nil
    return out
}

// requeue puts entries back at the head of the list, preserving any new
// entries added while the retry loop was running.
func (c *dntCache) requeue(entries []*dntEntry) {
    if len(entries) == 0 {
        return
    }
    c.mu.Lock()
    defer c.mu.Unlock()
    c.entries = append(entries, c.entries...)
}

// ── Persistence ──────────────────────────────────────────────────────────────

// dntCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func dntCachePath() string {
    return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
}

// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
    priv, err := tools.LoadKeyFromFilePrivate()
    if err != nil {
        return nil, err
    }
    // Raw() on a libp2p Ed25519 private key returns the 64-byte representation
    // (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
    raw, err := priv.Raw()
    if err != nil {
        return nil, err
    }
    reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
    key := make([]byte, 32)
    if _, err := io.ReadFull(reader, key); err != nil {
        return nil, err
    }
    return key, nil
}
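
// dntKeyIsStable is an illustrative check (not part of this commit): HKDF-SHA256
// is deterministic, so deriving the key twice from the same private key file
// yields the same 32 bytes. That is what lets loadFromDisk decrypt a cache file
// written before a crash without storing any extra secret.
func dntKeyIsStable() (bool, error) {
    k1, err := deriveDNTKey()
    if err != nil {
        return false, err
    }
    k2, err := deriveDNTKey()
    if err != nil {
        return false, err
    }
    return string(k1) == string(k2), nil // expected: true
}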

// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
func (c *dntCache) persistToDisk() {
    if c.aesKey == nil {
        return
    }
    log := oclib.GetLogger()
    c.mu.Lock()
    var toSave []dntEntryJSON
    for _, e := range c.entries {
        if dntProtocols[e.proto] == dntCritical {
            toSave = append(toSave, e.toJSON())
        }
    }
    c.mu.Unlock()

    plaintext, err := json.Marshal(toSave)
    if err != nil {
        return
    }

    block, err := aes.NewCipher(c.aesKey)
    if err != nil {
        return
    }
    gcm, err := cipher.NewGCM(block)
    if err != nil {
        return
    }
    nonce := make([]byte, gcm.NonceSize())
    if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
        return
    }
    ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)

    path := dntCachePath()
    tmp := path + ".tmp"
    if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
        log.Warn().Err(err).Msg("[dnt] failed to write cache file")
        return
    }
    if err := os.Rename(tmp, path); err != nil {
        log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
        _ = os.Remove(tmp)
    }
}

// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe.
func (c *dntCache) loadFromDisk() {
    if c.aesKey == nil {
        return
    }
    log := oclib.GetLogger()
    path := dntCachePath()
    data, err := os.ReadFile(path)
    if err != nil {
        if !os.IsNotExist(err) {
            log.Warn().Err(err).Msg("[dnt] failed to read cache file")
        }
        return
    }

    block, err := aes.NewCipher(c.aesKey)
    if err != nil {
        return
    }
    gcm, err := cipher.NewGCM(block)
    if err != nil {
        return
    }
    if len(data) < gcm.NonceSize() {
        log.Warn().Msg("[dnt] cache file too short, ignoring")
        return
    }
    nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
    plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
    if err != nil {
        log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
        return
    }

    var saved []dntEntryJSON
    if err := json.Unmarshal(plaintext, &saved); err != nil {
        log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
        return
    }

    count := 0
    for _, j := range saved {
        // Only restore critical entries — moderate entries are intentionally
        // not persisted, but this guard defends against format changes.
        if dntProtocols[j.Proto] != dntCritical {
            continue
        }
        c.entries = append(c.entries, entryFromJSON(j))
        count++
    }
    if count > 0 {
        log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
    }
}

// ── Retry loop ────────────────────────────────────────────────────────────────

// startDNTLoop runs the background retry goroutine. Call once after init.
func (s *StreamService) startDNTLoop() {
    logger := oclib.GetLogger()
    ticker := time.NewTicker(dntRetryInterval)
    defer ticker.Stop()
    for range ticker.C {
        entries := s.dnt.drain()
        if len(entries) == 0 {
            continue
        }
        var keep []*dntEntry
        for _, e := range entries {
            _, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
            if err == nil {
                level := dntProtocols[e.proto]
                if level == dntCritical {
                    logger.Info().
                        Str("proto", string(e.proto)).
                        Str("peer", e.did).
                        Msg("[dnt] critical message delivered after retry")
                } else {
                    logger.Info().
                        Str("proto", string(e.proto)).
                        Str("peer", e.did).
                        Int("retries", e.retries).
                        Msg("[dnt] moderate message delivered after retry")
                }
                continue
            }
            level := dntProtocols[e.proto]
            switch level {
            case dntCritical:
                keep = append(keep, e)
            case dntModerate:
                e.retries++
                if e.retries < dntMaxModerateRetries {
                    keep = append(keep, e)
                } else {
                    logger.Warn().
                        Str("proto", string(e.proto)).
                        Str("peer", e.did).
                        Int("retries", e.retries).
                        Msg("[dnt] moderate message abandoned after max retries")
                }
            }
        }
        s.dnt.requeue(keep)
        // Persist after each tick so the on-disk file reflects the current
        // state (entries delivered are removed, new ones from concurrent
        // enqueues are included).
        go s.dnt.persistToDisk()
    }
}
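
The caller side of this cache appears further down in this commit (PublishCommon): a failed write is enqueued only when its protocol has a DNT level, then retried by startDNTLoop on the next tick. A minimal sketch of that contract, assuming the write signature used later in the diff:

if _, err := ps.write(toPeerID, ad, dt, user, resource, proto); err != nil {
    if _, ok := dntProtocols[proto]; ok { // pubsub/search protocols are absent from the map, so never cached
        ps.dnt.enqueue(&dntEntry{
            did:     toPeerID,
            addr:    *ad,
            dt:      dt,
            user:    user,
            payload: resource,
            proto:   proto,
            addedAt: time.Now().UTC(),
        })
    }
}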

@@ -14,14 +14,23 @@ import (
    "cloud.o-forge.io/core/oc-lib/models/peer"
    "cloud.o-forge.io/core/oc-lib/models/resources"
    "cloud.o-forge.io/core/oc-lib/tools"
    "github.com/libp2p/go-libp2p/core/network"
)

type Verify struct {
    IsVerified bool `json:"is_verified"`
}

func (ps *StreamService) handleEvent(protocol string, evt *common.Event) error {
    fmt.Println("handleEvent")
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
    fmt.Println("handleEvent", protocol)
    // Heartbeat received on an outgoing ProtocolObserve stream.
    if protocol == ProtocolObserve {
        return ps.handleIncomingObserve(s)
    }
    if protocol == observeHBEventType {
        return ps.handleObserveHeartbeat(evt)
    }

    ps.handleEventFromPartner(evt, protocol)
    /*if protocol == ProtocolVerifyResource {
        if evt.DataType == -1 {

@@ -159,7 +168,7 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
            And: map[string][]dbs.Filter{
                "peer_id": {{Operator: dbs.EQUAL.String(), Value: evt.From}},
            },
        }, evt.From, false)
        }, evt.From, false, 0, 1)
        if len(peers.Data) > 0 {
            p := peers.Data[0].(*peer.Peer)
            ps.SendResponse(p, evt, fmt.Sprintf("%v", search))

@@ -212,7 +221,7 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
    } else {
        for _, dt := range dts {
            access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
            searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false)
            searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
            for _, ss := range searched.Data {
                if j, err := json.Marshal(ss); err == nil {
                    abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)

daemons/node/stream/observe.go (new file, 552 lines)
@@ -0,0 +1,552 @@
package stream

import (
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "sync"
    "time"

    "oc-discovery/daemons/node/common"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/dbs"
    "cloud.o-forge.io/core/oc-lib/models/peer"
    "cloud.o-forge.io/core/oc-lib/tools"
    "github.com/libp2p/go-libp2p/core/network"
    pp "github.com/libp2p/go-libp2p/core/peer"
)

// ProtocolObserve is the libp2p protocol for peer connectivity observation.
// The requesting oc-discovery opens a stream to the remote oc-discovery and
// sends an ObserveRequest. The remote side keeps the stream open and writes
// ObserveHeartbeat events back every observeHBInterval seconds.
const ProtocolObserve = "/opencloud/peer/observe/1.0"

// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"

const observeHBInterval = 30 * time.Second
const observeDrainDuration = 30 * time.Second

// observeBatchWindow is the accumulation window before a heartbeat batch is
// flushed to NATS. All peer heartbeats received within this window are grouped
// into a single PEER_OBSERVE_RESPONSE_EVENT, reducing NATS traffic.
const observeBatchWindow = 2 * time.Second

// ObserveRequest is the first (and only) message sent by the observing side
// when opening a ProtocolObserve stream.
type ObserveRequest struct {
    // Close, when true, asks the remote side to stop the heartbeat goroutine
    // and remove the observer from its cache. Used for graceful teardown.
    Close bool `json:"close,omitempty"`
}

// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
    State string `json:"state"` // always "online" when actively emitted
}

// ShallowPeer is the minimal peer representation sent by oc-peer in a
// PEER_OBSERVE_EVENT. StreamAddress lets oc-discovery connect without a DB
// lookup; Address carries the NATSAddress (unused here, forwarded as-is).
type ShallowPeer struct {
    ID            string `json:"id"`
    PeerID        string `json:"peer_id"`
    Address       string `json:"address"`
    StreamAddress string `json:"stream_address"`
}

// ObserveCommand is the payload carried by a PEER_OBSERVE_EVENT NATS message
// (from oc-peer).
//
// Observe  → User + Peers populated
// Close    → User + PeerIDs + Close=true
// CloseAll → CloseAll=true (User optional)
type ObserveCommand struct {
    User     string        `json:"user"`
    Peers    []ShallowPeer `json:"peers,omitempty"`
    PeerIDs  []string      `json:"peer_ids,omitempty"`
    Close    bool          `json:"close,omitempty"`
    CloseAll bool          `json:"close_all,omitempty"`
}
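
// exampleObserveCommands is an illustrative sketch (not part of this commit):
// the three shapes a PEER_OBSERVE_EVENT payload can take, per the fields above.
// Peer IDs and addresses are placeholders.
func exampleObserveCommands() ([][]byte, error) {
    cmds := []ObserveCommand{
        {User: "u1", Peers: []ShallowPeer{{ID: "db-id", PeerID: "12D3Koo...", StreamAddress: "/ip4/10.0.0.2/tcp/4021/p2p/12D3Koo..."}}}, // start observing
        {User: "u1", PeerIDs: []string{"12D3Koo..."}, Close: true},                                                                      // stop observing
        {CloseAll: true},                                                                                                               // tear everything down
    }
    out := make([][]byte, 0, len(cmds))
    for _, c := range cmds {
        b, err := json.Marshal(c)
        if err != nil {
            return nil, err
        }
        out = append(out, b)
    }
    return out, nil
}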

// ── observe cache (observed side) ────────────────────────────────────────────

// observeCache tracks running heartbeat goroutines keyed by the observing
// peer's libp2p PeerID string. It is used exclusively on the OBSERVED side.
type observeCache struct {
    mu      sync.Mutex
    cancels map[string]context.CancelFunc
}

func newObserveCache() *observeCache {
    return &observeCache{cancels: map[string]context.CancelFunc{}}
}

func (c *observeCache) set(pid string, cancel context.CancelFunc) {
    c.mu.Lock()
    defer c.mu.Unlock()
    if old, ok := c.cancels[pid]; ok {
        old() // cancel previous goroutine if any
    }
    c.cancels[pid] = cancel
}

func (c *observeCache) cancel(pid string) {
    c.mu.Lock()
    defer c.mu.Unlock()
    if fn, ok := c.cancels[pid]; ok {
        fn()
        delete(c.cancels, pid)
    }
}

func (c *observeCache) cancelAll() {
    c.mu.Lock()
    defer c.mu.Unlock()
    for _, fn := range c.cancels {
        fn()
    }
    c.cancels = map[string]context.CancelFunc{}
}

func (c *observeCache) delete(pid string) {
    c.mu.Lock()
    defer c.mu.Unlock()
    delete(c.cancels, pid)
}

// ── heartbeat batcher (observing side) ───────────────────────────────────────

// heartbeatBatcher accumulates peer_ids from incoming heartbeats over
// observeBatchWindow, then flushes them in a single NATS call.
// Using a map as the backing store deduplicates multiple heartbeats from the
// same peer within the same window (should not happen, but is harmless).
type heartbeatBatcher struct {
    mu    sync.Mutex
    ids   map[string]struct{}
    timer *time.Timer
    flush func(peerIDs []string)
}

func newHeartbeatBatcher(flush func([]string)) *heartbeatBatcher {
    return &heartbeatBatcher{
        ids:   make(map[string]struct{}),
        flush: flush,
    }
}

// add records peerID in the current batch and arms the flush timer if needed.
func (b *heartbeatBatcher) add(peerID string) {
    b.mu.Lock()
    defer b.mu.Unlock()
    b.ids[peerID] = struct{}{}
    if b.timer == nil {
        b.timer = time.AfterFunc(observeBatchWindow, b.fire)
    }
}

// fire is called by the timer; it drains the batch and invokes flush.
func (b *heartbeatBatcher) fire() {
    b.mu.Lock()
    ids := make([]string, 0, len(b.ids))
    for id := range b.ids {
        ids = append(ids, id)
    }
    b.ids = make(map[string]struct{})
    b.timer = nil
    b.mu.Unlock()
    if len(ids) > 0 {
        b.flush(ids)
    }
}
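
// exampleBatcherUsage is an illustrative sketch (not part of this commit):
// heartbeats added within the same observeBatchWindow are deduplicated and
// handed to the flush callback as a single slice.
func exampleBatcherUsage() {
    b := newHeartbeatBatcher(func(ids []string) {
        fmt.Println("flush", ids) // e.g. [peerA peerB] (order not guaranteed)
    })
    b.add("peerA")
    b.add("peerB")
    b.add("peerA")                               // duplicate within the window, kept once
    time.Sleep(observeBatchWindow + time.Second) // let the timer fire
}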

// flushObserveBatch is the flush function wired into the heartbeatBatcher.
// It emits two NATS messages:
// - PEER_OBSERVE_RESPONSE_EVENT → consumed by oc-peer (direct channel)
// - PROPALGATION_EVENT / PB_PROPAGATE → consumed by other oc-discovery nodes
func flushObserveBatch(peerIDs []string) {
    payload, err := json.Marshal(map[string]interface{}{
        "peer_ids": peerIDs,
        "state":    "online",
    })
    if err != nil {
        return
    }

    // Direct notification to oc-peer.
    tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
        FromApp:  "oc-discovery",
        Datatype: tools.PEER,
        Method:   int(tools.PEER_OBSERVE_RESPONSE_EVENT),
        Payload:  payload,
    })

    // Broadcast to other oc-discovery nodes so they can forward to their
    // local oc-peer if needed.
    propPayload, err := json.Marshal(tools.PropalgationMessage{
        DataType: int(tools.PEER),
        Action:   tools.PB_PROPAGATE,
        Payload:  payload,
    })
    if err != nil {
        return
    }
    tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
        FromApp:  "oc-discovery",
        Datatype: tools.PEER,
        Method:   int(tools.PROPALGATION_EVENT),
        Payload:  propPayload,
    })
}

// ── incoming observe handler (observed side) ──────────────────────────────────

// handleIncomingObserve is registered as the ProtocolObserve stream handler.
// It is called when a remote peer opens an observe stream to us.
// The function reads the request, validates it, then starts (or stops) the
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
    remotePeerID := rawStream.Conn().RemotePeer().String()
    addr := rawStream.Conn().RemoteMultiaddr().String()
    ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
    if err != nil {
        fmt.Println("qndlqnl EERR", addr, err)
        return err
    }
    log := oclib.GetLogger()

    // Drain mode: reject any new observations for 30 s after a close-all.
    s.drainMu.RLock()
    draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
    s.drainMu.RUnlock()
    if draining {
        rawStream.Close()
        fmt.Println("Draining")
        return errors.New("Draining")
    }
    // Read the observe request (with a generous deadline to avoid hangs).
    // Guard: the requesting peer must not be blacklisted or be ourself.
    did := ""
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    res := access.Search(&dbs.Filters{
        And: map[string][]dbs.Filter{
            "peer_id": {{Operator: dbs.EQUAL.String(), Value: remotePeerID}},
        },
    }, "", false, 0, 1)
    if len(res.Data) > 0 {
        p := res.Data[0].(*peer.Peer)
        did = p.GetID()
        if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
            rawStream.Close()
            fmt.Println("CLOSE blacklist or self")
            return errors.New("can't exploit blacklist or self")
        }
    }

    // Replace any existing heartbeat goroutine for this observer.
    ctx, cancel := context.WithCancel(context.Background())
    s.observeCache.set(remotePeerID, cancel)
    fmt.Println("LOOP OBSERVE")
    go func() {
        defer rawStream.Close()
        defer cancel()
        defer s.observeCache.delete(remotePeerID)

        ticker := time.NewTicker(observeHBInterval)
        defer ticker.Stop()

        hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
        evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
        if evt == nil {
            return
        }
        if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
            stream := s.Streams[ProtocolObserve][ad.ID]
            if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
                // Moderate connectivity event: the observer is unreachable.
                // The deferred calls above purge this observer from the cache.
                fmt.Println("LOOP EVT ERR", err)
                log.Info().
                    Str("observer", remotePeerID).
                    Err(err).
                    Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
                return
            }
        }
        for {
            select {
            case <-ctx.Done():
                return
            case <-ticker.C:

                rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
                fmt.Println("LOOP EVT", evt)
                var err error
                if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
                    stream := s.Streams[ProtocolObserve][ad.ID]
                    if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
                        // Moderate connectivity event: the observer is unreachable.
                        // The deferred calls above purge this observer from the cache.
                        fmt.Println("LOOP EVT ERR", err)
                        log.Info().
                            Str("observer", remotePeerID).
                            Err(err).
                            Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
                        return
                    }
                }
                rawStream.SetWriteDeadline(time.Time{})
            }
        }
    }()
    return nil
}

// ── heartbeat receiver (observing side) ───────────────────────────────────────

// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
// accumulator; the batcher flushes to NATS after observeBatchWindow.
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
    // ps.hbBatcher.add(evt.From)
    flushObserveBatch([]string{evt.From})
    return nil
}

// ── user→peer index (ref-counted observe management) ─────────────────────────

// userPeerIndex tracks which users are observing which peers.
// A libp2p observe stream is kept open as long as at least one user watches
// the peer; it is closed only when the last user stops.
type userPeerIndex struct {
    mu    sync.Mutex
    index map[string]map[string]struct{} // user → set of peer_id strings
}

func newUserPeerIndex() *userPeerIndex {
    return &userPeerIndex{index: map[string]map[string]struct{}{}}
}

// add registers user as an observer of peerID.
// Returns true if peerID was not yet observed by any user (first observer).
func (u *userPeerIndex) add(user, peerID string) (isFirst bool) {
    u.mu.Lock()
    defer u.mu.Unlock()
    // Count total observers for peerID across all users before adding.
    total := 0
    for _, peers := range u.index {
        if _, ok := peers[peerID]; ok {
            total++
        }
    }
    if u.index[user] == nil {
        u.index[user] = map[string]struct{}{}
    }
    u.index[user][peerID] = struct{}{}
    return total == 0
}

// remove unregisters user from peerID.
// Returns true if no user is observing peerID anymore (last observer removed).
func (u *userPeerIndex) remove(user, peerID string) (isLast bool) {
    u.mu.Lock()
    defer u.mu.Unlock()
    delete(u.index[user], peerID)
    if len(u.index[user]) == 0 {
        delete(u.index, user)
    }
    for _, peers := range u.index {
        if _, ok := peers[peerID]; ok {
            return false
        }
    }
    return true
}

// removeUser removes all entries for user and returns the peer_ids that now
// have no remaining observers (i.e., those whose streams should be closed).
func (u *userPeerIndex) removeUser(user string) []string {
    u.mu.Lock()
    defer u.mu.Unlock()
    watched := u.index[user]
    delete(u.index, user)
    var orphans []string
    for peerID := range watched {
        found := false
        for _, peers := range u.index {
            if _, ok := peers[peerID]; ok {
                found = true
                break
            }
        }
        if !found {
            orphans = append(orphans, peerID)
        }
    }
    return orphans
}
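
// exampleUserPeerIndex is an illustrative sketch (not part of this commit) of the
// ref-counting contract: the observe stream is opened for the first observer of a
// peer and closed only when the last one leaves, whoever the users are.
func exampleUserPeerIndex() {
    idx := newUserPeerIndex()
    fmt.Println(idx.add("alice", "peerX"))    // true:  first observer, open the stream
    fmt.Println(idx.add("bob", "peerX"))      // false: stream already open
    fmt.Println(idx.remove("alice", "peerX")) // false: bob still watches peerX
    fmt.Println(idx.remove("bob", "peerX"))   // true:  last observer, close the stream
}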

// ── NATS command handler (observing side) ─────────────────────────────────────

// HandleObserveNATSCommand processes a PEER_OBSERVE_EVENT received from oc-peer.
func (ps *StreamService) HandleObserveNATSCommand(resp tools.NATSResponse) {
    log := oclib.GetLogger()
    var cmd ObserveCommand
    if err := json.Unmarshal(resp.Payload, &cmd); err != nil {
        log.Warn().Err(err).Msg("[observe] failed to unmarshal ObserveCommand")
        return
    }
    if cmd.CloseAll {
        log.Info().Msg("[observe] close-all received via NATS")
        ps.CloseAllObserves()
        return
    }
    if cmd.Close {
        for _, peerID := range cmd.PeerIDs {
            if isLast := ps.observeUsers.remove(cmd.User, peerID); isLast {
                if err := ps.closeObserveStream(peerID); err != nil {
                    log.Warn().Str("peer", peerID).Err(err).Msg("[observe] closeObserveStream failed")
                }
            }
        }
        return
    }
    // Observe: open streams for any new peer, using the address from the payload.
    for _, p := range cmd.Peers {
        if isFirst := ps.observeUsers.add(cmd.User, p.PeerID); isFirst {
            if err := ps.openObserveStream(p); err != nil {
                // Roll back the index entry so the next NATS command can retry.
                ps.observeUsers.remove(cmd.User, p.PeerID)
                log.Warn().Str("peer", p.PeerID).Err(err).Msg("[observe] openObserveStream failed")
            }
        }
    }
}

// ── outgoing observe management (observing side) ──────────────────────────────

// OpenObserveStream is the exported variant for inter-discovery propagation
// (no user context available). It bypasses the user index and opens the stream
// directly if not already open.
func (ps *StreamService) OpenObserveStream(p ShallowPeer) error {
    return ps.openObserveStream(p)
}

// CloseObserveStream is the exported variant for inter-discovery propagation.
func (ps *StreamService) CloseObserveStream(toPeerID string) error {
    return ps.closeObserveStream(toPeerID)
}

// openObserveStream opens a ProtocolObserve stream to p.
// Uses p.StreamAddress directly; falls back to DB then DHT lookup if empty.
func (ps *StreamService) openObserveStream(p ShallowPeer) error {
    streamAddr := p.StreamAddress
    fmt.Println("STREAM OBS", streamAddr)
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    res := access.Search(&dbs.Filters{
        And: map[string][]dbs.Filter{
            "peer_id": {{Operator: dbs.EQUAL.String(), Value: p.PeerID}},
        },
    }, "", false, 0, 1)
    if streamAddr == "" {
        // Fallback: DB then DHT.
        if len(res.Data) > 0 {
            streamAddr = res.Data[0].(*peer.Peer).StreamAddress
        } else if peers, err := ps.Node.GetPeerRecord(context.Background(), p.PeerID); err == nil && len(peers) > 0 {
            streamAddr = peers[0].StreamAddress
        }
    }
    if len(res.Data) > 0 && res.Data[0].(*peer.Peer).Relation == peer.SELF {
        return errors.New("Can't send to self")
    }
    fmt.Println("STREAM OBS SSS", streamAddr)

    if streamAddr == "" {
        return nil // can't resolve address — silently skip
    }

    decodedID, err := pp.Decode(p.PeerID)
    if err != nil {
        return err
    }

    // If a stream already exists, reuse it.
    ps.Mu.RLock()
    _, alreadyOpen := ps.Streams[ProtocolObserve][decodedID]
    ps.Mu.RUnlock()
    if alreadyOpen {
        return nil
    }
    ad, err := pp.AddrInfoFromString(streamAddr)
    if err != nil {
        return err
    }
    fmt.Println("TempStream OBSERVE", ad)
    if ps.Streams, err = common.TempStream(ps.Host, *ad, ProtocolObserve, p.ID, ps.Streams, protocols, &ps.Mu); err == nil {
        rawStream := ps.Streams[ProtocolObserve][ad.ID]
        if hbPayload, err := json.Marshal(ObserveRequest{Close: false}); err == nil {
            if err := json.NewEncoder(rawStream.Stream).Encode(common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", hbPayload)); err != nil {
                fmt.Println("ERR")
                rawStream.Stream.Close()
                return err
            }
            s := &common.Stream{
                Stream: rawStream.Stream,
                Expiry: time.Now().Add(365 * 24 * time.Hour),
            }
            ps.Mu.Lock()
            if ps.Streams[ProtocolObserve] == nil {
                ps.Streams[ProtocolObserve] = map[pp.ID]*common.Stream{}
            }
            ps.Streams[ProtocolObserve][ad.ID] = s
            ps.Mu.Unlock()

            go ps.readLoop(s, ad.ID, ProtocolObserve, &common.ProtocolInfo{PersistantStream: true})
        }

    } else {
        return err
    }
    return nil
}

// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
// the remote side.
func (ps *StreamService) closeObserveStream(toPeerID string) error {
    decodedID, err := pp.Decode(toPeerID)
    if err != nil {
        return err
    }
    ps.Mu.Lock()
    if ps.Streams[ProtocolObserve] != nil {
        if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
            _ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
            s.Stream.Close()
            delete(ps.Streams[ProtocolObserve], decodedID)
        }
    }
    ps.Mu.Unlock()
    return nil
}

// CloseAllObserves closes every outgoing ProtocolObserve stream, clears the
// user index, and enters drain mode for observeDrainDuration.
func (ps *StreamService) CloseAllObserves() {
    ps.Mu.Lock()
    for _, s := range ps.Streams[ProtocolObserve] {
        _ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
        s.Stream.Close()
    }
    delete(ps.Streams, ProtocolObserve)
    ps.Mu.Unlock()

    // Reset user index so stale ref-counts don't block future opens.
    ps.observeUsers = newUserPeerIndex()

    ps.drainMu.Lock()
    ps.drainUntil = time.Now().Add(observeDrainDuration)
    ps.drainMu.Unlock()
}
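
End to end, the observing side works roughly as sketched below (an illustration under the assumptions above, not code from this commit): a PEER_OBSERVE_EVENT opens the stream for the first observer, heartbeats are batched back to oc-peer, and a close-all tears everything down and arms the 30 s drain window.

func exampleObserveLifecycle(ps *StreamService) {
    // 1. oc-peer asks us to observe a peer (first observer opens the stream).
    cmd, _ := json.Marshal(ObserveCommand{User: "u1", Peers: []ShallowPeer{{
        PeerID:        "12D3Koo...",
        StreamAddress: "/ip4/10.0.0.2/tcp/4021/p2p/12D3Koo...",
    }}})
    ps.HandleObserveNATSCommand(tools.NATSResponse{Payload: cmd})

    // 2. ObserveHeartbeat events now arrive every observeHBInterval and are
    //    forwarded to NATS as PEER_OBSERVE_RESPONSE_EVENT batches.

    // 3. Teardown: streams closed, user index reset, drain window armed, so
    //    handleIncomingObserve rejects new observers for observeDrainDuration.
    ps.CloseAllObserves()
}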

@@ -6,6 +6,8 @@ import (
    "errors"
    "fmt"
    "oc-discovery/daemons/node/common"
    "strings"
    "time"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/dbs"

@@ -19,9 +21,9 @@ func (ps *StreamService) PublishesCommon(dt *tools.DataType, user string, groups
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    var p oclib.LibDataShallow
    if filter == nil {
        p = access.LoadAll(false)
        p = access.LoadAll(false, 0, 10000)
    } else {
        p = access.Search(filter, "", false)
        p = access.Search(filter, "", false, 0, 10000)
    }
    for _, pes := range p.Data {
        for _, proto := range protos {

@@ -45,7 +47,7 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
        And: map[string][]dbs.Filter{ // search by name if no filters are provided
            "peer_id": {{Operator: dbs.EQUAL.String(), Value: toPeerID}},
        },
    }, toPeerID, false)
    }, toPeerID, false, 0, 1)
    var pe *peer.Peer
    if len(p.Data) > 0 && p.Data[0].(*peer.Peer).Relation != peer.BLACKLIST {
        pe = p.Data[0].(*peer.Peer)

@@ -57,13 +59,36 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
        if err != nil {
            return nil, err
        }
        return ps.write(toPeerID, ad, dt, user, resource, proto)
        stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
        if err != nil {
            if _, ok := dntProtocols[proto]; ok {
                ps.dnt.enqueue(&dntEntry{
                    did:     toPeerID,
                    addr:    *ad,
                    dt:      dt,
                    user:    user,
                    payload: resource,
                    proto:   proto,
                    addedAt: time.Now().UTC(),
                })
            }
            return nil, err
        }
        return stream, nil
    }
    return nil, errors.New("peer unvalid " + toPeerID)
}

func (ps *StreamService) ToPartnerPublishEvent(
    ctx context.Context, action tools.PubSubAction, dt *tools.DataType, user string, groups []string, payload []byte) error {
    var proto protocol.ID
    proto = ProtocolCreateResource
    switch action {
    case tools.PB_DELETE:
        proto = ProtocolDeleteResource
    case tools.PB_UPDATE:
        proto = ProtocolUpdateResource
    }
    if *dt == tools.PEER {
        var p peer.Peer
        if err := json.Unmarshal(payload, &p); err != nil {

@@ -87,25 +112,30 @@ func (ps *StreamService) ToPartnerPublishEvent(

        }
    }
    var per peer.Peer
    if err := json.Unmarshal(payload, &per); err == nil && !strings.Contains(per.Relation.String(), "master") && !strings.Contains(per.Relation.String(), "nano") {
        for _, rel := range []peer.PeerRelation{peer.MASTER, peer.NANO} {
            ps.PublishesCommon(dt, user, groups, &dbs.Filters{
                And: map[string][]dbs.Filter{
                    "relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
                },
            }, payload, proto)
        }
    }

    return nil
}
    ks := []protocol.ID{}
    for k := range protocolsPartners {
        ks = append(ks, k)
    }
    var proto protocol.ID
    proto = ProtocolCreateResource
    switch action {
    case tools.PB_DELETE:
        proto = ProtocolDeleteResource
    case tools.PB_UPDATE:
        proto = ProtocolUpdateResource
    for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
        ps.PublishesCommon(dt, user, groups, &dbs.Filters{
            And: map[string][]dbs.Filter{
                "relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
            },
        }, payload, proto)
    }
    ps.PublishesCommon(dt, user, groups, &dbs.Filters{ // filter by like name, short_description, description, owner, url if no filters are provided
        And: map[string][]dbs.Filter{
            "relation": {{Operator: dbs.EQUAL.String(), Value: peer.PARTNER}},
        },
    }, payload, proto)
    return nil
}

@@ -129,7 +159,6 @@ func (s *StreamService) write(
    if s.Streams, err = common.TempStream(s.Host, *peerID, proto, did, s.Streams, pts, &s.Mu); err != nil {
        fmt.Println("TempStream", err)
        return nil, errors.New("no stream available for protocol " + fmt.Sprintf("%v", proto) + " from PID " + peerID.ID.String())

    }

    stream := s.Streams[proto][peerID.ID]

@@ -12,6 +12,7 @@ import (
    "time"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/config"
    "cloud.o-forge.io/core/oc-lib/dbs"
    "cloud.o-forge.io/core/oc-lib/models/peer"
    "cloud.o-forge.io/core/oc-lib/models/utils"

@@ -42,6 +43,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
    ProtocolVerifyResource:          {WaitResponse: true, TTL: 1 * time.Minute},
    ProtocolMinioConfigResource:     {WaitResponse: true, TTL: 1 * time.Minute},
    ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
    ProtocolObserve:                 {WaitResponse: true, TTL: 1 * time.Minute},
}

var protocolsPartners = map[protocol.ID]*common.ProtocolInfo{

@@ -61,6 +63,21 @@ type StreamService struct {
    // IsPeerKnown, when set, is called at stream open for every inbound protocol.
    // Return false to reset the stream immediately. Left nil until wired by the node.
    IsPeerKnown func(pid pp.ID) bool
    // dnt is the Disconnection Network Tolerance cache for outbound streams.
    dnt *dntCache
    // observeCache tracks running heartbeat goroutines on the OBSERVED side.
    observeCache *observeCache
    // hbBatcher accumulates incoming heartbeats (observing side) and flushes
    // them as a single NATS batch after observeBatchWindow.
    hbBatcher *heartbeatBatcher
    // drainUntil / drainMu implement the startup drain window: for 30 s after a
    // close-all, incoming ProtocolObserve requests are rejected so stale heartbeats
    // from a previous run cannot mix with fresh observations.
    drainUntil time.Time
    drainMu    sync.RWMutex
    // observeUsers tracks which users are observing which peers so streams are
    // closed only when the last observer for a peer disconnects.
    observeUsers *userPeerIndex
}

func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {

@@ -72,31 +89,60 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
        Streams:          common.ProtocolStream{},
        maxNodesConn:     maxNode,
        ResourceSearches: common.NewSearchTracker(),
        dnt:              newDNTCache(),
        observeCache:     newObserveCache(),
        observeUsers:     newUserPeerIndex(),
    }
    service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
    for proto := range protocols {
        service.Host.SetStreamHandler(proto, service.gate(service.HandleResponse))
    }
    // ProtocolObserve uses a dedicated handler (bidirectional, long-lived).
    logger.Info().Msg("connect to partners...")
    service.connectToPartners() // we set up a stream
    go service.StartGC(8 * time.Second)
    go service.startDNTLoop()
    return service, nil
}

// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Stream) {
    return func(stream network.Stream) {
        if config.GetConfig().IsNano {
            d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
                And: map[string][]dbs.Filter{
                    "relation": {{Operator: dbs.EQUAL.String(), Value: peer.MASTER}},
                },
            }, "", false, 0, 1)
            if len(d.Data) == 0 {
                return
            }
        }
        s.knowingGate(stream, h)
    }
}

// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
func (s *StreamService) gate(h func(network.Stream)) func(network.Stream) {
    return func(stream network.Stream) {
        if s.IsPeerKnown != nil && !s.IsPeerKnown(stream.Conn().RemotePeer()) {
            logger := oclib.GetLogger()
            logger.Warn().Str("peer", stream.Conn().RemotePeer().String()).Msg("[stream] unknown peer, closing connection")
            stream.Conn().Close()
            return
        }
        h(stream)
        s.knowingGate(stream, h)
    }
}

func (s *StreamService) knowingGate(stream network.Stream, h func(network.Stream)) {
    if s.IsPeerKnown != nil && !s.IsPeerKnown(stream.Conn().RemotePeer()) {
        logger := oclib.GetLogger()
        logger.Warn().Str("peer", stream.Conn().RemotePeer().String()).Msg("[stream] unknown peer, closing connection")
        stream.Conn().Close()
        return
    }
    h(stream)
}

func (s *StreamService) HandleResponse(stream network.Stream) {
    s.Mu.Lock()
    defer s.Mu.Unlock()

@@ -137,13 +183,27 @@ func (s *StreamService) connectToPartners() error {
            go s.readLoop(s.Streams[proto][ss.Conn().RemotePeer()], ss.Conn().RemotePeer(), proto, info)
        }
        logger.Info().Msg("SetStreamHandler " + string(proto))
        s.Host.SetStreamHandler(proto, s.gate(f))
        s.Host.SetStreamHandler(proto, s.gatePrivilege(f))
    }
    return nil
}

func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
    ps := []*peer.Peer{}
    if conf.GetConfig().NanoIDS != "" {
        for _, peerID := range strings.Split(conf.GetConfig().NanoIDS, ",") {
            ppID := strings.Split(peerID, "/")
            ps = append(ps, &peer.Peer{
                AbstractObject: utils.AbstractObject{
                    UUID: uuid.New().String(),
                    Name: ppID[1],
                },
                PeerID:        ppID[len(ppID)-1],
                StreamAddress: peerID,
                Relation:      peer.NANO,
            })
        }
    }
    if conf.GetConfig().PeerIDS != "" {
        for _, peerID := range strings.Split(conf.GetConfig().PeerIDS, ",") {
            ppID := strings.Split(peerID, "/")

@@ -159,7 +219,7 @@ func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
        }
    }
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    peers := access.Search(nil, search, false)
    peers := access.Search(nil, search, false, 0, 0)
    for _, p := range peers.Data {
        ps = append(ps, p.(*peer.Peer))
    }

@@ -230,7 +290,7 @@ func (ps *StreamService) readLoop(s *common.Stream, id pp.ID, proto protocol.ID,
            }
            continue
        }
        ps.handleEvent(evt.Type, &evt)
        ps.handleEvent(evt.Type, &evt, s.Stream)
        if protocolInfo.WaitResponse && !protocolInfo.PersistantStream {
            break
        }