Discovery Nano — the lightweight version.

This commit is contained in:
mr
2026-04-29 07:41:00 +02:00
parent fa341494d9
commit 7f951afd41
34 changed files with 2961 additions and 1501 deletions

View File

@@ -92,10 +92,6 @@ type SearchQuery struct {
// SearchPeerResult is sent by a responding indexer to the emitting indexer
// via ProtocolSearchPeerResponse, and forwarded by the emitting indexer to
// the node on the open ProtocolSearchPeer stream.
type SearchPeerResult struct {
QueryID string `json:"query_id"`
Records []SearchHit `json:"records"`
}
// SearchHit is a single peer found during distributed search.
type SearchHit struct {

View File

@@ -203,6 +203,9 @@ func waitResults[T interface{}](topic *pubsub.Topic, s *LongLivedPubSubService,
if errors.Is(err, context.DeadlineExceeded) {
// timeout hit: no message arrived before the deadline, so kill the subscription.
s.PubsubMu.Lock()
if s.LongLivedPubSubs[proto] != nil {
s.LongLivedPubSubs[proto].Close()
}
delete(s.LongLivedPubSubs, proto)
s.PubsubMu.Unlock()
return
@@ -214,6 +217,5 @@ func waitResults[T interface{}](topic *pubsub.Topic, s *LongLivedPubSubService,
continue
}
f(ctx, evt, fmt.Sprintf("%v", proto))
fmt.Println("DEADLOCK ?")
}
}

View File

@@ -101,6 +101,10 @@ func (ix *LongLivedStreamRecordedService[T]) gc() {
evicted = append(evicted, gcEntry{pid, name, did})
for _, sstreams := range ix.StreamRecords {
if sstreams[pid] != nil {
if sstreams[pid].HeartbeatStream != nil && sstreams[pid].HeartbeatStream.Stream != nil {
sstreams[pid].HeartbeatStream.Stream.Close()
}
delete(sstreams, pid)
}
}

View File

@@ -184,12 +184,15 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
}
ctxTTL, cancelTTL := context.WithTimeout(context.Background(), expiry)
defer cancelTTL()
if h.Network().Connectedness(ad.ID) != network.Connected {
if err := h.Connect(ctxTTL, ad); err != nil {
fmt.Println("Connectedness", ad.ID, err)
return streams, err
}
}
fmt.Println("PROTO", streams[proto])
if streams[proto] != nil && streams[proto][ad.ID] != nil {
return streams, nil
} else if s, err := h.NewStream(ctxTTL, ad.ID, proto); err == nil {
@@ -200,6 +203,9 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
mu.Unlock()
time.AfterFunc(expiry, func() {
mu.Lock()
if streams[proto] != nil && streams[proto][ad.ID] != nil && streams[proto][ad.ID].Stream != nil {
streams[proto][ad.ID].Stream.Close()
}
delete(streams[proto], ad.ID)
mu.Unlock()
})
@@ -212,6 +218,7 @@ func TempStream(h host.Host, ad pp.AddrInfo, proto protocol.ID, did string, stre
mu.Unlock()
return streams, nil
} else {
fmt.Println("ERRER", err)
return streams, err
}
}

View File

@@ -33,10 +33,12 @@ const maxTTLSeconds = 86400 // 24h
const tombstoneTTL = 10 * time.Minute
type PeerRecordPayload struct {
ID string `json:"id"`
Name string `json:"name"`
DID string `json:"did"`
PubKey []byte `json:"pub_key"`
PubKey []byte `json:"public_key"`
ExpiryDate time.Time `json:"expiry_date"`
IsNano bool `json:"is_nano"`
// TTLSeconds is the publisher's declared lifetime for this record in seconds.
// 0 means "use the default (120 s)". Included in the signed payload so it
// cannot be altered by an intermediary.
@@ -45,6 +47,8 @@ type PeerRecordPayload struct {
type PeerRecord struct {
PeerRecordPayload
CreationDate time.Time `json:"creation_date"`
UpdateDate time.Time `json:"update_date"`
PeerID string `json:"peer_id"`
APIUrl string `json:"api_url"`
StreamAddress string `json:"stream_address"`
@@ -184,7 +188,7 @@ func (ix *IndexerService) isPeerKnown(pid lpp.ID) bool {
And: map[string][]dbs.Filter{
"peer_id": {{Operator: dbs.EQUAL.String(), Value: pid.String()}},
},
}, pid.String(), false)
}, pid.String(), false, 0, 1)
for _, item := range results.Data {
p, ok := item.(*pp.Peer)
if !ok || p.PeerID != pid.String() {

View File

@@ -3,6 +3,7 @@ package indexer
import (
"context"
"encoding/json"
"fmt"
"strings"
"time"
@@ -10,8 +11,8 @@ import (
"oc-discovery/daemons/node/common"
oclib "cloud.o-forge.io/core/oc-lib"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
const TopicSearchPeer = "oc-search-peer"
@@ -46,31 +47,34 @@ func (ix *IndexerService) updateReferent(pid pp.ID, rec PeerRecord, referent boo
// searchReferenced looks up nodes in referencedNodes matching the query.
// Matches on peerID (exact), DID (exact), or name (case-insensitive contains).
func (ix *IndexerService) searchReferenced(peerID, did, name string) []common.SearchHit {
func (ix *IndexerService) searchReferenced(peerID, did, name string) []PeerRecord {
ix.referencedNodesMu.RLock()
defer ix.referencedNodesMu.RUnlock()
nameLow := strings.ToLower(name)
var hits []common.SearchHit
var hits []PeerRecord
for pid, rec := range ix.referencedNodes {
pidStr := pid.String()
matchPeerID := peerID != "" && pidStr == peerID
matchDID := did != "" && rec.DID == did
matchName := name != "" && strings.Contains(strings.ToLower(rec.Name), nameLow)
if matchPeerID || matchDID || matchName {
hits = append(hits, common.SearchHit{
PeerID: pidStr,
DID: rec.DID,
Name: rec.Name,
})
rec.ID = rec.DID
hits = append(hits, rec)
}
}
return hits
}
type SearchPeerResult struct {
QueryID string `json:"query_id"`
Records []PeerRecord `json:"records"`
}
// handleSearchPeer is the ProtocolSearchPeer handler.
// The node opens this stream, sends a SearchPeerRequest, and reads results
// as they stream in. The stream stays open until timeout or node closes it.
func (ix *IndexerService) handleSearchPeer(s network.Stream) {
fmt.Println("handleSearchPeer")
logger := oclib.GetLogger()
defer s.Reset()
@@ -78,7 +82,7 @@ func (ix *IndexerService) handleSearchPeer(s network.Stream) {
logger.Warn().Str("peer", s.Conn().RemotePeer().String()).Msg("[search] unknown peer, rejecting stream")
return
}
fmt.Println("SearchPeerRequest")
var req common.SearchPeerRequest
if err := json.NewDecoder(s).Decode(&req); err != nil || req.QueryID == "" {
return
@@ -94,7 +98,7 @@ func (ix *IndexerService) handleSearchPeer(s network.Stream) {
}()
defer streamCancel()
resultCh := make(chan []common.SearchHit, 16)
resultCh := make(chan []PeerRecord, 16)
ix.pendingSearchesMu.Lock()
ix.pendingSearches[req.QueryID] = resultCh
ix.pendingSearchesMu.Unlock()
@@ -106,9 +110,10 @@ func (ix *IndexerService) handleSearchPeer(s network.Stream) {
// Check own referencedNodes immediately.
if hits := ix.searchReferenced(req.PeerID, req.DID, req.Name); len(hits) > 0 {
fmt.Println("hits", hits)
resultCh <- hits
}
fmt.Println("publishSearchQuery")
// Broadcast search on GossipSub so other indexers can respond.
ix.publishSearchQuery(req.QueryID, req.PeerID, req.DID, req.Name)
@@ -119,7 +124,8 @@ func (ix *IndexerService) handleSearchPeer(s network.Stream) {
for {
select {
case hits := <-resultCh:
if err := enc.Encode(common.SearchPeerResult{QueryID: req.QueryID, Records: hits}); err != nil {
fmt.Println("resultCh hits", hits)
if err := enc.Encode(SearchPeerResult{QueryID: req.QueryID, Records: hits}); err != nil {
logger.Debug().Err(err).Msg("[search] stream write failed")
return
}
@@ -145,13 +151,15 @@ func (ix *IndexerService) handleSearchPeer(s network.Stream) {
// Another indexer opens this stream to deliver hits for a pending queryID.
func (ix *IndexerService) handleSearchPeerResponse(s network.Stream) {
defer s.Reset()
var result common.SearchPeerResult
fmt.Println("RECEIVED SEARCH")
var result SearchPeerResult
if err := json.NewDecoder(s).Decode(&result); err != nil || result.QueryID == "" {
return
}
ix.pendingSearchesMu.Lock()
ch := ix.pendingSearches[result.QueryID]
ix.pendingSearchesMu.Unlock()
fmt.Println("RECEIVED", result.QueryID, ix.pendingSearches[result.QueryID])
if ch != nil {
select {
case ch <- result.Records:
@@ -213,21 +221,28 @@ func (ix *IndexerService) onSearchQuery(q common.SearchQuery) {
if q.EmitterID == ix.Host.ID().String() {
return
}
fmt.Println("ON SEARCH QUERY")
hits := ix.searchReferenced(q.PeerID, q.DID, q.Name)
fmt.Println("ON SEARCH QUERY HITS", hits)
if len(hits) == 0 {
return
}
emitterID, err := pp.Decode(q.EmitterID)
if err != nil {
fmt.Println("ON SEARCH QUERY err DECODE", err)
return
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
s, err := ix.Host.NewStream(ctx, emitterID, common.ProtocolSearchPeerResponse)
if err != nil {
fmt.Println("ON SEARCH QUERY err NewStream", emitterID, err)
return
}
defer s.Reset()
fmt.Println("ON ", emitterID)
defer s.Close()
s.SetDeadline(time.Now().Add(5 * time.Second))
json.NewEncoder(s).Encode(common.SearchPeerResult{QueryID: q.QueryID, Records: hits})
err = json.NewEncoder(s).Encode(SearchPeerResult{QueryID: q.QueryID, Records: hits})
fmt.Println("SEARCH ERR", err)
s.CloseWrite()
}

View File

@@ -61,7 +61,7 @@ type IndexerService struct {
referencedNodes map[pp.ID]PeerRecord
referencedNodesMu sync.RWMutex
// pendingSearches maps queryID → result channel for in-flight searches.
pendingSearches map[string]chan []common.SearchHit
pendingSearches map[string]chan []PeerRecord
pendingSearchesMu sync.Mutex
// behavior tracks per-node compliance (heartbeat rate, publish/get volume,
// identity consistency, signature failures).
@@ -91,7 +91,7 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
LongLivedStreamRecordedService: common.NewStreamRecordedService[PeerRecord](h, maxNode),
isStrictIndexer: ps == nil,
referencedNodes: map[pp.ID]PeerRecord{},
pendingSearches: map[string]chan []common.SearchHit{},
pendingSearches: map[string]chan []PeerRecord{},
behavior: newNodeBehaviorTracker(),
deletedDIDs: make(map[string]time.Time),
eventQueue: &common.MembershipEventQueue{},

View File

@@ -4,7 +4,7 @@ import (
"context"
"encoding/json"
"fmt"
"oc-discovery/daemons/node/common"
"oc-discovery/daemons/node/indexer"
"oc-discovery/daemons/node/stream"
"slices"
@@ -29,6 +29,11 @@ func ListenNATS(n *Node) {
tools.PEER_BEHAVIOR_EVENT: func(resp tools.NATSResponse) { //nolint:typecheck
handlePeerBehaviorEvent(n, resp)
},
// PEER_OBSERVE_EVENT is sent by oc-peer to start or stop observations
// for a list of peer IDs, or to trigger a close-all.
tools.PEER_OBSERVE_EVENT: func(resp tools.NATSResponse) {
n.StreamService.HandleObserveNATSCommand(resp)
},
tools.PROPALGATION_EVENT: func(resp tools.NATSResponse) {
if resp.FromApp == config.GetAppName() {
return
@@ -134,6 +139,21 @@ func ListenNATS(n *Node) {
}
n.StreamService.Mu.Unlock()
}
case tools.PB_OBSERVE:
print("PROPALGATE OBSERVE")
handleObserveEvent(n, propalgation)
case tools.PB_OBSERVE_CLOSE:
print("PROPALGATE CLOSE")
handleObserveCloseEvent(n, propalgation)
case tools.PB_PROPAGATE:
// Another oc-discovery forwarded a heartbeat batch.
// Re-emit on PEER_OBSERVE_RESPONSE_EVENT so the local oc-peer sees it.
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
FromApp: resp.FromApp,
Datatype: tools.PEER,
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
Payload: propalgation.Payload,
})
case tools.PB_CLOSE_SEARCH:
if propalgation.DataType == int(tools.PEER) {
n.peerSearches.Cancel(resp.User)
@@ -141,16 +161,18 @@ func ListenNATS(n *Node) {
n.StreamService.ResourceSearches.Cancel(resp.User)
}
case tools.PB_SEARCH:
fmt.Println("PROPALGATE PEER")
if propalgation.DataType == int(tools.PEER) {
m := map[string]interface{}{}
if err := json.Unmarshal(propalgation.Payload, &m); err == nil {
needle := fmt.Sprintf("%v", m["search"])
userKey := resp.User
go n.SearchPeerRecord(userKey, needle, func(hit common.SearchHit) {
go n.SearchPeerRecord(userKey, needle, func(hit indexer.PeerRecord) {
if b, err := json.Marshal(hit); err == nil {
tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.DataType(tools.PEER),
User: userKey,
Method: int(tools.SEARCH_EVENT),
Payload: b,
})
@@ -240,3 +262,37 @@ func handlePeerBehaviorEvent(n *Node, resp tools.NATSResponse) {
})
}
}
// handleObserveEvent processes a PB_OBSERVE PropalgationMessage from another
// oc-discovery node, starting observation for the listed peers.
//
// The payload is decoded into a stream.ObserveCommand; an observe stream is
// opened for every listed peer via the StreamService. A failure on one peer
// is logged and does not abort the remaining peers.
func handleObserveEvent(n *Node, p tools.PropalgationMessage) {
	var observeCmd stream.ObserveCommand
	err := json.Unmarshal(p.Payload, &observeCmd)
	if err != nil {
		fmt.Println("handleObserveEvent: unmarshal error:", err)
		return
	}
	for _, shallow := range observeCmd.Peers {
		openErr := n.StreamService.OpenObserveStream(shallow)
		if openErr != nil {
			fmt.Println("handleObserveEvent: OpenObserveStream failed for", shallow.PeerID, ":", openErr)
		}
	}
}
// handleObserveCloseEvent processes a PB_OBSERVE_CLOSE PropalgationMessage from
// another oc-discovery node, stopping observation for the listed peer IDs.
//
// A CloseAll command tears down every active observe stream at once; otherwise
// each listed peer ID is closed individually, with failures logged but
// non-fatal for the remaining IDs.
func handleObserveCloseEvent(n *Node, p tools.PropalgationMessage) {
	var closeCmd stream.ObserveCommand
	if unmarshalErr := json.Unmarshal(p.Payload, &closeCmd); unmarshalErr != nil {
		fmt.Println("handleObserveCloseEvent: unmarshal error:", unmarshalErr)
		return
	}
	if closeCmd.CloseAll {
		n.StreamService.CloseAllObserves()
		return
	}
	for _, id := range closeCmd.PeerIDs {
		if closeErr := n.StreamService.CloseObserveStream(id); closeErr != nil {
			fmt.Println("handleObserveCloseEvent: CloseObserveStream failed for", id, ":", closeErr)
		}
	}
}

View File

@@ -113,6 +113,7 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
if ttl <= 0 {
ttl = indexer.DefaultTTLSeconds * time.Second
}
fresh.UpdateDate = time.Now().UTC()
fresh.PeerRecordPayload.ExpiryDate = time.Now().UTC().Add(ttl)
payload, _ := json.Marshal(fresh.PeerRecordPayload)
fresh.Signature, err = priv.Sign(payload)
@@ -141,7 +142,7 @@ func InitNode(isNode bool, isIndexer bool) (*Node, error) {
And: map[string][]dbs.Filter{
"peer_id": {{Operator: dbs.EQUAL.String(), Value: pid.String()}},
},
}, pid.String(), false)
}, pid.String(), false, 0, 1)
for _, item := range results.Data {
p, ok := item.(*peer.Peer)
if !ok || p.PeerID != pid.String() {
@@ -228,7 +229,7 @@ func (d *Node) isPeerKnown(pid pp.ID) bool {
And: map[string][]dbs.Filter{
"peer_id": {{Operator: dbs.EQUAL.String(), Value: pid.String()}},
},
}, pid.String(), false)
}, pid.String(), false, 0, 1)
for _, item := range results.Data {
p, ok := item.(*peer.Peer)
if !ok || p.PeerID != pid.String() {
@@ -267,15 +268,8 @@ func (d *Node) publishPeerRecord(
if ttl <= 0 {
ttl = indexer.DefaultTTLSeconds * time.Second
}
base := indexer.PeerRecordPayload{
Name: rec.Name,
DID: rec.DID,
PubKey: rec.PubKey,
TTLSeconds: rec.TTLSeconds,
ExpiryDate: time.Now().UTC().Add(ttl),
}
payload, _ := json.Marshal(base)
rec.PeerRecordPayload = base
rec.ExpiryDate = time.Now().UTC().Add(ttl)
payload, _ := json.Marshal(rec.PeerRecordPayload)
rec.Signature, err = priv.Sign(payload)
if err := json.NewEncoder(stream.Stream).Encode(&rec); err != nil { // then publish on stream
return err
@@ -288,7 +282,7 @@ func (d *Node) publishPeerRecord(
// A new call for the same userKey cancels any previous search.
// Results are pushed to onResult as they arrive; the function returns when
// the stream closes (idle timeout, explicit cancel, or indexer unreachable).
func (d *Node) SearchPeerRecord(userKey, needle string, onResult func(common.SearchHit)) {
func (d *Node) SearchPeerRecord(userKey, needle string, onResult func(indexer.PeerRecord)) {
logger := oclib.GetLogger()
idleTimeout := common.SearchIdleTimeout()
@@ -306,7 +300,7 @@ func (d *Node) SearchPeerRecord(userKey, needle string, onResult func(common.Sea
} else {
req.Name = needle
}
fmt.Println("PROPALGATE PEER", needle, common.Indexers.GetAddrs())
for _, ad := range common.Indexers.GetAddrs() {
if ad.Info == nil {
continue
@@ -330,7 +324,7 @@ func (d *Node) SearchPeerRecord(userKey, needle string, onResult func(common.Sea
seen := map[string]struct{}{}
dec := json.NewDecoder(s)
for {
var result common.SearchPeerResult
var result indexer.SearchPeerResult
if err := dec.Decode(&result); err != nil {
break
}
@@ -416,7 +410,7 @@ func (d *Node) claimInfo(
And: map[string][]dbs.Filter{ // search by name if no filters are provided
"peer_id": {{Operator: dbs.EQUAL.String(), Value: d.Host.ID().String()}},
},
}, "", false)
}, "", false, 0, 1)
if len(peers.Data) > 0 {
did = peers.Data[0].GetID() // if already existing set up did as made
}
@@ -435,9 +429,11 @@ func (d *Node) claimInfo(
now := time.Now().UTC()
pRec := indexer.PeerRecordPayload{
Name: name,
DID: did, // REAL PEER ID
PubKey: pubBytes,
Name: name,
DID: did, // REAL PEER ID
PubKey: pubBytes,
IsNano: oclib.GetConfig().IsNano,
TTLSeconds: indexer.DefaultTTLSeconds,
ExpiryDate: now.Add(indexer.DefaultTTLSeconds * time.Second),
}
@@ -447,6 +443,8 @@ func (d *Node) claimInfo(
rec := &indexer.PeerRecord{
PeerRecordPayload: pRec,
}
rec.CreationDate = time.Now().UTC()
rec.UpdateDate = time.Now().UTC()
rec.Signature, err = priv.Sign(payload)
if err != nil {
return nil, err

View File

@@ -27,8 +27,9 @@ func (ps *PubSubService) SearchPublishEvent(
return ps.StreamService.PublishesCommon(dt, user, groups, nil, b, stream.ProtocolSearchResource) //if partners focus only them*/
case "partner": // define Search Strategy
return ps.StreamService.PublishesCommon(dt, user, groups, &dbs.Filters{ // filter by like name, short_description, description, owner, url if no filters are provided
And: map[string][]dbs.Filter{
Or: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.PARTNER}},
"is_nano": {{Operator: dbs.EQUAL.String(), Value: true}},
},
}, b, stream.ProtocolSearchResource)
case "all": // Gossip PubSub

View File

@@ -0,0 +1,362 @@
package stream
// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - dntCritical : retry indefinitely (create / update / delete resource).
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"io"
"os"
"path/filepath"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/tools"
"golang.org/x/crypto/hkdf"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
// dntLevel classifies how persistently a cached outbound request is retried.
type dntLevel int

const (
	// NOTE(review): dntCritical is the ZERO VALUE of dntLevel. A plain map
	// lookup on dntProtocols therefore yields dntCritical for protocols that
	// are ABSENT from the map — use the comma-ok form wherever that
	// distinction matters.
	dntCritical dntLevel = iota // retry until the message is delivered
	dntModerate                 // retry up to dntMaxModerateRetries times
)

// dntMaxModerateRetries is the retry budget for dntModerate entries.
const dntMaxModerateRetries = 3

// dntRetryInterval is the tick period of the background retry loop.
const dntRetryInterval = 15 * time.Second

// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var dntProtocols = map[protocol.ID]dntLevel{
	// Critical — data mutations that must eventually be delivered.
	ProtocolCreateResource: dntCritical,
	ProtocolUpdateResource: dntCritical,
	ProtocolDeleteResource: dntCritical,

	// Moderate — confirmations / config / planner: 3 retries before abandon.
	ProtocolVerifyResource:          dntModerate,
	ProtocolSendPlanner:             dntModerate,
	ProtocolConsidersResource:       dntModerate,
	ProtocolMinioConfigResource:     dntModerate,
	ProtocolAdmiraltyConfigResource: dntModerate,
}
// dntEntryJSON is the on-disk representation of a dntEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
// Field layout is part of the persisted (encrypted) file format — do not
// rename the tags without a migration.
type dntEntryJSON struct {
	DID     string          `json:"did"`
	Addr    pp.AddrInfo     `json:"addr"`
	DT      *tools.DataType `json:"dt,omitempty"`
	User    string          `json:"user"`
	Payload []byte          `json:"payload"`
	Proto   protocol.ID     `json:"proto"`
	Retries int             `json:"retries"`
	AddedAt time.Time       `json:"added_at"`
}

// dntEntry is the in-memory form of a cached outbound request, mirroring
// dntEntryJSON field-for-field with unexported names.
type dntEntry struct {
	did     string
	addr    pp.AddrInfo
	dt      *tools.DataType
	user    string
	payload []byte
	proto   protocol.ID
	retries int
	addedAt time.Time
}

// toJSON converts an in-memory entry to its serialisable form.
func (e *dntEntry) toJSON() dntEntryJSON {
	return dntEntryJSON{
		DID:     e.did,
		Addr:    e.addr,
		DT:      e.dt,
		User:    e.user,
		Payload: e.payload,
		Proto:   e.proto,
		Retries: e.retries,
		AddedAt: e.addedAt,
	}
}

// entryFromJSON rebuilds an in-memory entry from its serialised form.
func entryFromJSON(j dntEntryJSON) *dntEntry {
	return &dntEntry{
		did:     j.DID,
		addr:    j.Addr,
		dt:      j.DT,
		user:    j.User,
		payload: j.Payload,
		proto:   j.Proto,
		retries: j.Retries,
		addedAt: j.AddedAt,
	}
}
// dntCache holds outbound requests awaiting retry after a delivery failure.
type dntCache struct {
	mu      sync.Mutex  // guards entries
	entries []*dntEntry // pending requests, in enqueue order
	// aesKey is the derived AES-256 key used for on-disk encryption.
	// Nil when key derivation failed: persistence is disabled but the in-memory
	// cache continues to function normally.
	aesKey []byte
}
// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
// When key derivation fails the cache still works in memory; only the on-disk
// persistence is disabled.
func newDNTCache() *dntCache {
	cache := &dntCache{}
	if key, err := deriveDNTKey(); err != nil {
		oclib.GetLogger().Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
	} else {
		cache.aesKey = key
		cache.loadFromDisk()
	}
	return cache
}
// enqueue adds an entry to the cache and persists critical entries to disk.
// Persistence runs in its own goroutine so the (already failing) write path
// is never blocked on disk I/O.
func (c *dntCache) enqueue(e *dntEntry) {
	c.mu.Lock()
	c.entries = append(c.entries, e)
	c.mu.Unlock()
	// Comma-ok lookup: dntCritical is the zero value of dntLevel, so a plain
	// dntProtocols[e.proto] would treat protocols ABSENT from the map as
	// critical and persist them, contradicting the documented "protocols
	// absent from the map receive no caching" contract.
	if level, ok := dntProtocols[e.proto]; ok && level == dntCritical {
		go c.persistToDisk()
	}
}
// drain atomically removes and returns all current entries, leaving the
// cache empty. The returned slice is owned by the caller.
func (c *dntCache) drain() []*dntEntry {
	c.mu.Lock()
	defer c.mu.Unlock()
	drained := c.entries
	c.entries = nil
	return drained
}
// requeue puts entries back at the head of the list, preserving any new
// entries added while the retry loop was running.
func (c *dntCache) requeue(entries []*dntEntry) {
	if len(entries) == 0 {
		return
	}
	c.mu.Lock()
	merged := append(entries, c.entries...)
	c.entries = merged
	c.mu.Unlock()
}
// ── Persistence ──────────────────────────────────────────────────────────────

// dntCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func dntCachePath() string {
	keyDir := filepath.Dir(conf.GetConfig().PrivateKeyPath)
	return filepath.Join(keyDir, "dnt_cache.bin")
}
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
	priv, err := tools.LoadKeyFromFilePrivate()
	if err != nil {
		return nil, err
	}
	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
	// (32-byte seed || 32-byte public key); the full 64 bytes serve as the
	// HKDF input keying material.
	ikm, rawErr := priv.Raw()
	if rawErr != nil {
		return nil, rawErr
	}
	kdf := hkdf.New(sha256.New, ikm, nil, []byte("oc-discovery/dnt-cache/v1"))
	derived := make([]byte, 32)
	if _, readErr := io.ReadFull(kdf, derived); readErr != nil {
		return nil, readErr
	}
	return derived, nil
}
// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
//
// The snapshot is written to a UNIQUE temp file and atomically renamed into
// place. The previous fixed "path.tmp" name let two concurrent persists
// (enqueue spawns one goroutine per critical entry, and the retry loop spawns
// one per tick) interleave writes into the same temp file and rename a
// corrupted blob; with unique names the last complete rename simply wins.
func (c *dntCache) persistToDisk() {
	if c.aesKey == nil {
		return // persistence disabled (key derivation failed)
	}
	log := oclib.GetLogger()
	c.mu.Lock()
	var toSave []dntEntryJSON
	for _, e := range c.entries {
		// Comma-ok lookup: dntCritical is dntLevel's zero value, so a bare
		// map access would misclassify unlisted protocols as critical.
		if level, ok := dntProtocols[e.proto]; ok && level == dntCritical {
			toSave = append(toSave, e.toJSON())
		}
	}
	c.mu.Unlock()
	plaintext, err := json.Marshal(toSave)
	if err != nil {
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return
	}
	// Fresh random nonce per write; it is prepended to the ciphertext.
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		return
	}
	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
	path := dntCachePath()
	tmp, err := os.CreateTemp(filepath.Dir(path), "dnt_cache-*.tmp")
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	tmpName := tmp.Name()
	if _, err := tmp.Write(ciphertext); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		tmp.Close()
		_ = os.Remove(tmpName)
		return
	}
	if err := tmp.Close(); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		_ = os.Remove(tmpName)
		return
	}
	// CreateTemp uses 0600 by default, matching the previous WriteFile mode;
	// chmod explicitly in case the umask/implementation differs.
	if err := os.Chmod(tmpName, 0600); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		_ = os.Remove(tmpName)
		return
	}
	if err := os.Rename(tmpName, path); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
		_ = os.Remove(tmpName)
	}
}
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe.
//
// In this file it is called only from newDNTCache, before the cache is shared
// with other goroutines, so appending to c.entries without holding mu is safe.
func (c *dntCache) loadFromDisk() {
	if c.aesKey == nil {
		return // persistence disabled (key derivation failed)
	}
	log := oclib.GetLogger()
	path := dntCachePath()
	data, err := os.ReadFile(path)
	if err != nil {
		if !os.IsNotExist(err) {
			log.Warn().Err(err).Msg("[dnt] failed to read cache file")
		}
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return
	}
	if len(data) < gcm.NonceSize() {
		log.Warn().Msg("[dnt] cache file too short, ignoring")
		return
	}
	// File layout (see persistToDisk): nonce || AES-GCM ciphertext.
	nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
		return
	}
	var saved []dntEntryJSON
	if err := json.Unmarshal(plaintext, &saved); err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
		return
	}
	count := 0
	for _, j := range saved {
		// Only restore critical entries — moderate entries are intentionally
		// not persisted, but this guard defends against format changes.
		// Comma-ok lookup: dntCritical is dntLevel's zero value, so the old
		// `dntProtocols[j.Proto] != dntCritical` wrongly admitted entries
		// whose protocol is absent from the map.
		if level, ok := dntProtocols[j.Proto]; !ok || level != dntCritical {
			continue
		}
		c.entries = append(c.entries, entryFromJSON(j))
		count++
	}
	if count > 0 {
		log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
	}
}
// ── Retry loop ────────────────────────────────────────────────────────────────

// startDNTLoop runs the background retry goroutine. Call once after init.
//
// NOTE(review): the loop has no stop channel/context, so it lives for the
// whole process lifetime — confirm that is intended.
func (s *StreamService) startDNTLoop() {
	logger := oclib.GetLogger()
	ticker := time.NewTicker(dntRetryInterval)
	defer ticker.Stop()
	for range ticker.C {
		entries := s.dnt.drain()
		if len(entries) == 0 {
			continue
		}
		var keep []*dntEntry
		for _, e := range entries {
			// Comma-ok lookup: dntCritical is the zero value of dntLevel, so
			// the old bare lookup silently treated protocols ABSENT from
			// dntProtocols as critical and retried them forever.
			level, known := dntProtocols[e.proto]
			if _, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto); err == nil {
				if known && level == dntCritical {
					logger.Info().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Msg("[dnt] critical message delivered after retry")
				} else {
					logger.Info().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Int("retries", e.retries).
						Msg("[dnt] moderate message delivered after retry")
				}
				continue
			}
			switch {
			case !known:
				// Unlisted protocols should never be enqueued (see enqueue);
				// drop rather than retrying indefinitely.
				logger.Warn().
					Str("proto", string(e.proto)).
					Str("peer", e.did).
					Msg("[dnt] entry with unlisted protocol dropped")
			case level == dntCritical:
				keep = append(keep, e) // retry forever
			default: // dntModerate
				e.retries++
				if e.retries < dntMaxModerateRetries {
					keep = append(keep, e)
				} else {
					logger.Warn().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Int("retries", e.retries).
						Msg("[dnt] moderate message abandoned after max retries")
				}
			}
		}
		s.dnt.requeue(keep)
		// Persist after each tick so the on-disk file reflects the current
		// state (entries delivered are removed, new ones from concurrent
		// enqueues are included).
		go s.dnt.persistToDisk()
	}
}

View File

@@ -14,14 +14,23 @@ import (
"cloud.o-forge.io/core/oc-lib/models/peer"
"cloud.o-forge.io/core/oc-lib/models/resources"
"cloud.o-forge.io/core/oc-lib/tools"
"github.com/libp2p/go-libp2p/core/network"
)
type Verify struct {
IsVerified bool `json:"is_verified"`
}
func (ps *StreamService) handleEvent(protocol string, evt *common.Event) error {
fmt.Println("handleEvent")
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
fmt.Println("handleEvent", protocol)
// Heartbeat received on an outgoing ProtocolObserve stream.
if protocol == ProtocolObserve {
return ps.handleIncomingObserve(s)
}
if protocol == observeHBEventType {
return ps.handleObserveHeartbeat(evt)
}
ps.handleEventFromPartner(evt, protocol)
/*if protocol == ProtocolVerifyResource {
if evt.DataType == -1 {
@@ -159,7 +168,7 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
And: map[string][]dbs.Filter{
"peer_id": {{Operator: dbs.EQUAL.String(), Value: evt.From}},
},
}, evt.From, false)
}, evt.From, false, 0, 1)
if len(peers.Data) > 0 {
p := peers.Data[0].(*peer.Peer)
ps.SendResponse(p, evt, fmt.Sprintf("%v", search))
@@ -212,7 +221,7 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
} else {
for _, dt := range dts {
access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false)
searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
for _, ss := range searched.Data {
if j, err := json.Marshal(ss); err == nil {
abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)

View File

@@ -0,0 +1,552 @@
package stream
import (
"context"
"encoding/json"
"errors"
"fmt"
"sync"
"time"
"oc-discovery/daemons/node/common"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/peer"
"cloud.o-forge.io/core/oc-lib/tools"
"github.com/libp2p/go-libp2p/core/network"
pp "github.com/libp2p/go-libp2p/core/peer"
)
// ProtocolObserve is the libp2p protocol for peer connectivity observation.
// The requesting oc-discovery opens a stream to the remote oc-discovery and
// sends an ObserveRequest. The remote side keeps the stream open and writes
// ObserveHeartbeat events back every observeHBInterval seconds.
const ProtocolObserve = "/opencloud/peer/observe/1.0"

// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"

// observeHBInterval is the period between heartbeats on an observe stream.
const observeHBInterval = 30 * time.Second

// observeDrainDuration — presumably the grace period allowed for draining an
// observe stream on teardown; its usage is not visible in this chunk, confirm.
const observeDrainDuration = 30 * time.Second

// observeBatchWindow is the accumulation window before a heartbeat batch is
// flushed to NATS. All peer heartbeats received within this window are grouped
// into a single PEER_OBSERVE_RESPONSE_EVENT, reducing NATS traffic.
const observeBatchWindow = 2 * time.Second

// ObserveRequest is the first (and only) message sent by the observing side
// when opening a ProtocolObserve stream.
type ObserveRequest struct {
	// Close, when true, asks the remote side to stop the heartbeat goroutine
	// and remove the observer from its cache. Used for graceful teardown.
	Close bool `json:"close,omitempty"`
}

// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
	State string `json:"state"` // always "online" when actively emitted
}

// ShallowPeer is the minimal peer representation sent by oc-peer in a
// PEER_OBSERVE_EVENT. StreamAddress lets oc-discovery connect without a DB
// lookup; Address carries the NATSAddress (unused here, forwarded as-is).
type ShallowPeer struct {
	ID            string `json:"id"`
	PeerID        string `json:"peer_id"`
	Address       string `json:"address"`
	StreamAddress string `json:"stream_address"`
}

// ObserveCommand is the payload carried by a PEER_OBSERVE_EVENT NATS message
// (from oc-peer).
//
// Observe  → User + Peers populated
// Close    → User + PeerIDs + Close=true
// CloseAll → CloseAll=true (User optional)
type ObserveCommand struct {
	User     string        `json:"user"`
	Peers    []ShallowPeer `json:"peers,omitempty"`
	PeerIDs  []string      `json:"peer_ids,omitempty"`
	Close    bool          `json:"close,omitempty"`
	CloseAll bool          `json:"close_all,omitempty"`
}
// ── observe cache (observed side) ────────────────────────────────────────────
// observeCache tracks running heartbeat goroutines keyed by the observing
// peer's libp2p PeerID string. It is used exclusively on the OBSERVED side.
type observeCache struct {
mu sync.Mutex
cancels map[string]context.CancelFunc
}
func newObserveCache() *observeCache {
return &observeCache{cancels: map[string]context.CancelFunc{}}
}
func (c *observeCache) set(pid string, cancel context.CancelFunc) {
c.mu.Lock()
defer c.mu.Unlock()
if old, ok := c.cancels[pid]; ok {
old() // cancel previous goroutine if any
}
c.cancels[pid] = cancel
}
func (c *observeCache) cancel(pid string) {
c.mu.Lock()
defer c.mu.Unlock()
if fn, ok := c.cancels[pid]; ok {
fn()
delete(c.cancels, pid)
}
}
// cancelAll stops every registered heartbeat goroutine and resets the cache
// to an empty map.
func (c *observeCache) cancelAll() {
	c.mu.Lock()
	defer c.mu.Unlock()
	for _, stop := range c.cancels {
		stop()
	}
	c.cancels = make(map[string]context.CancelFunc)
}
// delete removes pid's entry without invoking its cancel function; the
// heartbeat goroutine calls this on itself when it exits.
func (c *observeCache) delete(pid string) {
	c.mu.Lock()
	delete(c.cancels, pid)
	c.mu.Unlock()
}
// ── heartbeat batcher (observing side) ───────────────────────────────────────
// heartbeatBatcher accumulates peer_ids from incoming heartbeats over
// observeBatchWindow, then flushes them in a single NATS call.
// Using a map as the backing store deduplicates multiple heartbeats from the
// same peer within the same window (should not happen, but is harmless).
type heartbeatBatcher struct {
	mu    sync.Mutex             // guards ids and timer
	ids   map[string]struct{}    // peer_ids queued for the next flush
	timer *time.Timer            // armed while a flush is pending; nil otherwise
	flush func(peerIDs []string) // invoked with each drained batch
}

// newHeartbeatBatcher returns a batcher that calls flush with each drained
// batch of peer IDs.
func newHeartbeatBatcher(flush func([]string)) *heartbeatBatcher {
	return &heartbeatBatcher{
		ids:   make(map[string]struct{}),
		flush: flush,
	}
}
// add records peerID in the current batch and arms the flush timer if no
// flush is already pending.
func (b *heartbeatBatcher) add(peerID string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.ids[peerID] = struct{}{}
	if b.timer != nil {
		return // a flush is already scheduled for this window
	}
	b.timer = time.AfterFunc(observeBatchWindow, b.fire)
}
// fire runs on timer expiry: it drains the batch under the lock, then invokes
// flush outside the lock so a slow flush cannot block add.
func (b *heartbeatBatcher) fire() {
	b.mu.Lock()
	drained := b.ids
	b.ids = make(map[string]struct{})
	b.timer = nil
	b.mu.Unlock()
	if len(drained) == 0 {
		return
	}
	ids := make([]string, 0, len(drained))
	for id := range drained {
		ids = append(ids, id)
	}
	b.flush(ids)
}
// flushObserveBatch is the flush function wired into the heartbeatBatcher.
// It emits two NATS messages:
// - PEER_OBSERVE_RESPONSE_EVENT → consumed by oc-peer (direct channel)
// - PROPALGATION_EVENT / PB_PROPAGATE → consumed by other oc-discovery nodes
func flushObserveBatch(peerIDs []string) {
payload, err := json.Marshal(map[string]interface{}{
"peer_ids": peerIDs,
"state": "online",
})
if err != nil {
return
}
// Direct notification to oc-peer.
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
Payload: payload,
})
// Broadcast to other oc-discovery nodes so they can forward to their
// local oc-peer if needed.
propPayload, err := json.Marshal(tools.PropalgationMessage{
DataType: int(tools.PEER),
Action: tools.PB_PROPAGATE,
Payload: payload,
})
if err != nil {
return
}
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
Method: int(tools.PROPALGATION_EVENT),
Payload: propPayload,
})
}
// ── incoming observe handler (observed side) ──────────────────────────────────
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
// It is called when a remote peer opens an observe stream to us.
// The function validates the requester, then starts the heartbeat goroutine
// and returns immediately — the goroutine owns the stream.
//
// NOTE(review): the ObserveRequest carried by the incoming stream is never
// actually read here — confirm whether the Close flag is expected on this
// path or only via NATS.
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
	log := oclib.GetLogger()
	remotePeerID := rawStream.Conn().RemotePeer().String()
	addr := rawStream.Conn().RemoteMultiaddr().String()
	ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
	if err != nil {
		log.Warn().Str("addr", addr).Err(err).Msg("[observe] invalid observer address")
		return err
	}
	// Drain mode: reject any new observations during the drain window after a
	// close-all, so stale heartbeats cannot mix with fresh observations.
	s.drainMu.RLock()
	draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
	s.drainMu.RUnlock()
	if draining {
		rawStream.Close()
		return errors.New("Draining")
	}
	// Guard: the requesting peer must not be blacklisted.
	did := ""
	access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
	res := access.Search(&dbs.Filters{
		And: map[string][]dbs.Filter{
			"peer_id": {{Operator: dbs.EQUAL.String(), Value: remotePeerID}},
		},
	}, "", false, 0, 1)
	if len(res.Data) > 0 {
		p := res.Data[0].(*peer.Peer)
		did = p.GetID()
		if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
			rawStream.Close()
			return errors.New("can't exploit blacklist or self")
		}
	}
	// Replace any existing heartbeat goroutine for this observer.
	ctx, cancel := context.WithCancel(context.Background())
	s.observeCache.set(remotePeerID, cancel)
	go func() {
		defer rawStream.Close()
		defer cancel()
		defer s.observeCache.delete(remotePeerID)
		ticker := time.NewTicker(observeHBInterval)
		defer ticker.Stop()
		hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
		evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
		if evt == nil {
			return
		}
		// sendHeartbeat (re)opens a stream to the observer if needed and writes
		// one heartbeat event. It returns false on a write failure — a moderate
		// connectivity event — so the goroutine exits and the deferred calls
		// above purge this observer from the cache. A failure to open the
		// stream is tolerated: we simply retry on the next tick.
		sendHeartbeat := func() bool {
			var err error
			if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err != nil {
				return true
			}
			stream := s.Streams[ProtocolObserve][ad.ID]
			if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
				log.Info().
					Str("observer", remotePeerID).
					Err(err).
					Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
				return false
			}
			return true
		}
		// First heartbeat immediately, then one per tick.
		if !sendHeartbeat() {
			return
		}
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				// NOTE(review): the deadline is set on rawStream while the
				// heartbeat is written to the TempStream stream — confirm
				// which stream should carry the write deadline.
				rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
				if !sendHeartbeat() {
					return
				}
				rawStream.SetWriteDeadline(time.Time{})
			}
		}
	}()
	return nil
}
// ── heartbeat receiver (observing side) ───────────────────────────────────────
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
// on an outgoing ProtocolObserve stream. It forwards the sender's peer_id to
// NATS via flushObserveBatch. Always returns nil.
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
	// NOTE(review): the batcher is bypassed here, so every heartbeat triggers
	// an immediate NATS flush instead of being coalesced over
	// observeBatchWindow — confirm whether batching should be re-enabled.
	// ps.hbBatcher.add(evt.From)
	flushObserveBatch([]string{evt.From})
	return nil
}
// ── user→peer index (ref-counted observe management) ─────────────────────────
// userPeerIndex tracks which users are observing which peers.
// A libp2p observe stream is kept open as long as at least one user watches
// the peer; it is closed only when the last user stops.
type userPeerIndex struct {
	mu    sync.Mutex                     // guards index
	index map[string]map[string]struct{} // user → set of peer_id strings
}

// newUserPeerIndex returns an empty index.
func newUserPeerIndex() *userPeerIndex {
	return &userPeerIndex{index: map[string]map[string]struct{}{}}
}
// add registers user as an observer of peerID.
// Returns true if peerID was not yet observed by any user (first observer).
func (u *userPeerIndex) add(user, peerID string) (isFirst bool) {
	u.mu.Lock()
	defer u.mu.Unlock()
	// Determine whether anyone at all was watching peerID before this call.
	isFirst = true
	for _, watched := range u.index {
		if _, observed := watched[peerID]; observed {
			isFirst = false
			break
		}
	}
	if u.index[user] == nil {
		u.index[user] = map[string]struct{}{}
	}
	u.index[user][peerID] = struct{}{}
	return isFirst
}
// remove unregisters user from peerID.
// Returns true if no user is observing peerID anymore (last observer removed).
func (u *userPeerIndex) remove(user, peerID string) (isLast bool) {
	u.mu.Lock()
	defer u.mu.Unlock()
	delete(u.index[user], peerID) // delete on a nil inner map is a no-op
	if len(u.index[user]) == 0 {
		delete(u.index, user) // drop users with an empty watch set
	}
	// peerID is orphaned iff no remaining user still watches it.
	for _, watched := range u.index {
		if _, stillObserved := watched[peerID]; stillObserved {
			return false
		}
	}
	return true
}
// removeUser removes all entries for user and returns the peer_ids that now
// have no remaining observers (i.e., those whose streams should be closed).
func (u *userPeerIndex) removeUser(user string) []string {
	u.mu.Lock()
	defer u.mu.Unlock()
	watched := u.index[user]
	delete(u.index, user)
	// stillWatched reports whether any remaining user observes peerID.
	stillWatched := func(peerID string) bool {
		for _, peers := range u.index {
			if _, ok := peers[peerID]; ok {
				return true
			}
		}
		return false
	}
	var orphans []string
	for peerID := range watched {
		if !stillWatched(peerID) {
			orphans = append(orphans, peerID)
		}
	}
	return orphans
}
// ── NATS command handler (observing side) ─────────────────────────────────────
// HandleObserveNATSCommand processes a PEER_OBSERVE_EVENT received from
// oc-peer. Depending on the command form it tears everything down (CloseAll),
// stops observing specific peers for one user (Close), or starts observing
// the peers listed in the payload.
func (ps *StreamService) HandleObserveNATSCommand(resp tools.NATSResponse) {
	log := oclib.GetLogger()
	var cmd ObserveCommand
	if err := json.Unmarshal(resp.Payload, &cmd); err != nil {
		log.Warn().Err(err).Msg("[observe] failed to unmarshal ObserveCommand")
		return
	}
	switch {
	case cmd.CloseAll:
		log.Info().Msg("[observe] close-all received via NATS")
		ps.CloseAllObserves()
	case cmd.Close:
		// Drop the user's interest; close a stream only when the peer ends up
		// with no observers left.
		for _, peerID := range cmd.PeerIDs {
			if !ps.observeUsers.remove(cmd.User, peerID) {
				continue
			}
			if err := ps.closeObserveStream(peerID); err != nil {
				log.Warn().Str("peer", peerID).Err(err).Msg("[observe] closeObserveStream failed")
			}
		}
	default:
		// Observe: open a stream for each peer not yet watched by anyone,
		// using the address from the payload.
		for _, p := range cmd.Peers {
			if !ps.observeUsers.add(cmd.User, p.PeerID) {
				continue
			}
			if err := ps.openObserveStream(p); err != nil {
				// Roll back the index entry so the next NATS command can retry.
				ps.observeUsers.remove(cmd.User, p.PeerID)
				log.Warn().Str("peer", p.PeerID).Err(err).Msg("[observe] openObserveStream failed")
			}
		}
	}
}
// ── outgoing observe management (observing side) ──────────────────────────────
// OpenObserveStream is the exported variant for inter-discovery propagation
// (no user context available). It bypasses the user index and opens the
// stream directly if not already open. Because observeUsers is not updated,
// streams opened here carry no ref-count from any user.
func (ps *StreamService) OpenObserveStream(p ShallowPeer) error {
	return ps.openObserveStream(p)
}

// CloseObserveStream is the exported variant for inter-discovery propagation.
// It closes the stream unconditionally, without consulting the user index.
func (ps *StreamService) CloseObserveStream(toPeerID string) error {
	return ps.closeObserveStream(toPeerID)
}
// openObserveStream opens a ProtocolObserve stream to p and sends the initial
// observe request, then starts a readLoop that receives heartbeats.
// Uses p.StreamAddress directly; falls back to the local DB then a DHT lookup
// if empty. Refuses to observe ourselves, and silently skips peers whose
// address cannot be resolved. If a stream is already open it is reused.
func (ps *StreamService) openObserveStream(p ShallowPeer) error {
	log := oclib.GetLogger()
	streamAddr := p.StreamAddress
	access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
	res := access.Search(&dbs.Filters{
		And: map[string][]dbs.Filter{
			"peer_id": {{Operator: dbs.EQUAL.String(), Value: p.PeerID}},
		},
	}, "", false, 0, 1)
	if streamAddr == "" {
		// Fallback: DB record first, then DHT lookup.
		if len(res.Data) > 0 {
			streamAddr = res.Data[0].(*peer.Peer).StreamAddress
		} else if peers, err := ps.Node.GetPeerRecord(context.Background(), p.PeerID); err == nil && len(peers) > 0 {
			streamAddr = peers[0].StreamAddress
		}
	}
	if len(res.Data) > 0 && res.Data[0].(*peer.Peer).Relation == peer.SELF {
		return errors.New("Can't send to self")
	}
	if streamAddr == "" {
		return nil // can't resolve address — silently skip
	}
	decodedID, err := pp.Decode(p.PeerID)
	if err != nil {
		return err
	}
	// If a stream already exists, reuse it.
	ps.Mu.RLock()
	_, alreadyOpen := ps.Streams[ProtocolObserve][decodedID]
	ps.Mu.RUnlock()
	if alreadyOpen {
		return nil
	}
	ad, err := pp.AddrInfoFromString(streamAddr)
	if err != nil {
		return err
	}
	// Marshal the request up front so a marshalling failure is reported
	// instead of being silently swallowed (the original shadowed err here and
	// fell through to "return nil").
	hbPayload, err := json.Marshal(ObserveRequest{Close: false})
	if err != nil {
		return err
	}
	if ps.Streams, err = common.TempStream(ps.Host, *ad, ProtocolObserve, p.ID, ps.Streams, protocols, &ps.Mu); err != nil {
		return err
	}
	rawStream := ps.Streams[ProtocolObserve][ad.ID]
	if err := json.NewEncoder(rawStream.Stream).Encode(common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", hbPayload)); err != nil {
		log.Warn().Str("peer", p.PeerID).Err(err).Msg("[observe] initial observe request write failed")
		rawStream.Stream.Close()
		return err
	}
	s := &common.Stream{
		Stream: rawStream.Stream,
		// Effectively no expiry: observe streams live until explicitly closed.
		Expiry: time.Now().Add(365 * 24 * time.Hour),
	}
	ps.Mu.Lock()
	if ps.Streams[ProtocolObserve] == nil {
		ps.Streams[ProtocolObserve] = map[pp.ID]*common.Stream{}
	}
	ps.Streams[ProtocolObserve][ad.ID] = s
	ps.Mu.Unlock()
	go ps.readLoop(s, ad.ID, ProtocolObserve, &common.ProtocolInfo{PersistantStream: true})
	return nil
}
// closeObserveStream closes the ProtocolObserve stream to toPeerID, after a
// best-effort notification to the remote side, and drops it from the map.
// Unknown peers are a no-op.
func (ps *StreamService) closeObserveStream(toPeerID string) error {
	decodedID, err := pp.Decode(toPeerID)
	if err != nil {
		return err
	}
	ps.Mu.Lock()
	defer ps.Mu.Unlock()
	observeStreams := ps.Streams[ProtocolObserve]
	if observeStreams == nil {
		return nil
	}
	s, ok := observeStreams[decodedID]
	if !ok {
		return nil
	}
	// Best-effort notification; the stream is closed regardless of the result.
	// NOTE(review): this encodes a bare ObserveRequest while openObserveStream
	// wraps its request in a common.Event — confirm the remote side accepts both.
	_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
	s.Stream.Close()
	delete(observeStreams, decodedID)
	return nil
}
// CloseAllObserves closes every outgoing ProtocolObserve stream, clears the
// user index, and enters drain mode for observeDrainDuration so incoming
// observe requests are rejected until the window elapses.
func (ps *StreamService) CloseAllObserves() {
	ps.Mu.Lock()
	for _, s := range ps.Streams[ProtocolObserve] {
		// Best-effort close notification to the remote side.
		_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
		s.Stream.Close()
	}
	delete(ps.Streams, ProtocolObserve)
	ps.Mu.Unlock()
	// Reset user index so stale ref-counts don't block future opens.
	// NOTE(review): this pointer swap is not guarded by any lock while other
	// goroutines may be calling ps.observeUsers.add/remove concurrently —
	// confirm this cannot race with HandleObserveNATSCommand.
	ps.observeUsers = newUserPeerIndex()
	ps.drainMu.Lock()
	ps.drainUntil = time.Now().Add(observeDrainDuration)
	ps.drainMu.Unlock()
}

View File

@@ -6,6 +6,8 @@ import (
"errors"
"fmt"
"oc-discovery/daemons/node/common"
"strings"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
@@ -19,9 +21,9 @@ func (ps *StreamService) PublishesCommon(dt *tools.DataType, user string, groups
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
var p oclib.LibDataShallow
if filter == nil {
p = access.LoadAll(false)
p = access.LoadAll(false, 0, 10000)
} else {
p = access.Search(filter, "", false)
p = access.Search(filter, "", false, 0, 10000)
}
for _, pes := range p.Data {
for _, proto := range protos {
@@ -45,7 +47,7 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
And: map[string][]dbs.Filter{ // search by name if no filters are provided
"peer_id": {{Operator: dbs.EQUAL.String(), Value: toPeerID}},
},
}, toPeerID, false)
}, toPeerID, false, 0, 1)
var pe *peer.Peer
if len(p.Data) > 0 && p.Data[0].(*peer.Peer).Relation != peer.BLACKLIST {
pe = p.Data[0].(*peer.Peer)
@@ -57,13 +59,36 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
if err != nil {
return nil, err
}
return ps.write(toPeerID, ad, dt, user, resource, proto)
stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
if err != nil {
if _, ok := dntProtocols[proto]; ok {
ps.dnt.enqueue(&dntEntry{
did: toPeerID,
addr: *ad,
dt: dt,
user: user,
payload: resource,
proto: proto,
addedAt: time.Now().UTC(),
})
}
return nil, err
}
return stream, nil
}
return nil, errors.New("peer unvalid " + toPeerID)
}
func (ps *StreamService) ToPartnerPublishEvent(
ctx context.Context, action tools.PubSubAction, dt *tools.DataType, user string, groups []string, payload []byte) error {
var proto protocol.ID
proto = ProtocolCreateResource
switch action {
case tools.PB_DELETE:
proto = ProtocolDeleteResource
case tools.PB_UPDATE:
proto = ProtocolUpdateResource
}
if *dt == tools.PEER {
var p peer.Peer
if err := json.Unmarshal(payload, &p); err != nil {
@@ -87,25 +112,30 @@ func (ps *StreamService) ToPartnerPublishEvent(
}
}
var per peer.Peer
if err := json.Unmarshal(payload, &per); err == nil && !strings.Contains(per.Relation.String(), "master") && !strings.Contains(per.Relation.String(), "nano") {
for _, rel := range []peer.PeerRelation{peer.MASTER, peer.NANO} {
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
},
}, payload, proto)
}
}
return nil
}
ks := []protocol.ID{}
for k := range protocolsPartners {
ks = append(ks, k)
}
var proto protocol.ID
proto = ProtocolCreateResource
switch action {
case tools.PB_DELETE:
proto = ProtocolDeleteResource
case tools.PB_UPDATE:
proto = ProtocolUpdateResource
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
},
}, payload, proto)
}
ps.PublishesCommon(dt, user, groups, &dbs.Filters{ // filter by like name, short_description, description, owner, url if no filters are provided
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.PARTNER}},
},
}, payload, proto)
return nil
}
@@ -129,7 +159,6 @@ func (s *StreamService) write(
if s.Streams, err = common.TempStream(s.Host, *peerID, proto, did, s.Streams, pts, &s.Mu); err != nil {
fmt.Println("TempStream", err)
return nil, errors.New("no stream available for protocol " + fmt.Sprintf("%v", proto) + " from PID " + peerID.ID.String())
}
stream := s.Streams[proto][peerID.ID]

View File

@@ -12,6 +12,7 @@ import (
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/config"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/peer"
"cloud.o-forge.io/core/oc-lib/models/utils"
@@ -42,6 +43,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
ProtocolVerifyResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolMinioConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
ProtocolObserve: {WaitResponse: true, TTL: 1 * time.Minute},
}
var protocolsPartners = map[protocol.ID]*common.ProtocolInfo{
@@ -61,6 +63,21 @@ type StreamService struct {
// IsPeerKnown, when set, is called at stream open for every inbound protocol.
// Return false to reset the stream immediately. Left nil until wired by the node.
IsPeerKnown func(pid pp.ID) bool
// dnt is the Disconnection Network Tolerance cache for outbound streams.
dnt *dntCache
// observeCache tracks running heartbeat goroutines on the OBSERVED side.
observeCache *observeCache
// hbBatcher accumulates incoming heartbeats (observing side) and flushes
// them as a single NATS batch after observeBatchWindow.
hbBatcher *heartbeatBatcher
// drainUntil / drainMu implement the startup drain window: for 30 s after a
// close-all, incoming ProtocolObserve requests are rejected so stale heartbeats
// from a previous run cannot mix with fresh observations.
drainUntil time.Time
drainMu sync.RWMutex
// observeUsers tracks which users are observing which peers so streams are
// closed only when the last observer for a peer disconnects.
observeUsers *userPeerIndex
}
func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
@@ -72,31 +89,60 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
Streams: common.ProtocolStream{},
maxNodesConn: maxNode,
ResourceSearches: common.NewSearchTracker(),
dnt: newDNTCache(),
observeCache: newObserveCache(),
observeUsers: newUserPeerIndex(),
}
service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
for proto := range protocols {
service.Host.SetStreamHandler(proto, service.gate(service.HandleResponse))
}
// ProtocolObserve uses a dedicated handler (bidirectional, long-lived).
logger.Info().Msg("connect to partners...")
service.connectToPartners() // we set up a stream
go service.StartGC(8 * time.Second)
go service.startDNTLoop()
return service, nil
}
// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Stream) {
return func(stream network.Stream) {
if config.GetConfig().IsNano {
d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
And: map[string][]dbs.Filter{
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.MASTER}},
},
}, "", false, 0, 1)
if len(d.Data) == 0 {
return
}
}
s.knowingGate(stream, h)
}
}
// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
func (s *StreamService) gate(h func(network.Stream)) func(network.Stream) {
return func(stream network.Stream) {
if s.IsPeerKnown != nil && !s.IsPeerKnown(stream.Conn().RemotePeer()) {
logger := oclib.GetLogger()
logger.Warn().Str("peer", stream.Conn().RemotePeer().String()).Msg("[stream] unknown peer, closing connection")
stream.Conn().Close()
return
}
h(stream)
s.knowingGate(stream, h)
}
}
func (s *StreamService) knowingGate(stream network.Stream, h func(network.Stream)) {
if s.IsPeerKnown != nil && !s.IsPeerKnown(stream.Conn().RemotePeer()) {
logger := oclib.GetLogger()
logger.Warn().Str("peer", stream.Conn().RemotePeer().String()).Msg("[stream] unknown peer, closing connection")
stream.Conn().Close()
return
}
h(stream)
}
func (s *StreamService) HandleResponse(stream network.Stream) {
s.Mu.Lock()
defer s.Mu.Unlock()
@@ -137,13 +183,27 @@ func (s *StreamService) connectToPartners() error {
go s.readLoop(s.Streams[proto][ss.Conn().RemotePeer()], ss.Conn().RemotePeer(), proto, info)
}
logger.Info().Msg("SetStreamHandler " + string(proto))
s.Host.SetStreamHandler(proto, s.gate(f))
s.Host.SetStreamHandler(proto, s.gatePrivilege(f))
}
return nil
}
func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
ps := []*peer.Peer{}
if conf.GetConfig().NanoIDS != "" {
for _, peerID := range strings.Split(conf.GetConfig().NanoIDS, ",") {
ppID := strings.Split(peerID, "/")
ps = append(ps, &peer.Peer{
AbstractObject: utils.AbstractObject{
UUID: uuid.New().String(),
Name: ppID[1],
},
PeerID: ppID[len(ppID)-1],
StreamAddress: peerID,
Relation: peer.NANO,
})
}
}
if conf.GetConfig().PeerIDS != "" {
for _, peerID := range strings.Split(conf.GetConfig().PeerIDS, ",") {
ppID := strings.Split(peerID, "/")
@@ -159,7 +219,7 @@ func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
}
}
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
peers := access.Search(nil, search, false)
peers := access.Search(nil, search, false, 0, 0)
for _, p := range peers.Data {
ps = append(ps, p.(*peer.Peer))
}
@@ -230,7 +290,7 @@ func (ps *StreamService) readLoop(s *common.Stream, id pp.ID, proto protocol.ID,
}
continue
}
ps.handleEvent(evt.Type, &evt)
ps.handleEvent(evt.Type, &evt, s.Stream)
if protocolInfo.WaitResponse && !protocolInfo.PersistantStream {
break
}