Discovery Nano, the light version.
daemons/node/stream/dnt_cache.go (new file, 362 lines)
@@ -0,0 +1,362 @@
package stream

// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - dntCritical : retry indefinitely (create / update / delete resource).
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.

import (
    "crypto/aes"
    "crypto/cipher"
    "crypto/rand"
    "crypto/sha256"
    "encoding/json"
    "io"
    "os"
    "path/filepath"
    "sync"
    "time"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/tools"
    "golang.org/x/crypto/hkdf"

    "oc-discovery/conf"

    pp "github.com/libp2p/go-libp2p/core/peer"
    "github.com/libp2p/go-libp2p/core/protocol"
)

type dntLevel int

const (
    dntCritical dntLevel = iota // retry until the message is delivered
    dntModerate                 // retry up to dntMaxModerateRetries times
)

const dntMaxModerateRetries = 3
const dntRetryInterval = 15 * time.Second

// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var dntProtocols = map[protocol.ID]dntLevel{
    // Critical — data mutations that must eventually be delivered.
    ProtocolCreateResource: dntCritical,
    ProtocolUpdateResource: dntCritical,
    ProtocolDeleteResource: dntCritical,
    // Moderate — confirmations / config / planner: 3 retries before abandon.
    ProtocolVerifyResource:          dntModerate,
    ProtocolSendPlanner:             dntModerate,
    ProtocolConsidersResource:       dntModerate,
    ProtocolMinioConfigResource:     dntModerate,
    ProtocolAdmiraltyConfigResource: dntModerate,
}
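
// dntLevelFor is an illustrative helper (not part of this commit): it shows the
// lookup contract of dntProtocols. A protocol absent from the map, such as
// ProtocolSearchResource, reports ok == false and is never cached on failure.
func dntLevelFor(proto protocol.ID) (dntLevel, bool) {
    level, ok := dntProtocols[proto]
    return level, ok
}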

// dntEntryJSON is the on-disk representation of a dntEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
type dntEntryJSON struct {
    DID     string          `json:"did"`
    Addr    pp.AddrInfo     `json:"addr"`
    DT      *tools.DataType `json:"dt,omitempty"`
    User    string          `json:"user"`
    Payload []byte          `json:"payload"`
    Proto   protocol.ID     `json:"proto"`
    Retries int             `json:"retries"`
    AddedAt time.Time       `json:"added_at"`
}

type dntEntry struct {
    did     string
    addr    pp.AddrInfo
    dt      *tools.DataType
    user    string
    payload []byte
    proto   protocol.ID
    retries int
    addedAt time.Time
}

func (e *dntEntry) toJSON() dntEntryJSON {
    return dntEntryJSON{
        DID:     e.did,
        Addr:    e.addr,
        DT:      e.dt,
        User:    e.user,
        Payload: e.payload,
        Proto:   e.proto,
        Retries: e.retries,
        AddedAt: e.addedAt,
    }
}

func entryFromJSON(j dntEntryJSON) *dntEntry {
    return &dntEntry{
        did:     j.DID,
        addr:    j.Addr,
        dt:      j.DT,
        user:    j.User,
        payload: j.Payload,
        proto:   j.Proto,
        retries: j.Retries,
        addedAt: j.AddedAt,
    }
}

type dntCache struct {
    mu      sync.Mutex
    entries []*dntEntry
    // aesKey is the derived AES-256 key used for on-disk encryption.
    // Nil when key derivation failed: persistence is disabled but the in-memory
    // cache continues to function normally.
    aesKey []byte
}

// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
func newDNTCache() *dntCache {
    log := oclib.GetLogger()
    c := &dntCache{}
    key, err := deriveDNTKey()
    if err != nil {
        log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
    } else {
        c.aesKey = key
        c.loadFromDisk()
    }
    return c
}

// enqueue adds an entry to the cache and persists critical entries to disk.
func (c *dntCache) enqueue(e *dntEntry) {
    c.mu.Lock()
    c.entries = append(c.entries, e)
    c.mu.Unlock()
    if dntProtocols[e.proto] == dntCritical {
        go c.persistToDisk()
    }
}

// drain atomically removes and returns all current entries.
func (c *dntCache) drain() []*dntEntry {
    c.mu.Lock()
    defer c.mu.Unlock()
    out := c.entries
    c.entries = nil
    return out
}

// requeue puts entries back at the head of the list, preserving any new
// entries added while the retry loop was running.
func (c *dntCache) requeue(entries []*dntEntry) {
    if len(entries) == 0 {
        return
    }
    c.mu.Lock()
    defer c.mu.Unlock()
    c.entries = append(entries, c.entries...)
}

// ── Persistence ──────────────────────────────────────────────────────────────

// dntCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func dntCachePath() string {
    return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
}

// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
    priv, err := tools.LoadKeyFromFilePrivate()
    if err != nil {
        return nil, err
    }
    // Raw() on a libp2p Ed25519 private key returns the 64-byte representation
    // (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
    raw, err := priv.Raw()
    if err != nil {
        return nil, err
    }
    reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
    key := make([]byte, 32)
    if _, err := io.ReadFull(reader, key); err != nil {
        return nil, err
    }
    return key, nil
}
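
// dntKeyIsStable is an illustrative check (not part of this commit): HKDF-SHA256
// is deterministic, so deriving the key twice from the same private key file
// yields the same 32 bytes. That is what lets loadFromDisk decrypt a cache file
// written before a crash without storing any extra secret.
func dntKeyIsStable() (bool, error) {
    k1, err := deriveDNTKey()
    if err != nil {
        return false, err
    }
    k2, err := deriveDNTKey()
    if err != nil {
        return false, err
    }
    return string(k1) == string(k2), nil // expected: true
}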

// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
func (c *dntCache) persistToDisk() {
    if c.aesKey == nil {
        return
    }
    log := oclib.GetLogger()
    c.mu.Lock()
    var toSave []dntEntryJSON
    for _, e := range c.entries {
        if dntProtocols[e.proto] == dntCritical {
            toSave = append(toSave, e.toJSON())
        }
    }
    c.mu.Unlock()

    plaintext, err := json.Marshal(toSave)
    if err != nil {
        return
    }

    block, err := aes.NewCipher(c.aesKey)
    if err != nil {
        return
    }
    gcm, err := cipher.NewGCM(block)
    if err != nil {
        return
    }
    nonce := make([]byte, gcm.NonceSize())
    if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
        return
    }
    ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)

    path := dntCachePath()
    tmp := path + ".tmp"
    if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
        log.Warn().Err(err).Msg("[dnt] failed to write cache file")
        return
    }
    if err := os.Rename(tmp, path); err != nil {
        log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
        _ = os.Remove(tmp)
    }
}

// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe.
func (c *dntCache) loadFromDisk() {
    if c.aesKey == nil {
        return
    }
    log := oclib.GetLogger()
    path := dntCachePath()
    data, err := os.ReadFile(path)
    if err != nil {
        if !os.IsNotExist(err) {
            log.Warn().Err(err).Msg("[dnt] failed to read cache file")
        }
        return
    }

    block, err := aes.NewCipher(c.aesKey)
    if err != nil {
        return
    }
    gcm, err := cipher.NewGCM(block)
    if err != nil {
        return
    }
    if len(data) < gcm.NonceSize() {
        log.Warn().Msg("[dnt] cache file too short, ignoring")
        return
    }
    nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
    plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
    if err != nil {
        log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
        return
    }

    var saved []dntEntryJSON
    if err := json.Unmarshal(plaintext, &saved); err != nil {
        log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
        return
    }

    count := 0
    for _, j := range saved {
        // Only restore critical entries — moderate entries are intentionally
        // not persisted, but this guard defends against format changes.
        if dntProtocols[j.Proto] != dntCritical {
            continue
        }
        c.entries = append(c.entries, entryFromJSON(j))
        count++
    }
    if count > 0 {
        log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
    }
}

// ── Retry loop ────────────────────────────────────────────────────────────────

// startDNTLoop runs the background retry goroutine. Call once after init.
func (s *StreamService) startDNTLoop() {
    logger := oclib.GetLogger()
    ticker := time.NewTicker(dntRetryInterval)
    defer ticker.Stop()
    for range ticker.C {
        entries := s.dnt.drain()
        if len(entries) == 0 {
            continue
        }
        var keep []*dntEntry
        for _, e := range entries {
            _, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
            if err == nil {
                level := dntProtocols[e.proto]
                if level == dntCritical {
                    logger.Info().
                        Str("proto", string(e.proto)).
                        Str("peer", e.did).
                        Msg("[dnt] critical message delivered after retry")
                } else {
                    logger.Info().
                        Str("proto", string(e.proto)).
                        Str("peer", e.did).
                        Int("retries", e.retries).
                        Msg("[dnt] moderate message delivered after retry")
                }
                continue
            }
            level := dntProtocols[e.proto]
            switch level {
            case dntCritical:
                keep = append(keep, e)
            case dntModerate:
                e.retries++
                if e.retries < dntMaxModerateRetries {
                    keep = append(keep, e)
                } else {
                    logger.Warn().
                        Str("proto", string(e.proto)).
                        Str("peer", e.did).
                        Int("retries", e.retries).
                        Msg("[dnt] moderate message abandoned after max retries")
                }
            }
        }
        s.dnt.requeue(keep)
        // Persist after each tick so the on-disk file reflects the current
        // state (entries delivered are removed, new ones from concurrent
        // enqueues are included).
        go s.dnt.persistToDisk()
    }
}
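
The caller side of this cache appears further down in this commit (PublishCommon): a failed write is enqueued only when its protocol has a DNT level, then retried by startDNTLoop on the next tick. A minimal sketch of that contract, assuming the write signature used later in the diff:

if _, err := ps.write(toPeerID, ad, dt, user, resource, proto); err != nil {
    if _, ok := dntProtocols[proto]; ok { // pubsub/search protocols are absent from the map, so never cached
        ps.dnt.enqueue(&dntEntry{
            did:     toPeerID,
            addr:    *ad,
            dt:      dt,
            user:    user,
            payload: resource,
            proto:   proto,
            addedAt: time.Now().UTC(),
        })
    }
}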

@@ -14,14 +14,23 @@ import (
    "cloud.o-forge.io/core/oc-lib/models/peer"
    "cloud.o-forge.io/core/oc-lib/models/resources"
    "cloud.o-forge.io/core/oc-lib/tools"
    "github.com/libp2p/go-libp2p/core/network"
)

type Verify struct {
    IsVerified bool `json:"is_verified"`
}

func (ps *StreamService) handleEvent(protocol string, evt *common.Event) error {
    fmt.Println("handleEvent")
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
    fmt.Println("handleEvent", protocol)
    // Heartbeat received on an outgoing ProtocolObserve stream.
    if protocol == ProtocolObserve {
        return ps.handleIncomingObserve(s)
    }
    if protocol == observeHBEventType {
        return ps.handleObserveHeartbeat(evt)
    }

    ps.handleEventFromPartner(evt, protocol)
    /*if protocol == ProtocolVerifyResource {
        if evt.DataType == -1 {

@@ -159,7 +168,7 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
            And: map[string][]dbs.Filter{
                "peer_id": {{Operator: dbs.EQUAL.String(), Value: evt.From}},
            },
        }, evt.From, false)
        }, evt.From, false, 0, 1)
        if len(peers.Data) > 0 {
            p := peers.Data[0].(*peer.Peer)
            ps.SendResponse(p, evt, fmt.Sprintf("%v", search))

@@ -212,7 +221,7 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
    } else {
        for _, dt := range dts {
            access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
            searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false)
            searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
            for _, ss := range searched.Data {
                if j, err := json.Marshal(ss); err == nil {
                    abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)

daemons/node/stream/observe.go (new file, 552 lines)
@@ -0,0 +1,552 @@
package stream

import (
    "context"
    "encoding/json"
    "errors"
    "fmt"
    "sync"
    "time"

    "oc-discovery/daemons/node/common"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/dbs"
    "cloud.o-forge.io/core/oc-lib/models/peer"
    "cloud.o-forge.io/core/oc-lib/tools"
    "github.com/libp2p/go-libp2p/core/network"
    pp "github.com/libp2p/go-libp2p/core/peer"
)

// ProtocolObserve is the libp2p protocol for peer connectivity observation.
// The requesting oc-discovery opens a stream to the remote oc-discovery and
// sends an ObserveRequest. The remote side keeps the stream open and writes
// ObserveHeartbeat events back every observeHBInterval seconds.
const ProtocolObserve = "/opencloud/peer/observe/1.0"

// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"

const observeHBInterval = 30 * time.Second
const observeDrainDuration = 30 * time.Second

// observeBatchWindow is the accumulation window before a heartbeat batch is
// flushed to NATS. All peer heartbeats received within this window are grouped
// into a single PEER_OBSERVE_RESPONSE_EVENT, reducing NATS traffic.
const observeBatchWindow = 2 * time.Second

// ObserveRequest is the first (and only) message sent by the observing side
// when opening a ProtocolObserve stream.
type ObserveRequest struct {
    // Close, when true, asks the remote side to stop the heartbeat goroutine
    // and remove the observer from its cache. Used for graceful teardown.
    Close bool `json:"close,omitempty"`
}

// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
    State string `json:"state"` // always "online" when actively emitted
}

// ShallowPeer is the minimal peer representation sent by oc-peer in a
// PEER_OBSERVE_EVENT. StreamAddress lets oc-discovery connect without a DB
// lookup; Address carries the NATSAddress (unused here, forwarded as-is).
type ShallowPeer struct {
    ID            string `json:"id"`
    PeerID        string `json:"peer_id"`
    Address       string `json:"address"`
    StreamAddress string `json:"stream_address"`
}

// ObserveCommand is the payload carried by a PEER_OBSERVE_EVENT NATS message
// (from oc-peer).
//
// Observe  → User + Peers populated
// Close    → User + PeerIDs + Close=true
// CloseAll → CloseAll=true (User optional)
type ObserveCommand struct {
    User     string        `json:"user"`
    Peers    []ShallowPeer `json:"peers,omitempty"`
    PeerIDs  []string      `json:"peer_ids,omitempty"`
    Close    bool          `json:"close,omitempty"`
    CloseAll bool          `json:"close_all,omitempty"`
}
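
// exampleObserveCommands is an illustrative sketch (not part of this commit):
// the three shapes a PEER_OBSERVE_EVENT payload can take, per the fields above.
// Peer IDs and addresses are placeholders.
func exampleObserveCommands() ([][]byte, error) {
    cmds := []ObserveCommand{
        {User: "u1", Peers: []ShallowPeer{{ID: "db-id", PeerID: "12D3Koo...", StreamAddress: "/ip4/10.0.0.2/tcp/4021/p2p/12D3Koo..."}}}, // start observing
        {User: "u1", PeerIDs: []string{"12D3Koo..."}, Close: true},                                                                      // stop observing
        {CloseAll: true},                                                                                                               // tear everything down
    }
    out := make([][]byte, 0, len(cmds))
    for _, c := range cmds {
        b, err := json.Marshal(c)
        if err != nil {
            return nil, err
        }
        out = append(out, b)
    }
    return out, nil
}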

// ── observe cache (observed side) ────────────────────────────────────────────

// observeCache tracks running heartbeat goroutines keyed by the observing
// peer's libp2p PeerID string. It is used exclusively on the OBSERVED side.
type observeCache struct {
    mu      sync.Mutex
    cancels map[string]context.CancelFunc
}

func newObserveCache() *observeCache {
    return &observeCache{cancels: map[string]context.CancelFunc{}}
}

func (c *observeCache) set(pid string, cancel context.CancelFunc) {
    c.mu.Lock()
    defer c.mu.Unlock()
    if old, ok := c.cancels[pid]; ok {
        old() // cancel previous goroutine if any
    }
    c.cancels[pid] = cancel
}

func (c *observeCache) cancel(pid string) {
    c.mu.Lock()
    defer c.mu.Unlock()
    if fn, ok := c.cancels[pid]; ok {
        fn()
        delete(c.cancels, pid)
    }
}

func (c *observeCache) cancelAll() {
    c.mu.Lock()
    defer c.mu.Unlock()
    for _, fn := range c.cancels {
        fn()
    }
    c.cancels = map[string]context.CancelFunc{}
}

func (c *observeCache) delete(pid string) {
    c.mu.Lock()
    defer c.mu.Unlock()
    delete(c.cancels, pid)
}

// ── heartbeat batcher (observing side) ───────────────────────────────────────

// heartbeatBatcher accumulates peer_ids from incoming heartbeats over
// observeBatchWindow, then flushes them in a single NATS call.
// Using a map as the backing store deduplicates multiple heartbeats from the
// same peer within the same window (should not happen, but is harmless).
type heartbeatBatcher struct {
    mu    sync.Mutex
    ids   map[string]struct{}
    timer *time.Timer
    flush func(peerIDs []string)
}

func newHeartbeatBatcher(flush func([]string)) *heartbeatBatcher {
    return &heartbeatBatcher{
        ids:   make(map[string]struct{}),
        flush: flush,
    }
}

// add records peerID in the current batch and arms the flush timer if needed.
func (b *heartbeatBatcher) add(peerID string) {
    b.mu.Lock()
    defer b.mu.Unlock()
    b.ids[peerID] = struct{}{}
    if b.timer == nil {
        b.timer = time.AfterFunc(observeBatchWindow, b.fire)
    }
}

// fire is called by the timer; it drains the batch and invokes flush.
func (b *heartbeatBatcher) fire() {
    b.mu.Lock()
    ids := make([]string, 0, len(b.ids))
    for id := range b.ids {
        ids = append(ids, id)
    }
    b.ids = make(map[string]struct{})
    b.timer = nil
    b.mu.Unlock()
    if len(ids) > 0 {
        b.flush(ids)
    }
}
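
// exampleBatcherUsage is an illustrative sketch (not part of this commit):
// heartbeats added within the same observeBatchWindow are deduplicated and
// handed to the flush callback as a single slice.
func exampleBatcherUsage() {
    b := newHeartbeatBatcher(func(ids []string) {
        fmt.Println("flush", ids) // e.g. [peerA peerB] (order not guaranteed)
    })
    b.add("peerA")
    b.add("peerB")
    b.add("peerA")                               // duplicate within the window, kept once
    time.Sleep(observeBatchWindow + time.Second) // let the timer fire
}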

// flushObserveBatch is the flush function wired into the heartbeatBatcher.
// It emits two NATS messages:
// - PEER_OBSERVE_RESPONSE_EVENT → consumed by oc-peer (direct channel)
// - PROPALGATION_EVENT / PB_PROPAGATE → consumed by other oc-discovery nodes
func flushObserveBatch(peerIDs []string) {
    payload, err := json.Marshal(map[string]interface{}{
        "peer_ids": peerIDs,
        "state":    "online",
    })
    if err != nil {
        return
    }

    // Direct notification to oc-peer.
    tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
        FromApp:  "oc-discovery",
        Datatype: tools.PEER,
        Method:   int(tools.PEER_OBSERVE_RESPONSE_EVENT),
        Payload:  payload,
    })

    // Broadcast to other oc-discovery nodes so they can forward to their
    // local oc-peer if needed.
    propPayload, err := json.Marshal(tools.PropalgationMessage{
        DataType: int(tools.PEER),
        Action:   tools.PB_PROPAGATE,
        Payload:  payload,
    })
    if err != nil {
        return
    }
    tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
        FromApp:  "oc-discovery",
        Datatype: tools.PEER,
        Method:   int(tools.PROPALGATION_EVENT),
        Payload:  propPayload,
    })
}

// ── incoming observe handler (observed side) ──────────────────────────────────

// handleIncomingObserve is registered as the ProtocolObserve stream handler.
// It is called when a remote peer opens an observe stream to us.
// The function reads the request, validates it, then starts (or stops) the
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
    remotePeerID := rawStream.Conn().RemotePeer().String()
    addr := rawStream.Conn().RemoteMultiaddr().String()
    ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
    if err != nil {
        fmt.Println("qndlqnl EERR", addr, err)
        return err
    }
    log := oclib.GetLogger()

    // Drain mode: reject any new observations for 30 s after a close-all.
    s.drainMu.RLock()
    draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
    s.drainMu.RUnlock()
    if draining {
        rawStream.Close()
        fmt.Println("Draining")
        return errors.New("Draining")
    }
    // Read the observe request (with a generous deadline to avoid hangs).
    // Guard: the requesting peer must not be blacklisted or be ourself.
    did := ""
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    res := access.Search(&dbs.Filters{
        And: map[string][]dbs.Filter{
            "peer_id": {{Operator: dbs.EQUAL.String(), Value: remotePeerID}},
        },
    }, "", false, 0, 1)
    if len(res.Data) > 0 {
        p := res.Data[0].(*peer.Peer)
        did = p.GetID()
        if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
            rawStream.Close()
            fmt.Println("CLOSE blacklist or self")
            return errors.New("can't exploit blacklist or self")
        }
    }

    // Replace any existing heartbeat goroutine for this observer.
    ctx, cancel := context.WithCancel(context.Background())
    s.observeCache.set(remotePeerID, cancel)
    fmt.Println("LOOP OBSERVE")
    go func() {
        defer rawStream.Close()
        defer cancel()
        defer s.observeCache.delete(remotePeerID)

        ticker := time.NewTicker(observeHBInterval)
        defer ticker.Stop()

        hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
        evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
        if evt == nil {
            return
        }
        if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
            stream := s.Streams[ProtocolObserve][ad.ID]
            if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
                // Moderate connectivity event: the observer is unreachable.
                // The deferred calls above purge this observer from the cache.
                fmt.Println("LOOP EVT ERR", err)
                log.Info().
                    Str("observer", remotePeerID).
                    Err(err).
                    Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
                return
            }
        }
        for {
            select {
            case <-ctx.Done():
                return
            case <-ticker.C:

                rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
                fmt.Println("LOOP EVT", evt)
                var err error
                if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
                    stream := s.Streams[ProtocolObserve][ad.ID]
                    if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
                        // Moderate connectivity event: the observer is unreachable.
                        // The deferred calls above purge this observer from the cache.
                        fmt.Println("LOOP EVT ERR", err)
                        log.Info().
                            Str("observer", remotePeerID).
                            Err(err).
                            Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
                        return
                    }
                }
                rawStream.SetWriteDeadline(time.Time{})
            }
        }
    }()
    return nil
}

// ── heartbeat receiver (observing side) ───────────────────────────────────────

// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
// accumulator; the batcher flushes to NATS after observeBatchWindow.
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
    // ps.hbBatcher.add(evt.From)
    flushObserveBatch([]string{evt.From})
    return nil
}

// ── user→peer index (ref-counted observe management) ─────────────────────────

// userPeerIndex tracks which users are observing which peers.
// A libp2p observe stream is kept open as long as at least one user watches
// the peer; it is closed only when the last user stops.
type userPeerIndex struct {
    mu    sync.Mutex
    index map[string]map[string]struct{} // user → set of peer_id strings
}

func newUserPeerIndex() *userPeerIndex {
    return &userPeerIndex{index: map[string]map[string]struct{}{}}
}

// add registers user as an observer of peerID.
// Returns true if peerID was not yet observed by any user (first observer).
func (u *userPeerIndex) add(user, peerID string) (isFirst bool) {
    u.mu.Lock()
    defer u.mu.Unlock()
    // Count total observers for peerID across all users before adding.
    total := 0
    for _, peers := range u.index {
        if _, ok := peers[peerID]; ok {
            total++
        }
    }
    if u.index[user] == nil {
        u.index[user] = map[string]struct{}{}
    }
    u.index[user][peerID] = struct{}{}
    return total == 0
}

// remove unregisters user from peerID.
// Returns true if no user is observing peerID anymore (last observer removed).
func (u *userPeerIndex) remove(user, peerID string) (isLast bool) {
    u.mu.Lock()
    defer u.mu.Unlock()
    delete(u.index[user], peerID)
    if len(u.index[user]) == 0 {
        delete(u.index, user)
    }
    for _, peers := range u.index {
        if _, ok := peers[peerID]; ok {
            return false
        }
    }
    return true
}

// removeUser removes all entries for user and returns the peer_ids that now
// have no remaining observers (i.e., those whose streams should be closed).
func (u *userPeerIndex) removeUser(user string) []string {
    u.mu.Lock()
    defer u.mu.Unlock()
    watched := u.index[user]
    delete(u.index, user)
    var orphans []string
    for peerID := range watched {
        found := false
        for _, peers := range u.index {
            if _, ok := peers[peerID]; ok {
                found = true
                break
            }
        }
        if !found {
            orphans = append(orphans, peerID)
        }
    }
    return orphans
}
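
// exampleUserPeerIndex is an illustrative sketch (not part of this commit) of the
// ref-counting contract: the observe stream is opened for the first observer of a
// peer and closed only when the last one leaves, whoever the users are.
func exampleUserPeerIndex() {
    idx := newUserPeerIndex()
    fmt.Println(idx.add("alice", "peerX"))    // true:  first observer, open the stream
    fmt.Println(idx.add("bob", "peerX"))      // false: stream already open
    fmt.Println(idx.remove("alice", "peerX")) // false: bob still watches peerX
    fmt.Println(idx.remove("bob", "peerX"))   // true:  last observer, close the stream
}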

// ── NATS command handler (observing side) ─────────────────────────────────────

// HandleObserveNATSCommand processes a PEER_OBSERVE_EVENT received from oc-peer.
func (ps *StreamService) HandleObserveNATSCommand(resp tools.NATSResponse) {
    log := oclib.GetLogger()
    var cmd ObserveCommand
    if err := json.Unmarshal(resp.Payload, &cmd); err != nil {
        log.Warn().Err(err).Msg("[observe] failed to unmarshal ObserveCommand")
        return
    }
    if cmd.CloseAll {
        log.Info().Msg("[observe] close-all received via NATS")
        ps.CloseAllObserves()
        return
    }
    if cmd.Close {
        for _, peerID := range cmd.PeerIDs {
            if isLast := ps.observeUsers.remove(cmd.User, peerID); isLast {
                if err := ps.closeObserveStream(peerID); err != nil {
                    log.Warn().Str("peer", peerID).Err(err).Msg("[observe] closeObserveStream failed")
                }
            }
        }
        return
    }
    // Observe: open streams for any new peer, using the address from the payload.
    for _, p := range cmd.Peers {
        if isFirst := ps.observeUsers.add(cmd.User, p.PeerID); isFirst {
            if err := ps.openObserveStream(p); err != nil {
                // Roll back the index entry so the next NATS command can retry.
                ps.observeUsers.remove(cmd.User, p.PeerID)
                log.Warn().Str("peer", p.PeerID).Err(err).Msg("[observe] openObserveStream failed")
            }
        }
    }
}

// ── outgoing observe management (observing side) ──────────────────────────────

// OpenObserveStream is the exported variant for inter-discovery propagation
// (no user context available). It bypasses the user index and opens the stream
// directly if not already open.
func (ps *StreamService) OpenObserveStream(p ShallowPeer) error {
    return ps.openObserveStream(p)
}

// CloseObserveStream is the exported variant for inter-discovery propagation.
func (ps *StreamService) CloseObserveStream(toPeerID string) error {
    return ps.closeObserveStream(toPeerID)
}

// openObserveStream opens a ProtocolObserve stream to p.
// Uses p.StreamAddress directly; falls back to DB then DHT lookup if empty.
func (ps *StreamService) openObserveStream(p ShallowPeer) error {
    streamAddr := p.StreamAddress
    fmt.Println("STREAM OBS", streamAddr)
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    res := access.Search(&dbs.Filters{
        And: map[string][]dbs.Filter{
            "peer_id": {{Operator: dbs.EQUAL.String(), Value: p.PeerID}},
        },
    }, "", false, 0, 1)
    if streamAddr == "" {
        // Fallback: DB then DHT.
        if len(res.Data) > 0 {
            streamAddr = res.Data[0].(*peer.Peer).StreamAddress
        } else if peers, err := ps.Node.GetPeerRecord(context.Background(), p.PeerID); err == nil && len(peers) > 0 {
            streamAddr = peers[0].StreamAddress
        }
    }
    if len(res.Data) > 0 && res.Data[0].(*peer.Peer).Relation == peer.SELF {
        return errors.New("Can't send to self")
    }
    fmt.Println("STREAM OBS SSS", streamAddr)

    if streamAddr == "" {
        return nil // can't resolve address — silently skip
    }

    decodedID, err := pp.Decode(p.PeerID)
    if err != nil {
        return err
    }

    // If a stream already exists, reuse it.
    ps.Mu.RLock()
    _, alreadyOpen := ps.Streams[ProtocolObserve][decodedID]
    ps.Mu.RUnlock()
    if alreadyOpen {
        return nil
    }
    ad, err := pp.AddrInfoFromString(streamAddr)
    if err != nil {
        return err
    }
    fmt.Println("TempStream OBSERVE", ad)
    if ps.Streams, err = common.TempStream(ps.Host, *ad, ProtocolObserve, p.ID, ps.Streams, protocols, &ps.Mu); err == nil {
        rawStream := ps.Streams[ProtocolObserve][ad.ID]
        if hbPayload, err := json.Marshal(ObserveRequest{Close: false}); err == nil {
            if err := json.NewEncoder(rawStream.Stream).Encode(common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", hbPayload)); err != nil {
                fmt.Println("ERR")
                rawStream.Stream.Close()
                return err
            }
            s := &common.Stream{
                Stream: rawStream.Stream,
                Expiry: time.Now().Add(365 * 24 * time.Hour),
            }
            ps.Mu.Lock()
            if ps.Streams[ProtocolObserve] == nil {
                ps.Streams[ProtocolObserve] = map[pp.ID]*common.Stream{}
            }
            ps.Streams[ProtocolObserve][ad.ID] = s
            ps.Mu.Unlock()

            go ps.readLoop(s, ad.ID, ProtocolObserve, &common.ProtocolInfo{PersistantStream: true})
        }

    } else {
        return err
    }
    return nil
}

// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
// the remote side.
func (ps *StreamService) closeObserveStream(toPeerID string) error {
    decodedID, err := pp.Decode(toPeerID)
    if err != nil {
        return err
    }
    ps.Mu.Lock()
    if ps.Streams[ProtocolObserve] != nil {
        if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
            _ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
            s.Stream.Close()
            delete(ps.Streams[ProtocolObserve], decodedID)
        }
    }
    ps.Mu.Unlock()
    return nil
}

// CloseAllObserves closes every outgoing ProtocolObserve stream, clears the
// user index, and enters drain mode for observeDrainDuration.
func (ps *StreamService) CloseAllObserves() {
    ps.Mu.Lock()
    for _, s := range ps.Streams[ProtocolObserve] {
        _ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
        s.Stream.Close()
    }
    delete(ps.Streams, ProtocolObserve)
    ps.Mu.Unlock()

    // Reset user index so stale ref-counts don't block future opens.
    ps.observeUsers = newUserPeerIndex()

    ps.drainMu.Lock()
    ps.drainUntil = time.Now().Add(observeDrainDuration)
    ps.drainMu.Unlock()
}
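
End to end, the observing side works roughly as sketched below (an illustration under the assumptions above, not code from this commit): a PEER_OBSERVE_EVENT opens the stream for the first observer, heartbeats are batched back to oc-peer, and a close-all tears everything down and arms the 30 s drain window.

func exampleObserveLifecycle(ps *StreamService) {
    // 1. oc-peer asks us to observe a peer (first observer opens the stream).
    cmd, _ := json.Marshal(ObserveCommand{User: "u1", Peers: []ShallowPeer{{
        PeerID:        "12D3Koo...",
        StreamAddress: "/ip4/10.0.0.2/tcp/4021/p2p/12D3Koo...",
    }}})
    ps.HandleObserveNATSCommand(tools.NATSResponse{Payload: cmd})

    // 2. ObserveHeartbeat events now arrive every observeHBInterval and are
    //    forwarded to NATS as PEER_OBSERVE_RESPONSE_EVENT batches.

    // 3. Teardown: streams closed, user index reset, drain window armed, so
    //    handleIncomingObserve rejects new observers for observeDrainDuration.
    ps.CloseAllObserves()
}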

@@ -6,6 +6,8 @@ import (
    "errors"
    "fmt"
    "oc-discovery/daemons/node/common"
    "strings"
    "time"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/dbs"

@@ -19,9 +21,9 @@ func (ps *StreamService) PublishesCommon(dt *tools.DataType, user string, groups
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    var p oclib.LibDataShallow
    if filter == nil {
        p = access.LoadAll(false)
        p = access.LoadAll(false, 0, 10000)
    } else {
        p = access.Search(filter, "", false)
        p = access.Search(filter, "", false, 0, 10000)
    }
    for _, pes := range p.Data {
        for _, proto := range protos {

@@ -45,7 +47,7 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
        And: map[string][]dbs.Filter{ // search by name if no filters are provided
            "peer_id": {{Operator: dbs.EQUAL.String(), Value: toPeerID}},
        },
    }, toPeerID, false)
    }, toPeerID, false, 0, 1)
    var pe *peer.Peer
    if len(p.Data) > 0 && p.Data[0].(*peer.Peer).Relation != peer.BLACKLIST {
        pe = p.Data[0].(*peer.Peer)

@@ -57,13 +59,36 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
        if err != nil {
            return nil, err
        }
        return ps.write(toPeerID, ad, dt, user, resource, proto)
        stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
        if err != nil {
            if _, ok := dntProtocols[proto]; ok {
                ps.dnt.enqueue(&dntEntry{
                    did:     toPeerID,
                    addr:    *ad,
                    dt:      dt,
                    user:    user,
                    payload: resource,
                    proto:   proto,
                    addedAt: time.Now().UTC(),
                })
            }
            return nil, err
        }
        return stream, nil
    }
    return nil, errors.New("peer unvalid " + toPeerID)
}

func (ps *StreamService) ToPartnerPublishEvent(
    ctx context.Context, action tools.PubSubAction, dt *tools.DataType, user string, groups []string, payload []byte) error {
    var proto protocol.ID
    proto = ProtocolCreateResource
    switch action {
    case tools.PB_DELETE:
        proto = ProtocolDeleteResource
    case tools.PB_UPDATE:
        proto = ProtocolUpdateResource
    }
    if *dt == tools.PEER {
        var p peer.Peer
        if err := json.Unmarshal(payload, &p); err != nil {

@@ -87,25 +112,30 @@ func (ps *StreamService) ToPartnerPublishEvent(

        }
    }
    var per peer.Peer
    if err := json.Unmarshal(payload, &per); err == nil && !strings.Contains(per.Relation.String(), "master") && !strings.Contains(per.Relation.String(), "nano") {
        for _, rel := range []peer.PeerRelation{peer.MASTER, peer.NANO} {
            ps.PublishesCommon(dt, user, groups, &dbs.Filters{
                And: map[string][]dbs.Filter{
                    "relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
                },
            }, payload, proto)
        }
    }

    return nil
}
    ks := []protocol.ID{}
    for k := range protocolsPartners {
        ks = append(ks, k)
    }
    var proto protocol.ID
    proto = ProtocolCreateResource
    switch action {
    case tools.PB_DELETE:
        proto = ProtocolDeleteResource
    case tools.PB_UPDATE:
        proto = ProtocolUpdateResource
    for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
        ps.PublishesCommon(dt, user, groups, &dbs.Filters{
            And: map[string][]dbs.Filter{
                "relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
            },
        }, payload, proto)
    }
    ps.PublishesCommon(dt, user, groups, &dbs.Filters{ // filter by like name, short_description, description, owner, url if no filters are provided
        And: map[string][]dbs.Filter{
            "relation": {{Operator: dbs.EQUAL.String(), Value: peer.PARTNER}},
        },
    }, payload, proto)
    return nil
}

@@ -129,7 +159,6 @@ func (s *StreamService) write(
    if s.Streams, err = common.TempStream(s.Host, *peerID, proto, did, s.Streams, pts, &s.Mu); err != nil {
        fmt.Println("TempStream", err)
        return nil, errors.New("no stream available for protocol " + fmt.Sprintf("%v", proto) + " from PID " + peerID.ID.String())

    }

    stream := s.Streams[proto][peerID.ID]

@@ -12,6 +12,7 @@ import (
    "time"

    oclib "cloud.o-forge.io/core/oc-lib"
    "cloud.o-forge.io/core/oc-lib/config"
    "cloud.o-forge.io/core/oc-lib/dbs"
    "cloud.o-forge.io/core/oc-lib/models/peer"
    "cloud.o-forge.io/core/oc-lib/models/utils"

@@ -42,6 +43,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
    ProtocolVerifyResource:          {WaitResponse: true, TTL: 1 * time.Minute},
    ProtocolMinioConfigResource:     {WaitResponse: true, TTL: 1 * time.Minute},
    ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
    ProtocolObserve:                 {WaitResponse: true, TTL: 1 * time.Minute},
}

var protocolsPartners = map[protocol.ID]*common.ProtocolInfo{

@@ -61,6 +63,21 @@ type StreamService struct {
    // IsPeerKnown, when set, is called at stream open for every inbound protocol.
    // Return false to reset the stream immediately. Left nil until wired by the node.
    IsPeerKnown func(pid pp.ID) bool
    // dnt is the Disconnection Network Tolerance cache for outbound streams.
    dnt *dntCache
    // observeCache tracks running heartbeat goroutines on the OBSERVED side.
    observeCache *observeCache
    // hbBatcher accumulates incoming heartbeats (observing side) and flushes
    // them as a single NATS batch after observeBatchWindow.
    hbBatcher *heartbeatBatcher
    // drainUntil / drainMu implement the startup drain window: for 30 s after a
    // close-all, incoming ProtocolObserve requests are rejected so stale heartbeats
    // from a previous run cannot mix with fresh observations.
    drainUntil time.Time
    drainMu    sync.RWMutex
    // observeUsers tracks which users are observing which peers so streams are
    // closed only when the last observer for a peer disconnects.
    observeUsers *userPeerIndex
}

func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {

@@ -72,31 +89,60 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
        Streams:          common.ProtocolStream{},
        maxNodesConn:     maxNode,
        ResourceSearches: common.NewSearchTracker(),
        dnt:              newDNTCache(),
        observeCache:     newObserveCache(),
        observeUsers:     newUserPeerIndex(),
    }
    service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
    for proto := range protocols {
        service.Host.SetStreamHandler(proto, service.gate(service.HandleResponse))
    }
    // ProtocolObserve uses a dedicated handler (bidirectional, long-lived).
    logger.Info().Msg("connect to partners...")
    service.connectToPartners() // we set up a stream
    go service.StartGC(8 * time.Second)
    go service.startDNTLoop()
    return service, nil
}

// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Stream) {
    return func(stream network.Stream) {
        if config.GetConfig().IsNano {
            d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
                And: map[string][]dbs.Filter{
                    "relation": {{Operator: dbs.EQUAL.String(), Value: peer.MASTER}},
                },
            }, "", false, 0, 1)
            if len(d.Data) == 0 {
                return
            }
        }
        s.knowingGate(stream, h)
    }
}

// gate wraps a stream handler with IsPeerKnown validation.
// If the peer is unknown the entire connection is closed and the handler is not called.
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
func (s *StreamService) gate(h func(network.Stream)) func(network.Stream) {
    return func(stream network.Stream) {
        if s.IsPeerKnown != nil && !s.IsPeerKnown(stream.Conn().RemotePeer()) {
            logger := oclib.GetLogger()
            logger.Warn().Str("peer", stream.Conn().RemotePeer().String()).Msg("[stream] unknown peer, closing connection")
            stream.Conn().Close()
            return
        }
        h(stream)
        s.knowingGate(stream, h)
    }
}

func (s *StreamService) knowingGate(stream network.Stream, h func(network.Stream)) {
    if s.IsPeerKnown != nil && !s.IsPeerKnown(stream.Conn().RemotePeer()) {
        logger := oclib.GetLogger()
        logger.Warn().Str("peer", stream.Conn().RemotePeer().String()).Msg("[stream] unknown peer, closing connection")
        stream.Conn().Close()
        return
    }
    h(stream)
}

func (s *StreamService) HandleResponse(stream network.Stream) {
    s.Mu.Lock()
    defer s.Mu.Unlock()

@@ -137,13 +183,27 @@ func (s *StreamService) connectToPartners() error {
            go s.readLoop(s.Streams[proto][ss.Conn().RemotePeer()], ss.Conn().RemotePeer(), proto, info)
        }
        logger.Info().Msg("SetStreamHandler " + string(proto))
        s.Host.SetStreamHandler(proto, s.gate(f))
        s.Host.SetStreamHandler(proto, s.gatePrivilege(f))
    }
    return nil
}

func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
    ps := []*peer.Peer{}
    if conf.GetConfig().NanoIDS != "" {
        for _, peerID := range strings.Split(conf.GetConfig().NanoIDS, ",") {
            ppID := strings.Split(peerID, "/")
            ps = append(ps, &peer.Peer{
                AbstractObject: utils.AbstractObject{
                    UUID: uuid.New().String(),
                    Name: ppID[1],
                },
                PeerID:        ppID[len(ppID)-1],
                StreamAddress: peerID,
                Relation:      peer.NANO,
            })
        }
    }
    if conf.GetConfig().PeerIDS != "" {
        for _, peerID := range strings.Split(conf.GetConfig().PeerIDS, ",") {
            ppID := strings.Split(peerID, "/")

@@ -159,7 +219,7 @@ func (s *StreamService) searchPeer(search string) ([]*peer.Peer, error) {
        }
    }
    access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
    peers := access.Search(nil, search, false)
    peers := access.Search(nil, search, false, 0, 0)
    for _, p := range peers.Data {
        ps = append(ps, p.(*peer.Peer))
    }

@@ -230,7 +290,7 @@ func (ps *StreamService) readLoop(s *common.Stream, id pp.ID, proto protocol.ID,
            }
            continue
        }
        ps.handleEvent(evt.Type, &evt)
        ps.handleEvent(evt.Type, &evt, s.Stream)
        if protocolInfo.WaitResponse && !protocolInfo.PersistantStream {
            break
        }