package stream

// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
//   - dntCritical : retry indefinitely (create / update / delete resource).
//   - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"crypto/sha256"
	"encoding/json"
	"io"
	"os"
	"path/filepath"
	"sync"
	"time"

	oclib "cloud.o-forge.io/core/oc-lib"
	"cloud.o-forge.io/core/oc-lib/tools"
	"golang.org/x/crypto/hkdf"

	"oc-discovery/conf"

	pp "github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/core/protocol"
)

type dntLevel int

const (
	dntCritical dntLevel = iota // retry until the message is delivered
	dntModerate                 // retry up to dntMaxModerateRetries times
)

const dntMaxModerateRetries = 3
const dntRetryInterval = 15 * time.Second

// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var dntProtocols = map[protocol.ID]dntLevel{
	// Critical — data mutations that must eventually be delivered.
	ProtocolCreateResource: dntCritical,
	ProtocolUpdateResource: dntCritical,
	ProtocolDeleteResource: dntCritical,

	// Moderate — confirmations / config / planner: 3 retries before abandon.
	ProtocolVerifyResource:          dntModerate,
	ProtocolSendPlanner:             dntModerate,
	ProtocolConsidersResource:       dntModerate,
	ProtocolMinioConfigResource:     dntModerate,
	ProtocolAdmiraltyConfigResource: dntModerate,
}

// dntEntryJSON is the on-disk representation of a dntEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
type dntEntryJSON struct {
	DID     string          `json:"did"`
	Addr    pp.AddrInfo     `json:"addr"`
	DT      *tools.DataType `json:"dt,omitempty"`
	User    string          `json:"user"`
	Payload []byte          `json:"payload"`
	Proto   protocol.ID     `json:"proto"`
	Retries int             `json:"retries"`
	AddedAt time.Time       `json:"added_at"`
}

type dntEntry struct {
	did     string
	addr    pp.AddrInfo
	dt      *tools.DataType
	user    string
	payload []byte
	proto   protocol.ID
	retries int
	addedAt time.Time
}

func (e *dntEntry) toJSON() dntEntryJSON {
	return dntEntryJSON{
		DID:     e.did,
		Addr:    e.addr,
		DT:      e.dt,
		User:    e.user,
		Payload: e.payload,
		Proto:   e.proto,
		Retries: e.retries,
		AddedAt: e.addedAt,
	}
}

func entryFromJSON(j dntEntryJSON) *dntEntry {
	return &dntEntry{
		did:     j.DID,
		addr:    j.Addr,
		dt:      j.DT,
		user:    j.User,
		payload: j.Payload,
		proto:   j.Proto,
		retries: j.Retries,
		addedAt: j.AddedAt,
	}
}

type dntCache struct {
	mu      sync.Mutex
	entries []*dntEntry

	// aesKey is the derived AES-256 key used for on-disk encryption.
	// Nil when key derivation failed: persistence is disabled but the in-memory
	// cache continues to function normally.
	aesKey []byte
}
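// Illustrative only: how the cache is expected to be fed from the stream write
// path. That call site is not part of this file, so the variable names and
// error handling below are assumptions; only s.write's signature, dntProtocols,
// and the dntEntry fields come from the code in this file.
//
//	if _, err := s.write(did, &addr, dt, user, payload, proto); err != nil {
//		if _, tracked := dntProtocols[proto]; tracked {
//			s.dnt.enqueue(&dntEntry{
//				did: did, addr: addr, dt: dt, user: user,
//				payload: payload, proto: proto, addedAt: time.Now(),
//			})
//		}
//	}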
// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
func newDNTCache() *dntCache {
	log := oclib.GetLogger()
	c := &dntCache{}
	key, err := deriveDNTKey()
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
	} else {
		c.aesKey = key
		c.loadFromDisk()
	}
	return c
}

// enqueue adds an entry to the cache and persists critical entries to disk.
func (c *dntCache) enqueue(e *dntEntry) {
	c.mu.Lock()
	c.entries = append(c.entries, e)
	c.mu.Unlock()
	if dntProtocols[e.proto] == dntCritical {
		go c.persistToDisk()
	}
}

// drain atomically removes and returns all current entries.
func (c *dntCache) drain() []*dntEntry {
	c.mu.Lock()
	defer c.mu.Unlock()
	out := c.entries
	c.entries = nil
	return out
}

// requeue puts entries back at the head of the list, preserving any new
// entries added while the retry loop was running.
func (c *dntCache) requeue(entries []*dntEntry) {
	if len(entries) == 0 {
		return
	}
	c.mu.Lock()
	defer c.mu.Unlock()
	c.entries = append(entries, c.entries...)
}

// ── Persistence ──────────────────────────────────────────────────────────────

// dntCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func dntCachePath() string {
	return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
}

// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
	priv, err := tools.LoadKeyFromFilePrivate()
	if err != nil {
		return nil, err
	}
	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
	// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
	raw, err := priv.Raw()
	if err != nil {
		return nil, err
	}
	reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
	key := make([]byte, 32)
	if _, err := io.ReadFull(reader, key); err != nil {
		return nil, err
	}
	return key, nil
}

// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
func (c *dntCache) persistToDisk() {
	if c.aesKey == nil {
		return
	}
	log := oclib.GetLogger()

	c.mu.Lock()
	var toSave []dntEntryJSON
	for _, e := range c.entries {
		if dntProtocols[e.proto] == dntCritical {
			toSave = append(toSave, e.toJSON())
		}
	}
	c.mu.Unlock()

	plaintext, err := json.Marshal(toSave)
	if err != nil {
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return
	}
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		return
	}
	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)

	path := dntCachePath()
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	if err := os.Rename(tmp, path); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
		_ = os.Remove(tmp)
	}
}
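// On-disk layout of dnt_cache.bin, as implied by gcm.Seal(nonce, nonce, ...)
// above and the nonce/ciphertext split in loadFromDisk below (a derived
// description, not a separately documented format):
//
//	[ 12-byte GCM nonce ][ AES-256-GCM ciphertext of the JSON-encoded []dntEntryJSON ]
//
// The file is written to a ".tmp" sibling first and renamed into place, so a
// crash mid-write cannot leave a truncated cache behind.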
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe.
func (c *dntCache) loadFromDisk() {
	if c.aesKey == nil {
		return
	}
	log := oclib.GetLogger()
	path := dntCachePath()
	data, err := os.ReadFile(path)
	if err != nil {
		if !os.IsNotExist(err) {
			log.Warn().Err(err).Msg("[dnt] failed to read cache file")
		}
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		return
	}
	if len(data) < gcm.NonceSize() {
		log.Warn().Msg("[dnt] cache file too short, ignoring")
		return
	}
	nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
		return
	}
	var saved []dntEntryJSON
	if err := json.Unmarshal(plaintext, &saved); err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
		return
	}
	count := 0
	for _, j := range saved {
		// Only restore critical entries — moderate entries are intentionally
		// not persisted, but this guard defends against format changes.
		if dntProtocols[j.Proto] != dntCritical {
			continue
		}
		c.entries = append(c.entries, entryFromJSON(j))
		count++
	}
	if count > 0 {
		log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
	}
}

// ── Retry loop ────────────────────────────────────────────────────────────────

// startDNTLoop runs the background retry goroutine. Call once after init.
func (s *StreamService) startDNTLoop() {
	logger := oclib.GetLogger()
	ticker := time.NewTicker(dntRetryInterval)
	defer ticker.Stop()

	for range ticker.C {
		entries := s.dnt.drain()
		if len(entries) == 0 {
			continue
		}
		var keep []*dntEntry
		for _, e := range entries {
			_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
			if err == nil {
				level := dntProtocols[e.proto]
				if level == dntCritical {
					logger.Info().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Msg("[dnt] critical message delivered after retry")
				} else {
					logger.Info().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Int("retries", e.retries).
						Msg("[dnt] moderate message delivered after retry")
				}
				continue
			}
			level := dntProtocols[e.proto]
			switch level {
			case dntCritical:
				keep = append(keep, e)
			case dntModerate:
				e.retries++
				if e.retries < dntMaxModerateRetries {
					keep = append(keep, e)
				} else {
					logger.Warn().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Int("retries", e.retries).
						Msg("[dnt] moderate message abandoned after max retries")
				}
			}
		}
		s.dnt.requeue(keep)

		// Persist after each tick so the on-disk file reflects the current
		// state (entries delivered are removed, new ones from concurrent
		// enqueues are included).
		go s.dnt.persistToDisk()
	}
}
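// A minimal sketch of how the cache and retry loop are expected to be wired up
// at service start. The initialisation site is not part of this file, so the
// Start method shown here is an assumption made for illustration; startDNTLoop
// blocks on its ticker, so it must run in its own goroutine.
//
//	func (s *StreamService) Start() {
//		s.dnt = newDNTCache()
//		go s.startDNTLoop()
//	}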