Files
oc-discovery/daemons/node/stream/dnt_cache.go
2026-04-29 07:41:00 +02:00

363 lines
10 KiB
Go

package stream
// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
// - dntCritical : retry indefinitely (create / update / delete resource).
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
import (
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"crypto/sha256"
"encoding/json"
"io"
"os"
"path/filepath"
"sync"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/tools"
"golang.org/x/crypto/hkdf"
"oc-discovery/conf"
pp "github.com/libp2p/go-libp2p/core/peer"
"github.com/libp2p/go-libp2p/core/protocol"
)
type dntLevel int
const (
dntCritical dntLevel = iota // retry until the message is delivered
dntModerate // retry up to dntMaxModerateRetries times
)
const dntMaxModerateRetries = 3
const dntRetryInterval = 15 * time.Second
// dntProtocols maps each stream protocol to its DNT level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
//
// NOTE(review): the zero value of dntLevel is dntCritical, so a lookup for an
// unmapped protocol reads as critical. Callers that index this map directly
// (enqueue, startDNTLoop, loadFromDisk) rely on only mapped protocols being
// enqueued — confirm at the call sites.
var dntProtocols = map[protocol.ID]dntLevel{
// Critical — data mutations that must eventually be delivered.
ProtocolCreateResource: dntCritical,
ProtocolUpdateResource: dntCritical,
ProtocolDeleteResource: dntCritical,
// Moderate — confirmations / config / planner: 3 retries before abandon.
ProtocolVerifyResource: dntModerate,
ProtocolSendPlanner: dntModerate,
ProtocolConsidersResource: dntModerate,
ProtocolMinioConfigResource: dntModerate,
ProtocolAdmiraltyConfigResource: dntModerate,
}
// dntEntryJSON is the on-disk representation of a dntEntry.
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
// It is encrypted with AES-GCM by persistToDisk and decoded by loadFromDisk;
// field names mirror dntEntry one-for-one.
type dntEntryJSON struct {
DID string `json:"did"` // destination peer identifier
Addr pp.AddrInfo `json:"addr"` // libp2p address info of the peer
DT *tools.DataType `json:"dt,omitempty"` // optional payload data type
User string `json:"user"` // requesting user
Payload []byte `json:"payload"` // raw request body to resend
Proto protocol.ID `json:"proto"` // stream protocol (selects DNT level)
Retries int `json:"retries"` // failed retry count so far
AddedAt time.Time `json:"added_at"` // when the entry was first cached
}
// dntEntry is one cached outbound stream request awaiting retry.
// The fields are exactly the arguments of StreamService.write (see
// startDNTLoop), captured at the time the original send failed.
type dntEntry struct {
did string // destination peer identifier
addr pp.AddrInfo // libp2p address info of the peer
dt *tools.DataType // optional payload data type
user string // requesting user
payload []byte // raw request body to resend
proto protocol.ID // stream protocol — determines DNT level via dntProtocols
retries int // failed retry count (only meaningful for moderate entries)
addedAt time.Time // when the entry was first cached
}
// toJSON converts the in-memory entry into its serialisable on-disk form.
// Pure field copy: no transformation is applied.
func (e *dntEntry) toJSON() dntEntryJSON {
	var j dntEntryJSON
	j.DID = e.did
	j.Addr = e.addr
	j.DT = e.dt
	j.User = e.user
	j.Payload = e.payload
	j.Proto = e.proto
	j.Retries = e.retries
	j.AddedAt = e.addedAt
	return j
}
// entryFromJSON rebuilds an in-memory entry from its on-disk representation.
// Inverse of (*dntEntry).toJSON: a pure field copy.
func entryFromJSON(j dntEntryJSON) *dntEntry {
	e := new(dntEntry)
	e.did = j.DID
	e.addr = j.Addr
	e.dt = j.DT
	e.user = j.User
	e.payload = j.Payload
	e.proto = j.Proto
	e.retries = j.Retries
	e.addedAt = j.AddedAt
	return e
}
// dntCache holds the in-memory queue of pending retry entries.
type dntCache struct {
mu sync.Mutex // guards entries
entries []*dntEntry // pending retries, oldest first (requeue keeps order)
// aesKey is the derived AES-256 key used for on-disk encryption.
// Nil when key derivation failed: persistence is disabled but the in-memory
// cache continues to function normally.
aesKey []byte
}
// newDNTCache initialises the cache, derives the encryption key, and restores
// any critical entries that were persisted before the last crash.
// If key derivation fails the cache still works in memory only.
func newDNTCache() *dntCache {
	c := &dntCache{}
	key, err := deriveDNTKey()
	if err != nil {
		// No key: skip disk restore, run in-memory only.
		oclib.GetLogger().Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
		return c
	}
	c.aesKey = key
	c.loadFromDisk()
	return c
}
// enqueue appends an entry to the retry queue. Critical entries additionally
// trigger an asynchronous flush of the queue to disk.
func (c *dntCache) enqueue(e *dntEntry) {
	critical := dntProtocols[e.proto] == dntCritical
	c.mu.Lock()
	c.entries = append(c.entries, e)
	c.mu.Unlock()
	if critical {
		go c.persistToDisk()
	}
}
// drain atomically removes and returns every pending entry, leaving the
// queue empty for concurrent enqueues.
func (c *dntCache) drain() []*dntEntry {
	c.mu.Lock()
	pending := c.entries
	c.entries = nil
	c.mu.Unlock()
	return pending
}
// requeue re-inserts still-failing entries at the head of the queue, ahead of
// anything enqueued while the retry pass was running. No-op on an empty slice.
func (c *dntCache) requeue(entries []*dntEntry) {
	if len(entries) == 0 {
		return
	}
	c.mu.Lock()
	c.entries = append(entries, c.entries...)
	c.mu.Unlock()
}
// ── Persistence ──────────────────────────────────────────────────────────────

// dntCachePath returns the path of the on-disk cache file, placed next to the
// node's private key so it lives on the same persistent volume.
func dntCachePath() string {
	keyDir := filepath.Dir(conf.GetConfig().PrivateKeyPath)
	return filepath.Join(keyDir, "dnt_cache.bin")
}
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
// using HKDF-SHA256. The derivation is deterministic: the same key is always
// produced from the same private key, so no symmetric secret needs storing.
func deriveDNTKey() ([]byte, error) {
	priv, err := tools.LoadKeyFromFilePrivate()
	if err != nil {
		return nil, err
	}
	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
	// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
	ikm, err := priv.Raw()
	if err != nil {
		return nil, err
	}
	out := make([]byte, 32)
	kdf := hkdf.New(sha256.New, ikm, nil, []byte("oc-discovery/dnt-cache/v1"))
	if _, err := io.ReadFull(kdf, out); err != nil {
		return nil, err
	}
	return out, nil
}
// dntPersistMu serialises writers of the on-disk cache file. persistToDisk is
// launched concurrently (one goroutine per critical enqueue plus one per retry
// tick); without this lock two writers could interleave on the shared
// dnt_cache.bin.tmp file and rename a corrupt mix into place.
var dntPersistMu sync.Mutex

// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
// All failures are logged and non-fatal: the in-memory cache keeps working.
func (c *dntCache) persistToDisk() {
	if c.aesKey == nil {
		return // key derivation failed at init: persistence disabled
	}
	log := oclib.GetLogger()

	// Snapshot the critical entries under the lock, then release it so the
	// encryption and file I/O below never block enqueue/drain.
	c.mu.Lock()
	var toSave []dntEntryJSON
	for _, e := range c.entries {
		if dntProtocols[e.proto] == dntCritical {
			toSave = append(toSave, e.toJSON())
		}
	}
	c.mu.Unlock()

	plaintext, err := json.Marshal(toSave)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to marshal cache entries")
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to init AES cipher")
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to init GCM")
		return
	}
	nonce := make([]byte, gcm.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to generate nonce")
		return
	}
	// On-disk layout: nonce || ciphertext — loadFromDisk splits at NonceSize().
	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)

	// Serialise the write-then-rename: concurrent calls share the same .tmp
	// path. Rename keeps the visible file atomic even on a crash mid-write.
	dntPersistMu.Lock()
	defer dntPersistMu.Unlock()
	path := dntCachePath()
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
		return
	}
	if err := os.Rename(tmp, path); err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
		_ = os.Remove(tmp)
	}
}
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
// Errors (missing file, decryption failure) are non-fatal: the cache simply
// starts empty, which is safe. Every abnormal failure is logged so persistence
// problems are diagnosable instead of silently dropping state.
//
// It is called only from newDNTCache, before the cache is shared with other
// goroutines, which is why c.entries is written without taking c.mu.
func (c *dntCache) loadFromDisk() {
	if c.aesKey == nil {
		return // persistence disabled
	}
	log := oclib.GetLogger()
	path := dntCachePath()
	data, err := os.ReadFile(path)
	if err != nil {
		// A missing file is the normal first-boot case; anything else is
		// worth surfacing.
		if !os.IsNotExist(err) {
			log.Warn().Err(err).Msg("[dnt] failed to read cache file")
		}
		return
	}
	block, err := aes.NewCipher(c.aesKey)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to init AES cipher")
		return
	}
	gcm, err := cipher.NewGCM(block)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] failed to init GCM")
		return
	}
	if len(data) < gcm.NonceSize() {
		log.Warn().Msg("[dnt] cache file too short, ignoring")
		return
	}
	// File layout (written by persistToDisk): nonce || ciphertext.
	nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
	if err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
		return
	}
	var saved []dntEntryJSON
	if err := json.Unmarshal(plaintext, &saved); err != nil {
		log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
		return
	}
	count := 0
	for _, j := range saved {
		// Only restore critical entries — moderate entries are intentionally
		// not persisted, but this guard defends against format changes.
		// NOTE(review): a proto absent from dntProtocols reads as the zero
		// value, which equals dntCritical, and therefore passes this guard.
		if dntProtocols[j.Proto] != dntCritical {
			continue
		}
		c.entries = append(c.entries, entryFromJSON(j))
		count++
	}
	if count > 0 {
		log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
	}
}
// ── Retry loop ────────────────────────────────────────────────────────────────

// startDNTLoop runs the background retry goroutine. Call once after init.
//
// Every dntRetryInterval tick the loop drains the cache, retries one write
// per entry, and requeues what still fails: critical entries unconditionally,
// moderate entries until their dntMaxModerateRetries budget runs out. It then
// re-persists the critical set so the on-disk file tracks delivery progress.
// The loop never returns, so the deferred ticker.Stop() never fires in
// practice; it is kept as a guard should an exit path be added later.
func (s *StreamService) startDNTLoop() {
	logger := oclib.GetLogger()
	ticker := time.NewTicker(dntRetryInterval)
	defer ticker.Stop()
	for range ticker.C {
		entries := s.dnt.drain()
		if len(entries) == 0 {
			continue
		}
		var keep []*dntEntry
		for _, e := range entries {
			// Replay the original outbound write with the cached arguments.
			_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
			if err == nil {
				// Delivered: log at a level-appropriate detail and drop the entry.
				level := dntProtocols[e.proto]
				if level == dntCritical {
					logger.Info().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Msg("[dnt] critical message delivered after retry")
				} else {
					logger.Info().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Int("retries", e.retries).
						Msg("[dnt] moderate message delivered after retry")
				}
				continue
			}
			// Still unreachable: decide whether the entry survives this tick.
			// NOTE(review): a proto absent from dntProtocols reads as the zero
			// value dntCritical here and is kept forever — callers are expected
			// to enqueue mapped protocols only.
			level := dntProtocols[e.proto]
			switch level {
			case dntCritical:
				keep = append(keep, e) // retry indefinitely
			case dntModerate:
				e.retries++
				if e.retries < dntMaxModerateRetries {
					keep = append(keep, e)
				} else {
					logger.Warn().
						Str("proto", string(e.proto)).
						Str("peer", e.did).
						Int("retries", e.retries).
						Msg("[dnt] moderate message abandoned after max retries")
				}
			}
		}
		s.dnt.requeue(keep)
		// Persist after each tick so the on-disk file reflects the current
		// state (entries delivered are removed, new ones from concurrent
		// enqueues are included).
		go s.dnt.persistToDisk()
	}
}