WatchDog Kube

This commit is contained in:
mr
2026-03-24 10:50:36 +01:00
parent a7ffede3e2
commit dab61463f0
14 changed files with 884 additions and 261 deletions

View File

@@ -2,11 +2,13 @@ package infrastructure
import (
"context"
"encoding/json"
"fmt"
"sync"
"time"
"oc-datacenter/infrastructure/minio"
"oc-datacenter/infrastructure/storage"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
@@ -17,15 +19,10 @@ import (
"go.mongodb.org/mongo-driver/bson/primitive"
)
// processedBookings tracks booking IDs whose start-expiry has already been handled.
// Resets on restart; teardown methods are idempotent so duplicate runs are safe.
// processedBookings tracks booking IDs already handled this process lifetime.
var processedBookings sync.Map
// processedEndBookings tracks booking IDs whose end-expiry (Admiralty source cleanup)
// has already been triggered in this process lifetime.
var processedEndBookings sync.Map
// closingStates is the set of terminal booking states after which infra must be torn down.
// closingStates is the set of terminal booking states.
var closingStates = map[enum.BookingStatus]bool{
enum.FAILURE: true,
enum.SUCCESS: true,
@@ -33,9 +30,12 @@ var closingStates = map[enum.BookingStatus]bool{
enum.CANCELLED: true,
}
// WatchBookings starts a passive loop that ticks every minute, scans bookings whose
// ExpectedStartDate + 1 min has passed, transitions them to terminal states when needed,
// and tears down the associated Kubernetes / Minio infrastructure.
// WatchBookings is a safety-net fallback for when oc-monitord fails to launch.
// It detects bookings that are past expected_start_date by at least 1 minute and
// are still in a non-terminal state. Instead of writing to the database directly,
// it emits WORKFLOW_STEP_DONE_EVENT with State=FAILURE on NATS so that oc-scheduler
// handles the state transition — keeping a single source of truth for booking state.
//
// Must be launched in a goroutine from main.
func WatchBookings() {
logger := oclib.GetLogger()
@@ -43,18 +43,16 @@ func WatchBookings() {
ticker := time.NewTicker(time.Minute)
defer ticker.Stop()
for range ticker.C {
if err := scanExpiredBookings(); err != nil {
logger.Error().Msg("BookingWatchdog: " + err.Error())
}
if err := scanEndedExec(); err != nil {
if err := scanStaleBookings(); err != nil {
logger.Error().Msg("BookingWatchdog: " + err.Error())
}
}
}
// scanExpiredBookings queries all bookings whose start deadline has passed and
// dispatches each one to processExpiredBooking.
func scanExpiredBookings() error {
// scanStaleBookings queries all bookings whose ExpectedStartDate passed more than
// 1 minute ago. Non-terminal ones get a WORKFLOW_STEP_DONE_EVENT FAILURE emitted
// on NATS so oc-scheduler closes them.
func scanStaleBookings() error {
myself, err := oclib.GetMySelf()
if err != nil {
return fmt.Errorf("could not resolve local peer: %w", err)
@@ -73,7 +71,7 @@ func scanExpiredBookings() error {
}, "", false)
if res.Err != "" {
return fmt.Errorf("booking search failed: %s", res.Err)
return fmt.Errorf("stale booking search failed: %s", res.Err)
}
for _, dbo := range res.Data {
@@ -81,164 +79,162 @@ func scanExpiredBookings() error {
if !ok {
continue
}
go processExpiredBooking(b, peerID)
go emitWatchdogFailure(b)
}
return nil
}
// processExpiredBooking transitions the booking to a terminal state when applicable,
// then tears down infrastructure based on the resource type:
// - LIVE_DATACENTER / COMPUTE_RESOURCE → Admiralty (as target) + Minio (as target)
// - LIVE_STORAGE / STORAGE_RESOURCE → Minio (as source)
func processExpiredBooking(b *bookingmodel.Booking, peerID string) {
// emitWatchdogFailure publishes a WORKFLOW_STEP_DONE_EVENT FAILURE for a stale
// booking. oc-scheduler is the single authority for booking state transitions.
func emitWatchdogFailure(b *bookingmodel.Booking) {
logger := oclib.GetLogger()
ctx := context.Background()
// Skip bookings already handled during this process lifetime.
if _, done := processedBookings.Load(b.GetID()); done {
return
}
// Transition non-terminal bookings.
if !closingStates[b.State] {
var newState enum.BookingStatus
switch b.State {
case enum.DRAFT, enum.DELAYED:
// DRAFT: never launched; DELAYED: was SCHEDULED but start never arrived.
newState = enum.FORGOTTEN
case enum.SCHEDULED:
// Passed its start date without ever being launched.
newState = enum.FAILURE
case enum.STARTED:
// A running booking is never auto-closed by the watchdog.
return
default:
return
}
upd := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
UpdateOne(map[string]any{"state": newState.EnumIndex()}, b.GetID())
if upd.Err != "" {
logger.Error().Msgf("BookingWatchdog: failed to update booking %s: %s", b.GetID(), upd.Err)
return
}
b.State = newState
logger.Info().Msgf("BookingWatchdog: booking %s (exec=%s, type=%s) → %s",
b.GetID(), b.ExecutionsID, b.ResourceType, b.State)
if closingStates[b.State] {
processedBookings.Store(b.GetID(), struct{}{})
return
}
// Mark as handled before triggering async teardown (avoids double-trigger on next tick).
now := time.Now().UTC()
payload, err := json.Marshal(tools.WorkflowLifecycleEvent{
BookingID: b.GetID(),
State: enum.FAILURE.EnumIndex(),
RealEnd: &now,
})
if err != nil {
return
}
tools.NewNATSCaller().SetNATSPub(tools.WORKFLOW_STEP_DONE_EVENT, tools.NATSResponse{
FromApp: "oc-datacenter",
Method: int(tools.WORKFLOW_STEP_DONE_EVENT),
Payload: payload,
})
logger.Info().Msgf("BookingWatchdog: booking %s stale → emitting FAILURE", b.GetID())
processedBookings.Store(b.GetID(), struct{}{})
// Tear down infrastructure according to resource type.
switch b.ResourceType {
case tools.LIVE_DATACENTER, tools.COMPUTE_RESOURCE:
logger.Info().Msgf("BookingWatchdog: tearing down compute infra exec=%s", b.ExecutionsID)
go NewAdmiraltySetter(b.ExecutionsID).TeardownAsSource(ctx) // i'm the compute units.
go teardownMinioForComputeBooking(ctx, b, peerID)
case tools.LIVE_STORAGE, tools.STORAGE_RESOURCE:
logger.Info().Msgf("BookingWatchdog: tearing down storage infra exec=%s", b.ExecutionsID)
go teardownMinioSourceBooking(ctx, b, peerID)
}
}
// scanEndedExec queries LIVE_DATACENTER / COMPUTE_RESOURCE bookings whose
// ExpectedEndDate + 1 min has passed and triggers TeardownAsTarget for Admiralty,
// cleaning up the compute-side namespace once the execution window is over.
func scanEndedExec() error {
myself, err := oclib.GetMySelf()
if err != nil {
return fmt.Errorf("could not resolve local peer: %w", err)
}
peerID := myself.GetID()
res := oclib.NewRequest(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), "", peerID, []string{}, nil).
// ── Infra teardown helpers (called from nats.go on WORKFLOW_DONE_EVENT) ────────
// teardownAdmiraltyIfRemote triggers Admiralty TeardownAsTarget only when at
// least one compute booking for the execution is on a remote peer.
// Local executions do not involve Admiralty.
func teardownAdmiraltyIfRemote(exec *workflow_execution.WorkflowExecution, selfPeerID string) {
logger := oclib.GetLogger()
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", selfPeerID, []string{}, nil).
Search(&dbs.Filters{
And: map[string][]dbs.Filter{
// Only compute bookings require Admiralty source cleanup.
"state": {{
Operator: dbs.GT.String(),
Value: 2,
}},
"executions_id": {{Operator: dbs.EQUAL.String(), Value: exec.ExecutionsID}},
"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.COMPUTE_RESOURCE.EnumIndex()}},
},
}, "", false)
if res.Err != "" {
return fmt.Errorf("ended-booking search failed: %s", res.Err)
if res.Err != "" || len(res.Data) == 0 {
return
}
for _, dbo := range res.Data {
b, ok := dbo.(*workflow_execution.WorkflowExecution)
b, ok := dbo.(*bookingmodel.Booking)
if !ok {
continue
}
go teardownAdmiraltyTarget(b)
}
return nil
}
// teardownAdmiraltyTarget triggers TeardownAsTarget for the compute-side namespace
// of an execution whose expected end date has passed.
func teardownAdmiraltyTarget(b *workflow_execution.WorkflowExecution) {
logger := oclib.GetLogger()
// Each executionsID is processed at most once per process lifetime.
if _, done := processedEndBookings.Load(b.ExecutionsID); done {
return
}
processedEndBookings.Store(b.ExecutionsID, struct{}{})
logger.Info().Msgf("BookingWatchdog: tearing down Admiralty source exec=%s (booking=%s)",
b.ExecutionsID, b.GetID())
if p, err := oclib.GetMySelf(); err == nil {
NewAdmiraltySetter(b.ExecutionsID).TeardownAsTarget(context.Background(), p.GetID())
if b.DestPeerID != selfPeerID {
logger.Info().Msgf("InfraTeardown: Admiralty teardown exec=%s (remote peer=%s)",
exec.ExecutionsID, b.DestPeerID)
NewAdmiraltySetter(exec.ExecutionsID).TeardownAsTarget(context.Background(), selfPeerID)
return // one teardown per execution is enough
}
}
}
// teardownMinioForComputeBooking finds the LIVE_STORAGE bookings belonging to the same
// execution and triggers Minio-as-target teardown for each (K8s secret + configmap).
// The Minio-as-source side is handled separately by the storage booking's own watchdog pass.
func teardownMinioForComputeBooking(ctx context.Context, computeBooking *bookingmodel.Booking, localPeerID string) {
// teardownMinioForExecution tears down all Minio configuration for the execution:
// - storage bookings where this peer is the compute target → TeardownAsTarget
// - storage bookings where this peer is the Minio source → TeardownAsSource
func teardownMinioForExecution(ctx context.Context, executionsID string, localPeerID string) {
logger := oclib.GetLogger()
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", localPeerID, []string{}, nil).
Search(&dbs.Filters{
And: map[string][]dbs.Filter{
"executions_id": {{Operator: dbs.EQUAL.String(), Value: computeBooking.ExecutionsID}},
"executions_id": {{Operator: dbs.EQUAL.String(), Value: executionsID}},
"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
},
}, "", false)
if res.Err != "" || len(res.Data) == 0 {
logger.Warn().Msgf("BookingWatchdog: no storage booking found for exec=%s", computeBooking.ExecutionsID)
return
}
for _, dbo := range res.Data {
sb, ok := dbo.(*bookingmodel.Booking)
b, ok := dbo.(*bookingmodel.Booking)
if !ok {
continue
}
event := minio.MinioDeleteEvent{
ExecutionsID: computeBooking.ExecutionsID,
MinioID: sb.ResourceID,
SourcePeerID: sb.DestPeerID, // peer hosting Minio
DestPeerID: localPeerID, // this peer (compute/target)
OriginID: "",
if b.DestPeerID == localPeerID {
// This peer is the compute target: tear down K8s secret + configmap.
logger.Info().Msgf("InfraTeardown: Minio target teardown exec=%s storage=%s", executionsID, b.ResourceID)
event := minio.MinioDeleteEvent{
ExecutionsID: executionsID,
MinioID: b.ResourceID,
SourcePeerID: b.DestPeerID,
DestPeerID: localPeerID,
OriginID: "",
}
minio.NewMinioSetter(executionsID, b.ResourceID).TeardownAsTarget(ctx, event)
} else {
// This peer is the Minio source: revoke SA + remove execution bucket.
logger.Info().Msgf("InfraTeardown: Minio source teardown exec=%s storage=%s", executionsID, b.ResourceID)
event := minio.MinioDeleteEvent{
ExecutionsID: executionsID,
MinioID: b.ResourceID,
SourcePeerID: localPeerID,
DestPeerID: b.DestPeerID,
OriginID: "",
}
minio.NewMinioSetter(executionsID, b.ResourceID).TeardownAsSource(ctx, event)
}
minio.NewMinioSetter(computeBooking.ExecutionsID, sb.ResourceID).TeardownAsTarget(ctx, event)
}
}
// teardownMinioSourceBooking triggers Minio-as-source teardown for a storage booking:
// revokes the scoped service account and removes the execution bucket on this Minio host.
func teardownMinioSourceBooking(ctx context.Context, b *bookingmodel.Booking, localPeerID string) {
event := minio.MinioDeleteEvent{
ExecutionsID: b.ExecutionsID,
MinioID: b.ResourceID,
SourcePeerID: localPeerID, // this peer IS the Minio host
DestPeerID: b.DestPeerID,
OriginID: "",
// teardownPVCForExecution deletes all local PVCs provisioned for the execution.
// It searches LIVE_STORAGE bookings and resolves the storage name via the live storage.
func teardownPVCForExecution(ctx context.Context, executionsID string, localPeerID string) {
logger := oclib.GetLogger()
res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", localPeerID, []string{}, nil).
Search(&dbs.Filters{
And: map[string][]dbs.Filter{
"executions_id": {{Operator: dbs.EQUAL.String(), Value: executionsID}},
"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
},
}, "", false)
if res.Err != "" || len(res.Data) == 0 {
return
}
for _, dbo := range res.Data {
b, ok := dbo.(*bookingmodel.Booking)
if !ok {
continue
}
// Resolve storage name from live storage to compute the claim name.
storageName := storage.ResolveStorageName(b.ResourceID, localPeerID)
if storageName == "" {
continue
}
logger.Info().Msgf("InfraTeardown: PVC teardown exec=%s storage=%s", executionsID, b.ResourceID)
event := storage.PVCDeleteEvent{
ExecutionsID: executionsID,
StorageID: b.ResourceID,
StorageName: storageName,
SourcePeerID: localPeerID,
DestPeerID: b.DestPeerID,
OriginID: "",
}
storage.NewPVCSetter(executionsID, b.ResourceID).TeardownAsSource(ctx, event)
}
minio.NewMinioSetter(b.ExecutionsID, b.ResourceID).TeardownAsSource(ctx, event)
}