package infrastructure

import (
	"encoding/json"
	"fmt"
	"sync"
	"time"

	oclib "cloud.o-forge.io/core/oc-lib"
	"cloud.o-forge.io/core/oc-lib/dbs"
	bookingmodel "cloud.o-forge.io/core/oc-lib/models/booking"
	"cloud.o-forge.io/core/oc-lib/models/common/enum"
	"cloud.o-forge.io/core/oc-lib/tools"
	"go.mongodb.org/mongo-driver/bson/primitive"
)

// processedBookings tracks booking IDs already handled this process lifetime.
// Entries are never evicted, so the map grows with the number of distinct
// bookings seen until the process restarts.
var processedBookings sync.Map

// ClosingStates is the set of terminal booking states.
var ClosingStates = map[enum.BookingStatus]bool{
	enum.FAILURE:   true,
	enum.SUCCESS:   true,
	enum.FORGOTTEN: true,
	enum.CANCELLED: true,
}

// WatchBookings is a safety-net fallback for when oc-monitord fails to launch.
// It detects bookings that are past expected_start_date by at least 1 minute and
// are still in a non-terminal state. Instead of writing to the database directly,
// it emits WORKFLOW_STEP_DONE_EVENT with State=FAILURE on NATS so that oc-scheduler
// handles the state transition — keeping a single source of truth for booking state.
//
// The function never returns: it scans once per minute until the process exits.
// Must be launched in a goroutine from main.
func WatchBookings() {
	logger := oclib.GetLogger()
	logger.Info().Msg("BookingWatchdog: started")

	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	for range ticker.C {
		// A failed scan is logged and retried on the next tick.
		if err := scanStaleBookings(); err != nil {
			logger.Error().Msg("BookingWatchdog: " + err.Error())
		}
	}
}

// scanStaleBookings queries all bookings whose ExpectedStartDate passed more than
// 1 minute ago. Non-terminal ones get a WORKFLOW_STEP_DONE_EVENT FAILURE emitted
// on NATS so oc-scheduler closes them.
//
// It returns an error when the local peer cannot be resolved or when the search
// itself fails; emission problems for individual bookings are not reported here.
func scanStaleBookings() error {
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	peerID := myself.GetID()

	deadline := time.Now().UTC().Add(-time.Minute)
	// The filter only constrains the start date, so terminal and previously
	// handled bookings are returned on every scan and are weeded out below.
	filters := &dbs.Filters{
		And: map[string][]dbs.Filter{
			"expected_start_date": {{
				Operator: dbs.LTE.String(),
				Value:    primitive.NewDateTimeFromTime(deadline),
			}},
		},
	}

	res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
		Search(filters, "", false)
	if res.Err != "" {
		return fmt.Errorf("stale booking search failed: %s", res.Err)
	}

	for _, dbo := range res.Data {
		b, ok := dbo.(*bookingmodel.Booking)
		if !ok || b == nil {
			continue
		}
		// Cheap pre-check: do not spawn a goroutine for a booking that was
		// already handled by a previous scan.
		if _, done := processedBookings.Load(b.GetID()); done {
			continue
		}
		go emitWatchdogFailure(b)
	}
	return nil
}

// emitWatchdogFailure publishes a WORKFLOW_STEP_DONE_EVENT FAILURE for a stale
// booking. oc-scheduler is the single authority for booking state transitions.
//
// Each booking ID is handled at most once per process lifetime: the ID is
// claimed atomically before any work is done, so concurrent calls for the same
// booking cannot both publish. Bookings already in a terminal state are
// recorded as handled without emitting anything.
//
// NOTE(review): every state outside ClosingStates is treated as stale, which
// would include an in-progress state if the enum has one — confirm this is
// the intended behaviour for bookings that did start.
func emitWatchdogFailure(b *bookingmodel.Booking) {
	logger := oclib.GetLogger()
	id := b.GetID()

	// Atomically claim the booking; a plain Load followed by a later Store
	// would let two goroutines pass the check and emit duplicate events.
	if _, claimed := processedBookings.LoadOrStore(id, struct{}{}); claimed {
		return
	}
	if ClosingStates[b.State] {
		// Already closed: nothing to emit, the claim above records it as handled.
		return
	}

	now := time.Now().UTC()
	payload, err := json.Marshal(tools.WorkflowLifecycleEvent{
		BookingID: id,
		State:     enum.FAILURE.EnumIndex(),
		RealEnd:   &now,
	})
	if err != nil {
		// Release the claim so a later scan can retry, and surface the error
		// instead of dropping it silently.
		processedBookings.Delete(id)
		logger.Error().Msgf("BookingWatchdog: booking %s: could not encode lifecycle event: %s", id, err.Error())
		return
	}

	tools.NewNATSCaller().SetNATSPub(tools.WORKFLOW_STEP_DONE_EVENT, tools.NATSResponse{
		FromApp: "oc-datacenter",
		Method:  int(tools.WORKFLOW_STEP_DONE_EVENT),
		Payload: payload,
	})
	logger.Info().Msgf("BookingWatchdog: booking %s stale → emitting FAILURE", id)
}