// oc-scheduler/infrastructure/check.go

package infrastructure

import (
	"errors"
	"fmt"
	"time"

	oclib "cloud.o-forge.io/core/oc-lib"
	"cloud.o-forge.io/core/oc-lib/models/booking/planner"
	"cloud.o-forge.io/core/oc-lib/models/resources"
	"cloud.o-forge.io/core/oc-lib/models/workflow"
	"cloud.o-forge.io/core/oc-lib/tools"
)

// ---------------------------------------------------------------------------
// Slot availability check
// ---------------------------------------------------------------------------

// Tunables for the slot availability check.
const (
	checkWindowHours = 5  // look-ahead window Check hands to findNextSlot when searching for a free slot (hours)
	checkStepMin     = 15 // spacing between candidate start times tried by findNextSlot (minutes)
	// asapBuffer is the minimum lead time added to time.Now() for as_possible
	// and WHEN_POSSIBLE bookings, i.e. whenever Check resolves the start time
	// itself (asap requested or no start supplied). It absorbs NATS
	// propagation + p2p stream latency so the ExpectedStartDate never arrives
	// already in the past at the destination peer.
	asapBuffer = 2 * time.Minute
)

// CheckResult holds the outcome of a slot availability check. It is the value
// returned by WorkflowSchedule.Check and is serialised to JSON using the tags
// below.
type CheckResult struct {
	// Available reports whether the slot can be scheduled. Check always sets
	// it to true in preemption mode, even when conflicts were detected.
	Available bool `json:"available"`
	// Start is the resolved start of the checked slot (the requested start,
	// or now + asapBuffer when Check resolved it itself).
	Start time.Time `json:"start"`
	// End is the resolved end of the checked slot; nil means the slot was
	// checked as open-ended.
	End *time.Time `json:"end,omitempty"`
	// NextSlot is the nearest free slot found within checkWindowHours when
	// the requested slot is unavailable, or the preferred (conflict-free) slot
	// when running in preemption mode. It stays nil when no conflict was
	// detected or no free slot exists inside the window.
	NextSlot *time.Time `json:"next_slot,omitempty"`
	// Warnings carries human-readable messages produced during the check
	// (conflicting resources, peers whose planner is missing from the cache).
	Warnings []string `json:"warnings,omitempty"`
	// Preemptible is true when the check was run in preemption mode.
	Preemptible bool `json:"preemptible,omitempty"`
	// SchedulingID is the session identifier the client must supply to Schedule
	// in order to confirm the draft bookings created during this Check session.
	// NOTE(review): Check does not populate this field in this file —
	// presumably it is filled in by the caller; confirm.
	SchedulingID string `json:"scheduling_id,omitempty"`
}

// bookingResource is the minimum info needed to verify a resource against the
// planner cache: which resource, on which peer's planner, and (optionally)
// which concrete instance of that resource.
type bookingResource struct {
	id         string // resource MongoDB _id
	peerPID    string // peer public PeerID (PID) — PlannerCache key
	instanceID string // resolved from WorkflowSchedule.SelectedInstances; empty when no instance could be resolved
}

// Check verifies that all booking-relevant resources (storage and compute) of
// the given workflow have capacity for the requested time slot.
//
// Parameters:
//   - wfID: ID of the workflow, loaded through workflow.NewAccessor(request).
//   - asap: when true, ws.Start is ignored and the slot starts at
//     time.Now().UTC() + asapBuffer. The same happens when ws.Start is zero.
//   - preemption: when true the result always has Available=true and
//     Preemptible=true; conflicts are reported through Warnings, and NextSlot
//     holds the nearest conflict-free alternative.
//   - request: API request forwarded to the workflow accessor and to Planify.
//
// The slot end is ws.End when set, otherwise start + ws.DurationS when that is
// positive, otherwise start + the longest duration reported by Planify. When
// none of these yields an end, the slot is checked as open-ended (End == nil);
// a Planify failure is reported as a warning instead of being dropped.
//
// An error is returned only when the workflow cannot be loaded or the loaded
// object is not a *workflow.Workflow.
func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, request *tools.APIRequest) (*CheckResult, error) {
	// 1. Load workflow. The underlying error is wrapped so callers can still
	//    inspect it with errors.Is / errors.As; the message text is unchanged.
	obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
	if err != nil {
		return nil, fmt.Errorf("could not load workflow %s: %w", wfID, err)
	}
	if code != 200 {
		return nil, errors.New("could not load workflow " + wfID)
	}
	// Checked assertion: an unexpected object must not panic the caller.
	wf, ok := obj.(*workflow.Workflow)
	if !ok || wf == nil {
		return nil, fmt.Errorf("could not load workflow %s: unexpected object type %T", wfID, obj)
	}

	// 2. Resolve start.
	start := ws.Start
	if asap || start.IsZero() {
		start = time.Now().UTC().Add(asapBuffer)
	}

	// 3. Resolve end – use explicit end/duration or estimate via Planify.
	var warnings []string
	end := ws.End
	if end == nil {
		if ws.DurationS > 0 {
			e := start.Add(time.Duration(ws.DurationS * float64(time.Second)))
			end = &e
		} else {
			_, longest, _, _, planErr := wf.Planify(
				start, nil,
				ws.SelectedInstances, ws.SelectedPartnerships,
				ws.SelectedBuyings, ws.SelectedStrategies,
				int(ws.BookingMode), request,
			)
			switch {
			case planErr != nil:
				// Best effort: keep going with an open-ended slot, but tell
				// the client why no end could be estimated.
				warnings = append(warnings, fmt.Sprintf(
					"could not estimate workflow duration (%v) – slot checked as open-ended", planErr))
			case longest > 0:
				e := start.Add(time.Duration(longest) * time.Second)
				end = &e
			}
		}
	}

	// 4. Extract booking-relevant (storage + compute) resources from the graph,
	//    resolving the selected instance for each resource.
	checkables := collectBookingResources(wf, ws.SelectedInstances)

	// 5. Check every resource against its peer's planner.
	unavailable, slotWarnings := checkResourceAvailability(checkables, start, end)
	warnings = append(warnings, slotWarnings...)
	conflict := len(unavailable) > 0

	// 6. Build the result. In preemption mode the slot is schedulable
	//    regardless of conflicts; otherwise it is available only when every
	//    resource is free.
	result := &CheckResult{
		Available:   preemption || !conflict,
		Start:       start,
		End:         end,
		Warnings:    warnings,
		Preemptible: preemption,
	}

	// 7. On conflict, locate the nearest free slot within the window (the
	//    conflict-free alternative in preemption mode).
	if conflict {
		result.NextSlot = findNextSlot(checkables, start, end, checkWindowHours)
	}
	return result, nil
}

// collectBookingResources returns the unique storage and compute resources of
// the workflow graph, keyed by resource ID. For each resource the selected
// instance ID is resolved from selectedInstances (the scheduler's
// SelectedInstances ConfigItem) so the planner check targets the exact
// instance chosen by the user.
//
// The map is keyed by resource ID rather than by peer PID: several resources
// may be hosted by the same peer, and keying by PID would keep only one of
// them, leaving the others unchecked. The owning peer is available on each
// entry as peerPID.
//
// Resources whose creator peer cannot be resolved to a public PeerID are
// skipped. A nil workflow or a workflow without a graph yields a nil map.
func collectBookingResources(wf *workflow.Workflow, selectedInstances workflow.ConfigItem) map[string]bookingResource {
	if wf == nil || wf.Graph == nil {
		return nil
	}

	// identified is the subset of the resource API needed here.
	type identified interface {
		GetID() string
		GetCreatorID() string
	}

	result := map[string]bookingResource{}

	// Resolve MongoDB peer _id (DID) → public PeerID (PID) used as PlannerCache key.
	// Lookups are memoised for the duration of this call, failures included,
	// so an unknown peer is queried only once.
	peerAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
	didToPID := map[string]string{}
	resolvePID := func(did string) string {
		if pid, ok := didToPID[did]; ok {
			return pid
		}
		pid := ""
		if data := peerAccess.LoadOne(did); data.Data != nil {
			if p := data.ToPeer(); p != nil {
				pid = p.PeerID
			}
		}
		didToPID[did] = pid
		return pid
	}

	// resolveInstanceID returns the ID of the instance selected for res, or ""
	// when res is neither a storage nor a compute resource or no instance is
	// selected.
	resolveInstanceID := func(res identified) string {
		idx := selectedInstances.Get(res.GetID())
		switch r := res.(type) {
		case *resources.StorageResource:
			if inst := r.GetSelectedInstance(idx); inst != nil {
				return inst.GetID()
			}
		case *resources.ComputeResource:
			if inst := r.GetSelectedInstance(idx); inst != nil {
				return inst.GetID()
			}
		}
		return ""
	}

	// add records res once, provided its creator peer resolves to a PID.
	add := func(res identified) {
		id := res.GetID()
		if _, done := result[id]; done {
			return
		}
		pid := resolvePID(res.GetCreatorID())
		if pid == "" {
			return
		}
		result[id] = bookingResource{
			id:         id,
			peerPID:    pid,
			instanceID: resolveInstanceID(res),
		}
	}

	for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
		if _, res := item.GetResource(); res != nil {
			add(res)
		}
	}
	for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
		if _, res := item.GetResource(); res != nil {
			add(res)
		}
	}

	return result
}

// checkResourceAvailability evaluates every resource in res against the
// planner cached for its peer, for the slot [start, end].
//
// It returns the IDs of the resources that are not available, together with
// human-readable warnings: one per unavailable resource, and one per resource
// whose peer has no planner in PlannerCache (such a resource is assumed to be
// available). PlannerCache is read under plannerMu's read lock; the planner
// itself is consulted after the lock is released.
func checkResourceAvailability(res map[string]bookingResource, start time.Time, end *time.Time) (unavailable []string, warnings []string) {
	for _, candidate := range res {
		plannerMu.RLock()
		cached := PlannerCache[candidate.peerPID]
		plannerMu.RUnlock()

		switch {
		case cached == nil || cached.Planner == nil:
			// Nothing known about this peer: report it and treat as free.
			msg := fmt.Sprintf(
				"peer %s planner not in cache for resource %s – assuming available",
				candidate.peerPID, candidate.id)
			warnings = append(warnings, msg)
		case !checkInstance(cached.Planner, candidate.id, candidate.instanceID, start, end):
			unavailable = append(unavailable, candidate.id)
			msg := fmt.Sprintf(
				"resource %s is not available in [%s – %s]",
				candidate.id, start.Format(time.RFC3339), formatOptTime(end))
			warnings = append(warnings, msg)
		}
	}
	return unavailable, warnings
}

// checkInstance reports whether resourceID can be booked on planner p for the
// slot [start, end].
//
// With a non-empty instanceID only that instance is checked. With an empty
// instanceID (no instance selected / none resolvable) every instance recorded
// in p.Capacities for the resource is tried, and the resource counts as
// available as soon as one of them passes. A resource with no recorded
// capacities is treated as available.
func checkInstance(p *planner.Planner, resourceID string, instanceID string, start time.Time, end *time.Time) bool {
	if instanceID == "" {
		// A missing key yields an empty collection, so one length test covers
		// both "unknown resource" and "no instance recorded".
		known := p.Capacities[resourceID]
		if len(known) == 0 {
			return true // no recorded usage → assume free
		}
		for candidate := range known {
			if p.Check(resourceID, candidate, nil, start, end) {
				return true
			}
		}
		return false
	}
	return p.Check(resourceID, instanceID, nil, start, end)
}

// findNextSlot looks for the earliest start time after 'from' at which every
// resource is free at the same time.
//
// Candidates are from+step, from+2*step, … (step = checkStepMin minutes) and
// must lie strictly before from + windowH hours; 'from' itself is not retried.
// Each candidate slot lasts originalEnd - from when that is positive, and one
// hour otherwise (including when originalEnd is nil). It returns nil when no
// candidate inside the window is free.
func findNextSlot(resources map[string]bookingResource, from time.Time, originalEnd *time.Time, windowH int) *time.Time {
	// Length of the slot to place.
	span := time.Hour
	if originalEnd != nil {
		if requested := originalEnd.Sub(from); requested > 0 {
			span = requested
		}
	}

	const stepLen = checkStepMin * time.Minute
	deadline := from.Add(time.Duration(windowH) * time.Hour)

	candidate := from.Add(stepLen)
	for candidate.Before(deadline) {
		slotEnd := candidate.Add(span)
		blocked, _ := checkResourceAvailability(resources, candidate, &slotEnd)
		if len(blocked) == 0 {
			return &candidate
		}
		candidate = candidate.Add(stepLen)
	}
	return nil
}

// formatOptTime renders an optional timestamp for display: the RFC 3339 form
// of *t, or the literal "open" when t is nil (an open-ended slot).
func formatOptTime(t *time.Time) string {
	if t != nil {
		return t.Format(time.RFC3339)
	}
	return "open"
}

// GetWorkflowPeerIDs loads the workflow and returns the public PeerIDs of the
// creators of all its storage and compute resources. Creator IDs are
// deduplicated before being resolved to PeerIDs.
// These are the peers whose planners must be watched by a check stream.
//
// Creators that cannot be loaded, that do not decode to a peer, or whose
// PeerID is empty are skipped. A workflow without a graph yields (nil, nil).
// An error is returned when the workflow cannot be loaded or the loaded object
// is not a *workflow.Workflow.
func GetWorkflowPeerIDs(wfID string, request *tools.APIRequest) ([]string, error) {
	// The underlying error is wrapped so callers can still inspect it; the
	// message text is unchanged.
	obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
	if err != nil {
		return nil, fmt.Errorf("could not load workflow %s: %w", wfID, err)
	}
	if code != 200 {
		return nil, errors.New("could not load workflow " + wfID)
	}
	// Checked assertion: an unexpected object must not panic the caller.
	wf, ok := obj.(*workflow.Workflow)
	if !ok || wf == nil {
		return nil, fmt.Errorf("could not load workflow %s: unexpected object type %T", wfID, obj)
	}
	if wf.Graph == nil {
		return nil, nil
	}

	// Collect each distinct, non-empty creator ID in graph order.
	seen := map[string]bool{}
	var creatorIDs []string
	note := func(id string) {
		if id == "" || seen[id] {
			return
		}
		seen[id] = true
		creatorIDs = append(creatorIDs, id)
	}
	for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
		if _, res := item.GetResource(); res != nil {
			note(res.GetCreatorID())
		}
	}
	for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
		if _, res := item.GetResource(); res != nil {
			note(res.GetCreatorID())
		}
	}

	// Resolve each creator ID to the peer's public PeerID.
	realPeersID := []string{}
	access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
	for _, id := range creatorIDs {
		data := access.LoadOne(id)
		if data.Data == nil {
			continue
		}
		// ToPeer may return nil; guard instead of dereferencing blindly.
		p := data.ToPeer()
		if p == nil || p.PeerID == "" {
			continue
		}
		realPeersID = append(realPeersID, p.PeerID)
	}
	return realPeersID, nil
}