344 lines
10 KiB
Go
344 lines
10 KiB
Go
package infrastructure
|
||
|
||
import (
|
||
"errors"
|
||
"fmt"
|
||
"time"
|
||
|
||
oclib "cloud.o-forge.io/core/oc-lib"
|
||
"cloud.o-forge.io/core/oc-lib/models/booking/planner"
|
||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||
"cloud.o-forge.io/core/oc-lib/models/workflow"
|
||
"cloud.o-forge.io/core/oc-lib/tools"
|
||
)
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Slot availability check
|
||
// ---------------------------------------------------------------------------
|
||
|
||
const (
	// checkWindowHours is how far ahead findNextSlot scans for a free slot (hours).
	checkWindowHours = 5
	// checkStepMin is the time increment per scan step (minutes).
	checkStepMin = 15

	// asapBuffer is the minimum lead time added to time.Now() for as_possible
	// and WHEN_POSSIBLE bookings. It absorbs NATS propagation + p2p stream
	// latency so the ExpectedStartDate never arrives already in the past at
	// the destination peer.
	asapBuffer = 2 * time.Minute
)
|
||
|
||
// CheckResult holds the outcome of a slot availability check.
type CheckResult struct {
	// Available reports whether the requested slot can be booked
	// (always true in preemption mode).
	Available bool `json:"available"`
	// Start is the resolved slot start (ws.Start, or now+asapBuffer for ASAP).
	Start time.Time `json:"start"`
	// End is the resolved slot end; nil means an open-ended slot.
	End *time.Time `json:"end,omitempty"`
	// NextSlot is the nearest free slot found within checkWindowHours when
	// the requested slot is unavailable, or the preferred (conflict-free) slot
	// when running in preemption mode.
	NextSlot *time.Time `json:"next_slot,omitempty"`
	// Warnings lists human-readable conflict / missing-planner messages.
	Warnings []string `json:"warnings,omitempty"`
	// Preemptible is true when the check was run in preemption mode.
	Preemptible bool `json:"preemptible,omitempty"`
	// SchedulingID is the session identifier the client must supply to Schedule
	// in order to confirm the draft bookings created during this Check session.
	SchedulingID string `json:"scheduling_id,omitempty"`
}
|
||
|
||
// bookingResource is the minimum info needed to verify a resource against the
// planner cache.
type bookingResource struct {
	id         string // resource MongoDB _id
	peerPID    string // peer public PeerID (PID) — PlannerCache key
	instanceID string // resolved from WorkflowSchedule.SelectedInstances; "" if unresolvable
}
|
||
|
||
// Check verifies that all booking-relevant resources (storage and compute) of
|
||
// the given workflow have capacity for the requested time slot.
|
||
//
|
||
// - asap=true → ignore ws.Start, begin searching from time.Now()
|
||
// - preemption → always return Available=true but populate Warnings with
|
||
// conflicts and NextSlot with the nearest conflict-free alternative
|
||
func (ws *WorkflowSchedule) Check(wfID string, asap bool, preemption bool, request *tools.APIRequest) (*CheckResult, error) {
|
||
// 1. Load workflow
|
||
obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
|
||
if code != 200 || err != nil {
|
||
msg := "could not load workflow " + wfID
|
||
if err != nil {
|
||
msg += ": " + err.Error()
|
||
}
|
||
return nil, errors.New(msg)
|
||
}
|
||
wf := obj.(*workflow.Workflow)
|
||
|
||
// 2. Resolve start
|
||
start := ws.Start
|
||
if asap || start.IsZero() {
|
||
start = time.Now().UTC().Add(asapBuffer)
|
||
}
|
||
|
||
// 3. Resolve end – use explicit end/duration or estimate via Planify
|
||
end := ws.End
|
||
if end == nil {
|
||
if ws.DurationS > 0 {
|
||
e := start.Add(time.Duration(ws.DurationS * float64(time.Second)))
|
||
end = &e
|
||
} else {
|
||
_, longest, _, _, planErr := wf.Planify(
|
||
start, nil,
|
||
ws.SelectedInstances, ws.SelectedPartnerships,
|
||
ws.SelectedBuyings, ws.SelectedStrategies,
|
||
int(ws.BookingMode), request,
|
||
)
|
||
if planErr == nil && longest > 0 {
|
||
e := start.Add(time.Duration(longest) * time.Second)
|
||
end = &e
|
||
}
|
||
}
|
||
}
|
||
|
||
// 4. Extract booking-relevant (storage + compute) resources from the graph,
|
||
// resolving the selected instance for each resource.
|
||
checkables := collectBookingResources(wf, ws.SelectedInstances)
|
||
// 5. Check every resource against its peer's planner
|
||
unavailable, warnings := checkResourceAvailability(checkables, start, end)
|
||
result := &CheckResult{
|
||
Start: start,
|
||
End: end,
|
||
Warnings: warnings,
|
||
}
|
||
|
||
// 6. Preemption mode: mark as schedulable regardless of conflicts, but
|
||
// surface warnings and the nearest conflict-free alternative.
|
||
if preemption {
|
||
result.Available = true
|
||
result.Preemptible = true
|
||
if len(unavailable) > 0 {
|
||
result.NextSlot = findNextSlot(checkables, start, end, checkWindowHours)
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
// 7. All resources are free
|
||
if len(unavailable) == 0 {
|
||
result.Available = true
|
||
return result, nil
|
||
}
|
||
|
||
// 8. Slot unavailable – locate the nearest free slot within the window
|
||
result.Available = false
|
||
result.NextSlot = findNextSlot(checkables, start, end, checkWindowHours)
|
||
return result, nil
|
||
}
|
||
|
||
// collectBookingResources returns unique storage and compute resources from the
|
||
// workflow graph. For each resource the selected instance ID is resolved from
|
||
// selectedInstances (the scheduler's SelectedInstances ConfigItem) so the planner
|
||
// check targets the exact instance chosen by the user.
|
||
func collectBookingResources(wf *workflow.Workflow, selectedInstances workflow.ConfigItem) map[string]bookingResource {
|
||
if wf.Graph == nil {
|
||
return nil
|
||
}
|
||
seen := map[string]bool{}
|
||
result := map[string]bookingResource{}
|
||
|
||
// Resolve MongoDB peer _id (DID) → public PeerID (PID) used as PlannerCache key.
|
||
peerAccess := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||
didToPID := map[string]string{}
|
||
resolvePID := func(did string) string {
|
||
if pid, ok := didToPID[did]; ok {
|
||
return pid
|
||
}
|
||
if data := peerAccess.LoadOne(did); data.Data != nil {
|
||
if p := data.ToPeer(); p != nil {
|
||
didToPID[did] = p.PeerID
|
||
return p.PeerID
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
resolveInstanceID := func(res interface {
|
||
GetID() string
|
||
GetCreatorID() string
|
||
}) string {
|
||
idx := selectedInstances.Get(res.GetID())
|
||
switch r := res.(type) {
|
||
case *resources.StorageResource:
|
||
if inst := r.GetSelectedInstance(idx); inst != nil {
|
||
return inst.GetID()
|
||
}
|
||
case *resources.ComputeResource:
|
||
if inst := r.GetSelectedInstance(idx); inst != nil {
|
||
return inst.GetID()
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
|
||
i := item
|
||
_, res := i.GetResource()
|
||
if res == nil {
|
||
continue
|
||
}
|
||
id := res.GetID()
|
||
if seen[id] {
|
||
continue
|
||
}
|
||
pid := resolvePID(res.GetCreatorID())
|
||
if pid == "" {
|
||
continue
|
||
}
|
||
seen[id] = true
|
||
result[pid] = bookingResource{
|
||
id: id,
|
||
peerPID: pid,
|
||
instanceID: resolveInstanceID(res),
|
||
}
|
||
}
|
||
|
||
for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
|
||
i := item
|
||
_, res := i.GetResource()
|
||
if res == nil {
|
||
continue
|
||
}
|
||
id := res.GetID()
|
||
if seen[id] {
|
||
continue
|
||
}
|
||
pid := resolvePID(res.GetCreatorID())
|
||
if pid == "" {
|
||
continue
|
||
}
|
||
seen[id] = true
|
||
result[pid] = bookingResource{
|
||
id: id,
|
||
peerPID: pid,
|
||
instanceID: resolveInstanceID(res),
|
||
}
|
||
}
|
||
|
||
return result
|
||
}
|
||
|
||
// checkResourceAvailability returns the IDs of unavailable resources and
|
||
// human-readable warning messages.
|
||
func checkResourceAvailability(res map[string]bookingResource, start time.Time, end *time.Time) (unavailable []string, warnings []string) {
|
||
for _, r := range res {
|
||
plannerMu.RLock()
|
||
entry := PlannerCache[r.peerPID]
|
||
plannerMu.RUnlock()
|
||
if entry == nil || entry.Planner == nil {
|
||
warnings = append(warnings, fmt.Sprintf(
|
||
"peer %s planner not in cache for resource %s – assuming available", r.peerPID, r.id))
|
||
continue
|
||
}
|
||
if !checkInstance(entry.Planner, r.id, r.instanceID, start, end) {
|
||
unavailable = append(unavailable, r.id)
|
||
warnings = append(warnings, fmt.Sprintf(
|
||
"resource %s is not available in [%s – %s]",
|
||
r.id, start.Format(time.RFC3339), formatOptTime(end)))
|
||
}
|
||
}
|
||
return
|
||
}
|
||
|
||
// checkInstance checks availability for the specific instance resolved by the
|
||
// scheduler. When instanceID is empty (no instance selected / none resolvable),
|
||
// it falls back to checking all instances known in the planner and returns true
|
||
// if any one has remaining capacity. Returns true when no capacity is recorded.
|
||
func checkInstance(p *planner.Planner, resourceID string, instanceID string, start time.Time, end *time.Time) bool {
|
||
if instanceID != "" {
|
||
return p.Check(resourceID, instanceID, nil, start, end)
|
||
}
|
||
// Fallback: accept if any known instance has free capacity
|
||
caps, ok := p.Capacities[resourceID]
|
||
if !ok || len(caps) == 0 {
|
||
return true // no recorded usage → assume free
|
||
}
|
||
for id := range caps {
|
||
if p.Check(resourceID, id, nil, start, end) {
|
||
return true
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// findNextSlot scans forward from 'from' in checkStepMin increments for up to
|
||
// windowH hours and returns the first candidate start time at which all
|
||
// resources are simultaneously free.
|
||
func findNextSlot(resources map[string]bookingResource, from time.Time, originalEnd *time.Time, windowH int) *time.Time {
|
||
duration := time.Hour
|
||
if originalEnd != nil {
|
||
if d := originalEnd.Sub(from); d > 0 {
|
||
duration = d
|
||
}
|
||
}
|
||
step := time.Duration(checkStepMin) * time.Minute
|
||
limit := from.Add(time.Duration(windowH) * time.Hour)
|
||
for t := from.Add(step); t.Before(limit); t = t.Add(step) {
|
||
e := t.Add(duration)
|
||
if unavail, _ := checkResourceAvailability(resources, t, &e); len(unavail) == 0 {
|
||
return &t
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func formatOptTime(t *time.Time) string {
|
||
if t == nil {
|
||
return "open"
|
||
}
|
||
return t.Format(time.RFC3339)
|
||
}
|
||
|
||
// GetWorkflowPeerIDs loads the workflow and returns the deduplicated list of
|
||
// creator peer IDs for all its storage and compute resources.
|
||
// These are the peers whose planners must be watched by a check stream.
|
||
func GetWorkflowPeerIDs(wfID string, request *tools.APIRequest) ([]string, error) {
|
||
obj, code, err := workflow.NewAccessor(request).LoadOne(wfID)
|
||
if code != 200 || err != nil {
|
||
msg := "could not load workflow " + wfID
|
||
if err != nil {
|
||
msg += ": " + err.Error()
|
||
}
|
||
return nil, errors.New(msg)
|
||
}
|
||
wf := obj.(*workflow.Workflow)
|
||
if wf.Graph == nil {
|
||
return nil, nil
|
||
}
|
||
seen := map[string]bool{}
|
||
var peerIDs []string
|
||
for _, item := range wf.GetGraphItems(wf.Graph.IsStorage) {
|
||
i := item
|
||
_, res := i.GetResource()
|
||
if res == nil {
|
||
continue
|
||
}
|
||
if id := res.GetCreatorID(); id != "" && !seen[id] {
|
||
seen[id] = true
|
||
peerIDs = append(peerIDs, id)
|
||
}
|
||
}
|
||
for _, item := range wf.GetGraphItems(wf.Graph.IsCompute) {
|
||
i := item
|
||
_, res := i.GetResource()
|
||
if res == nil {
|
||
continue
|
||
}
|
||
if id := res.GetCreatorID(); id != "" && !seen[id] {
|
||
seen[id] = true
|
||
peerIDs = append(peerIDs, id)
|
||
}
|
||
}
|
||
realPeersID := []string{}
|
||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(tools.PEER), nil)
|
||
for _, id := range peerIDs {
|
||
if data := access.LoadOne(id); data.Data != nil {
|
||
realPeersID = append(realPeersID, data.ToPeer().PeerID)
|
||
}
|
||
}
|
||
return realPeersID, nil
|
||
}
|