Workflow lifecycle events + resource instance duration tracking

- Add WorkflowLifecycleEvent + StepMetric to tools/workflow_lifecycle.go
- Add WORKFLOW_STARTED_EVENT, WORKFLOW_STEP_DONE_EVENT, WORKFLOW_DONE_EVENT NATS methods
- Add ResourceInstance.UpdateAverageDuration to maintain the AverageDurationS running average
- Support Steps recap in WORKFLOW_DONE_EVENT for catch-up by oc-scheduler/oc-catalog

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mr
2026-03-20 10:30:30 +01:00
parent 6e28dce02c
commit a62fbc6c7a
10 changed files with 96 additions and 22 deletions

View File

@@ -30,6 +30,7 @@ var meths = []string{"remove execution", "create execution", "planner execution"
"workflow event", "argo kube event", "create resource", "remove resource",
"propalgation event", "search event", "confirm event",
"considers event", "admiralty config event", "minio config event",
"workflow started event", "workflow step done event", "workflow done event",
}
const (
@@ -52,6 +53,13 @@ const (
CONSIDERS_EVENT
ADMIRALTY_CONFIG_EVENT
MINIO_CONFIG_EVENT
// Workflow lifecycle events emitted by oc-monitord.
// oc-scheduler listens to STARTED and DONE to maintain WorkflowExecution state.
// oc-datacenter listens to STEP_DONE and DONE to close bookings and tear down infra.
WORKFLOW_STARTED_EVENT
WORKFLOW_STEP_DONE_EVENT
WORKFLOW_DONE_EVENT
)
func (n NATSMethod) String() string {
@@ -62,7 +70,8 @@ func (n NATSMethod) String() string {
func NameToMethod(name string) NATSMethod {
for _, v := range [...]NATSMethod{REMOVE_EXECUTION, CREATE_EXECUTION, PLANNER_EXECUTION, DISCOVERY, WORKFLOW_EVENT, ARGO_KUBE_EVENT,
CREATE_RESOURCE, REMOVE_RESOURCE, PROPALGATION_EVENT, SEARCH_EVENT, CONFIRM_EVENT,
CONSIDERS_EVENT, ADMIRALTY_CONFIG_EVENT, MINIO_CONFIG_EVENT} {
CONSIDERS_EVENT, ADMIRALTY_CONFIG_EVENT, MINIO_CONFIG_EVENT,
WORKFLOW_STARTED_EVENT, WORKFLOW_STEP_DONE_EVENT, WORKFLOW_DONE_EVENT} {
if strings.Contains(strings.ToLower(v.String()), strings.ToLower(name)) {
return v
}

View File

@@ -0,0 +1,33 @@
package tools
import "time"
// StepMetric carries the outcome of one Argo step node as observed by oc-monitord.
// It is the element type of WorkflowLifecycleEvent.Steps, which is filled for the
// WORKFLOW_DONE_EVENT recap.
//
// The struct is a plain JSON payload: the field tags below define the wire format,
// so renaming a tag is a breaking change for every consumer of the event.
type StepMetric struct {
// BookingID is serialized as "booking_id" and is always emitted, even when empty.
// NOTE(review): presumably the booking attached to this step — confirm with the emitter.
BookingID string `json:"booking_id"`
// State is serialized as "state" and is always emitted (a zero value is sent as 0).
// NOTE(review): presumably the same state enum index as WorkflowLifecycleEvent.State — confirm.
State int `json:"state"`
// RealStart is a pointer so that an unknown start time can be expressed as nil;
// a nil value is left out of the JSON entirely (omitempty).
RealStart *time.Time `json:"real_start,omitempty"`
// RealEnd follows the same convention as RealStart: nil means "not known" and
// is left out of the JSON entirely (omitempty).
RealEnd *time.Time `json:"real_end,omitempty"`
}
// WorkflowLifecycleEvent is the NATS payload emitted by oc-monitord on
// WORKFLOW_STARTED_EVENT, WORKFLOW_STEP_DONE_EVENT, and WORKFLOW_DONE_EVENT.
//
// Fields:
//
//   - ExecutionID: WorkflowExecution UUID (used by oc-scheduler to update state).
//   - ExecutionsID: run-group ID shared by all bookings of the same run.
//   - BookingID: non-empty only for WORKFLOW_STEP_DONE_EVENT.
//   - State: target state (enum index: SUCCESS=3, FAILURE=4, STARTED=2, …).
//   - RealStart: actual start timestamp recorded by Argo (nil if unknown).
//   - RealEnd: actual end timestamp recorded by Argo (nil for STARTED events).
//   - Steps: non-nil only for WORKFLOW_DONE_EVENT — full recap of every step,
//     so oc-scheduler and oc-catalog can catch up if they missed STEP_DONE events.
//
// NOTE(review): the comment on the WORKFLOW_*_EVENT constants names oc-datacenter
// as a STEP_DONE/DONE listener, while this comment names oc-catalog — confirm the
// actual set of consumers.
type WorkflowLifecycleEvent struct {
// ExecutionID is always emitted as "execution_id".
ExecutionID string `json:"execution_id"`
// ExecutionsID is always emitted as "executions_id".
ExecutionsID string `json:"executions_id"`
// BookingID is dropped from the JSON when it is the empty string (omitempty).
BookingID string `json:"booking_id,omitempty"`
// State is always emitted as "state", including when it is 0.
State int `json:"state"`
// RealStart is dropped from the JSON when nil (omitempty).
RealStart *time.Time `json:"real_start,omitempty"`
// RealEnd is dropped from the JSON when nil (omitempty).
RealEnd *time.Time `json:"real_end,omitempty"`
// Steps is dropped from the JSON when the slice is nil or has length zero
// (omitempty), so receivers cannot tell "no recap" from "empty recap".
Steps []StepMetric `json:"steps,omitempty"`
}