oc-datacenter: allowed-image resources and image pre-pull for efficient processing

This commit is contained in:
mr
2026-03-25 11:11:03 +01:00
parent dab61463f0
commit c87245e83f
16 changed files with 836 additions and 292 deletions

View File

@@ -0,0 +1,323 @@
package kubernetes
import (
"context"
"encoding/base64"
"fmt"
"strings"
"sync"
"time"
"oc-datacenter/conf"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/allowed_image"
"cloud.o-forge.io/core/oc-lib/tools"
appsv1 "k8s.io/api/apps/v1"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)
// KubernetesService wraps per-execution Kubernetes operations; the execution
// ID doubles as the namespace name on the local cluster.
type KubernetesService struct {
	ExecutionsID string
}

// NewKubernetesService returns a service bound to the given execution ID.
func NewKubernetesService(executionsID string) *KubernetesService {
	return &KubernetesService{
		ExecutionsID: executionsID,
	}
}

// prepullRegistry maps executionsID → images pre-pulled for that run.
// Consumed by CleanupImages after WORKFLOW_DONE_EVENT.
var prepullRegistry sync.Map
// RunPrepull creates a k8s Job in the executionsID namespace that pre-pulls
// every image in the list (imagePullPolicy: IfNotPresent). It blocks until
// the Job completes or times out (5 min), and records the images so the
// post-execution cleanup can remove them.
func (s *KubernetesService) RunPrepull(ctx context.Context, images []string) error {
	logger := oclib.GetLogger()
	// Always store for the cleanup, even if the pull fails.
	prepullRegistry.Store(s.ExecutionsID, images)
	if len(images) == 0 {
		return nil
	}
	cs, err := s.newClientset()
	if err != nil {
		return fmt.Errorf("RunPrepull: failed to build clientset: %w", err)
	}
	// One container per image — they all run in parallel inside the same pod.
	containers := make([]corev1.Container, 0, len(images))
	for i, img := range images {
		containers = append(containers, corev1.Container{
			Name:            fmt.Sprintf("prepull-%d", i),
			Image:           img,
			ImagePullPolicy: corev1.PullIfNotPresent,
			// The container only needs to exist long enough for the kubelet
			// to pull its image; "true" exits immediately.
			Command: []string{"true"},
		})
	}
	// BackoffLimit 0: a pull failure fails the Job immediately, no retries.
	var backoff int32 = 0
	jobName := "prepull-" + s.ExecutionsID
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			Name:      jobName,
			Namespace: s.ExecutionsID,
		},
		Spec: batchv1.JobSpec{
			BackoffLimit: &backoff,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers:    containers,
				},
			},
		},
	}
	if _, err := cs.BatchV1().Jobs(s.ExecutionsID).Create(ctx, job, metav1.CreateOptions{}); err != nil {
		return fmt.Errorf("RunPrepull: failed to create job: %w", err)
	}
	timeout := int64(300) // 5 min, consistent with waitForConsiders
	// NOTE(review): the watch is established after the Job is created; if the
	// Job reaches a terminal condition before the watch starts, the event may
	// be missed and this call would run into the timeout — consider an initial
	// Get (or List+Watch) if this proves flaky in practice.
	watcher, err := cs.BatchV1().Jobs(s.ExecutionsID).Watch(ctx, metav1.ListOptions{
		FieldSelector:  "metadata.name=" + jobName,
		TimeoutSeconds: &timeout,
	})
	if err != nil {
		return fmt.Errorf("RunPrepull: failed to watch job: %w", err)
	}
	defer watcher.Stop()
	for event := range watcher.ResultChan() {
		j, ok := event.Object.(*batchv1.Job)
		if !ok {
			continue
		}
		for _, cond := range j.Status.Conditions {
			if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue {
				logger.Info().Msgf("RunPrepull: job %s completed for ns %s", jobName, s.ExecutionsID)
				return nil
			}
			if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
				return fmt.Errorf("RunPrepull: job %s failed for ns %s", jobName, s.ExecutionsID)
			}
		}
	}
	// The watch channel closed without a terminal condition (server-side timeout).
	return fmt.Errorf("RunPrepull: timeout waiting for job %s", jobName)
}
// CleanupImages retrieves the images pre-pulled for this run, keeps only the
// ones absent from AllowedImages, and schedules their removal on every cluster
// node through a privileged DaemonSet (crictl rmi).
// Called from the teardown path after WORKFLOW_DONE_EVENT.
func (s *KubernetesService) CleanupImages(ctx context.Context) {
	logger := oclib.GetLogger()

	entry, found := prepullRegistry.LoadAndDelete(s.ExecutionsID)
	if !found {
		return
	}
	pulled := entry.([]string)
	if len(pulled) == 0 {
		return
	}

	candidates := s.filterNonAllowed(pulled)
	if len(candidates) == 0 {
		logger.Info().Msgf("CleanupImages: all images for %s are in AllowedImages, keeping", s.ExecutionsID)
		return
	}

	logger.Info().Msgf("CleanupImages: scheduling removal of %d image(s) for %s: %v",
		len(candidates), s.ExecutionsID, candidates)
	// Removal runs in the background; it is best-effort and logs its own errors.
	go s.scheduleImageRemoval(ctx, candidates)
}
// filterNonAllowed returns the subset of images that are NOT covered by an
// AllowedImage record: each image is looked up by bare name, then the record's
// registry (empty = any) and tag constraint are checked against the image.
func (s *KubernetesService) filterNonAllowed(images []string) []string {
	var toRemove []string
	for _, img := range images {
		registry, name, tag := s.parseImage(img)
		// Admin-scoped search: AllowedImages is a platform-level collection.
		// The DB filter matches on the bare image name only; registry and tag
		// are checked in the loop below.
		// NOTE(review): res.Err is not checked — a failed search yields empty
		// Data and marks the image for removal (fail-closed). Confirm intended.
		res := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.ALLOWED_IMAGE), nil).Search(
			&dbs.Filters{
				And: map[string][]dbs.Filter{
					"image": {{Operator: dbs.EQUAL.String(), Value: name}},
				},
			}, "", false)
		// No record at all for this name → removal candidate.
		if len(res.Data) == 0 {
			toRemove = append(toRemove, img)
			continue
		}
		allowed := false
		for _, d := range res.Data {
			a, ok := d.(*allowed_image.AllowedImage)
			if !ok {
				continue
			}
			// An empty Registry on the record means "any registry".
			if a.Registry != "" && a.Registry != registry {
				continue
			}
			if s.matchesTagConstraint(a.TagConstraint, tag) {
				allowed = true
				break
			}
		}
		if !allowed {
			toRemove = append(toRemove, img)
		}
	}
	return toRemove
}
// scheduleImageRemoval creates a privileged DaemonSet on every cluster node
// that runs "crictl rmi" for each image to delete, waits 30 s, then removes
// the DaemonSet. Best-effort: every failure is only logged.
func (s *KubernetesService) scheduleImageRemoval(ctx context.Context, images []string) {
	logger := oclib.GetLogger()
	cs, err := s.newClientset()
	if err != nil {
		logger.Error().Msgf("scheduleImageRemoval: failed to build clientset: %v", err)
		return
	}
	// Shell command: crictl rmi image1 image2 ... || true (best-effort).
	// NOTE(review): image names are interpolated into a shell line; they come
	// from the internal prepull registry, but confirm they can never carry
	// shell metacharacters (single quotes in particular, see Command below).
	args := strings.Join(images, " ")
	cmd := fmt.Sprintf("crictl rmi %s || true", args)
	privileged := true
	dsName := "oc-cleanup-" + s.ExecutionsID
	ds := &appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Name:      dsName,
			Namespace: "default",
			Labels:    map[string]string{"app": dsName},
		},
		Spec: appsv1.DaemonSetSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"app": dsName},
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{"app": dsName},
				},
				Spec: corev1.PodSpec{
					// Tolerate every taint so the pod lands on all nodes.
					Tolerations: []corev1.Toleration{
						{Operator: corev1.TolerationOpExists},
					},
					HostPID: true,
					Containers: []corev1.Container{{
						Name:  "cleanup",
						Image: "alpine:3",
						// nsenter enters the host mount namespace (PID 1)
						// to reach the crictl binary installed on the node.
						Command: []string{"sh", "-c",
							"nsenter -t 1 -m -u -i -n -- sh -c '" + cmd + "'"},
						SecurityContext: &corev1.SecurityContext{
							Privileged: &privileged,
						},
					}},
				},
			},
		},
	}
	if _, err := cs.AppsV1().DaemonSets("default").Create(ctx, ds, metav1.CreateOptions{}); err != nil {
		logger.Error().Msgf("scheduleImageRemoval: failed to create DaemonSet: %v", err)
		return
	}
	// Give the DaemonSet time to run on every node.
	// NOTE(review): a fixed 30 s sleep is a heuristic — slow or cordoned nodes
	// may not have finished before deletion; confirm this is acceptable.
	time.Sleep(30 * time.Second)
	if err := cs.AppsV1().DaemonSets("default").Delete(ctx, dsName, metav1.DeleteOptions{}); err != nil {
		logger.Error().Msgf("scheduleImageRemoval: failed to delete DaemonSet: %v", err)
	}
	logger.Info().Msgf("scheduleImageRemoval: completed for %s", s.ExecutionsID)
}
// parseImage splits an image reference "registry/name:tag" into its three
// components. registry is empty when no component looking like a hostname is
// detected; tag defaults to "latest" when absent.
//
// The registry is detached FIRST so that a port in the registry host
// ("localhost:5000/img") is not mistaken for the tag separator: the previous
// implementation split on the first ":" and returned tag "5000/img" for such
// references. The tag is the text after the last ":" that follows the final "/".
func (s *KubernetesService) parseImage(image string) (registry, name, tag string) {
	ref := image
	// The first path component is a registry hostname only when it contains
	// "." or ":" or is exactly "localhost" (same heuristic as the original).
	if slash := strings.Index(ref, "/"); slash != -1 {
		prefix := ref[:slash]
		if strings.ContainsAny(prefix, ".:") || prefix == "localhost" {
			registry = prefix
			ref = ref[slash+1:]
		}
	}
	// A ":" followed by a "/" belongs to a hostname, not a tag — skip it.
	if colon := strings.LastIndex(ref, ":"); colon != -1 && !strings.Contains(ref[colon+1:], "/") {
		return registry, ref[:colon], ref[colon+1:]
	}
	return registry, ref, "latest"
}
// matchesTagConstraint reports whether tag satisfies constraint.
// An empty constraint accepts every tag; a constraint ending in "*" is a
// prefix glob (e.g. "3.*"); anything else requires an exact match.
func (s *KubernetesService) matchesTagConstraint(constraint, tag string) bool {
	switch {
	case constraint == "":
		return true
	case strings.HasSuffix(constraint, "*"):
		prefix := constraint[:len(constraint)-1]
		return strings.HasPrefix(tag, prefix)
	default:
		return tag == constraint
	}
}
// newClientset builds a Kubernetes clientset from the base64-encoded
// credentials held in the configuration (CA, client certificate, client key).
func (s *KubernetesService) newClientset() (*kubernetes.Clientset, error) {
	// All three credentials follow the same decode-or-fail pattern.
	decode := func(label, encoded string) ([]byte, error) {
		raw, err := base64.StdEncoding.DecodeString(encoded)
		if err != nil {
			return nil, fmt.Errorf("newClientset: invalid %s: %w", label, err)
		}
		return raw, nil
	}

	caData, err := decode("KubeCA", conf.GetConfig().KubeCA)
	if err != nil {
		return nil, err
	}
	certData, err := decode("KubeCert", conf.GetConfig().KubeCert)
	if err != nil {
		return nil, err
	}
	keyData, err := decode("KubeData", conf.GetConfig().KubeData)
	if err != nil {
		return nil, err
	}

	cfg := &rest.Config{
		Host: "https://" + conf.GetConfig().KubeHost + ":" + conf.GetConfig().KubePort,
		TLSClientConfig: rest.TLSClientConfig{
			CAData:   caData,
			CertData: certData,
			KeyData:  keyData,
		},
	}
	return kubernetes.NewForConfig(cfg)
}
// CreateNamespace provisions the execution namespace s.ExecutionsID on the
// local cluster through the oc-lib Kubernetes service, with a 30 s timeout.
func (s *KubernetesService) CreateNamespace() error {
	logger := oclib.GetLogger()
	// Builds the oc-lib k8s client from the same base64 credentials used by
	// newClientset, just through a different client wrapper.
	serv, err := tools.NewKubernetesService(
		conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort, conf.GetConfig().KubeCA,
		conf.GetConfig().KubeCert, conf.GetConfig().KubeData)
	if err != nil {
		logger.Error().Msg("CreateNamespace: failed to init k8s service: " + err.Error())
		return err
	}
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	return serv.ProvisionExecutionNamespace(ctx, s.ExecutionsID)
}

View File

@@ -0,0 +1,72 @@
package models
// KubeConfigValue is the root structure of a kubectl configuration YAML file.
type KubeConfigValue struct {
	APIVersion     string                   `yaml:"apiVersion" json:"apiVersion"`
	Kind           string                   `yaml:"kind" json:"kind"`
	Clusters       []KubeconfigNamedCluster `yaml:"clusters" json:"clusters"`
	Users          []KubeconfigUser         `yaml:"users" json:"users"`
	Contexts       []KubeconfigNamedContext `yaml:"contexts" json:"contexts"`
	CurrentContext string                   `yaml:"current-context" json:"current-context"`
	Preferences    struct{}                 `yaml:"preferences" json:"preferences"`
}

// KubeconfigUser is a named user entry of a kubectl configuration YAML file.
type KubeconfigUser struct {
	Name string                `yaml:"name" json:"name"`
	User KubeconfigUserKeyPair `yaml:"user" json:"user"`
}

// KubeconfigUserKeyPair holds the user credentials (bearer token only) of a
// kubectl configuration YAML file.
type KubeconfigUserKeyPair struct {
	Token string `yaml:"token" json:"token"`
}

// KubeconfigAuthProvider describes a kubectl authentication provider entry.
type KubeconfigAuthProvider struct {
	Name   string            `yaml:"name" json:"name"`
	Config map[string]string `yaml:"config" json:"config"`
}

// KubeconfigNamedCluster is a named cluster entry of a kubectl configuration
// YAML file.
type KubeconfigNamedCluster struct {
	Name    string            `yaml:"name" json:"name"`
	Cluster KubeconfigCluster `yaml:"cluster" json:"cluster"`
}

// KubeconfigCluster holds the connection details of a cluster entry in a
// kubectl configuration YAML file.
type KubeconfigCluster struct {
	Server                   string `yaml:"server" json:"server"`
	CertificateAuthorityData string `yaml:"certificate-authority-data" json:"certificate-authority-data"`
	CertificateAuthority     string `yaml:"certificate-authority" json:"certificate-authority"`
}

// KubeconfigNamedContext is a named context entry of a kubectl configuration
// YAML file.
type KubeconfigNamedContext struct {
	Name    string            `yaml:"name" json:"name"`
	Context KubeconfigContext `yaml:"context" json:"context"`
}

// KubeconfigContext binds a cluster, an optional namespace and a user inside
// a kubectl configuration YAML file.
type KubeconfigContext struct {
	Cluster   string `yaml:"cluster" json:"cluster"`
	Namespace string `yaml:"namespace,omitempty" json:"namespace,omitempty"`
	User      string `yaml:"user" json:"user"`
}

// KubeconfigEvent is the NATS payload used to transfer the kubeconfig from
// the source peer to the target peer.
type KubeconfigEvent struct {
	DestPeerID   string `json:"dest_peer_id"`
	ExecutionsID string `json:"executions_id"`
	Kubeconfig   string `json:"kubeconfig"`
	SourcePeerID string `json:"source_peer_id"`
	// OriginID is the peer that initiated the provisioning request.
	// The PB_CONSIDERS response is routed back to this peer.
	OriginID string `json:"origin_id"`
	// SourceExecutionsID is the execution namespace on the source cluster.
	// Used by the target to provision PVCs with the correct claim name.
	SourceExecutionsID string `json:"source_executions_id,omitempty"`
	// Images is the list of container images to pre-pull on the compute peer
	// before the workflow starts.
	Images []string `json:"images,omitempty"`
}

View File

@@ -0,0 +1,359 @@
package kubernetes
import (
"context"
"fmt"
"regexp"
"strings"
"time"
"oc-datacenter/conf"
"oc-datacenter/infrastructure"
"oc-datacenter/infrastructure/admiralty"
"oc-datacenter/infrastructure/storage"
oclib "cloud.o-forge.io/core/oc-lib"
"cloud.o-forge.io/core/oc-lib/dbs"
bookingmodel "cloud.o-forge.io/core/oc-lib/models/booking"
"cloud.o-forge.io/core/oc-lib/models/workflow_execution"
"cloud.o-forge.io/core/oc-lib/tools"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// uuidNsPattern matches Kubernetes namespace names that are execution UUIDs
// (lowercase hex, 8-4-4-4-12).
var uuidNsPattern = regexp.MustCompile(`^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$`)

// Watch is a safety-net watchdog that periodically (every 5 min) scans
// Kubernetes for execution namespaces whose WorkflowExecution has reached a
// terminal state but whose infra was never torn down (e.g. because
// WORKFLOW_DONE_EVENT was missed due to an oc-monitord or oc-datacenter
// crash/restart).
//
// Must be launched in a goroutine from main; it never returns.
func (s *KubernetesService) Watch() {
	logger := oclib.GetLogger()
	logger.Info().Msg("InfraWatchdog: started")
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()
	for range ticker.C {
		// Each scan is independent and best-effort: an error in one pass is
		// logged and retried on the next tick.
		if err := s.scanOrphaned(); err != nil {
			logger.Error().Msg("InfraWatchdog: " + err.Error())
		}
		if err := s.scanOrphanedMinio(); err != nil {
			logger.Error().Msg("InfraWatchdog(minio): " + err.Error())
		}
		if err := s.scanOrphanedAdmiraltyNodes(); err != nil {
			logger.Error().Msg("InfraWatchdog(admiralty-nodes): " + err.Error())
		}
		if err := s.scanOrphanedPVC(); err != nil {
			logger.Error().Msg("InfraWatchdog(pvc): " + err.Error())
		}
	}
}
// scanOrphaned lists all UUID-named Kubernetes namespaces, looks up their
// WorkflowExecution in the DB, and triggers teardown for any that are in a
// terminal state. Namespaces already in Terminating phase are skipped.
func (s *KubernetesService) scanOrphaned() error {
	logger := oclib.GetLogger()
	serv, err := tools.NewKubernetesService(
		conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
		conf.GetConfig().KubeCA,
		conf.GetConfig().KubeCert,
		conf.GetConfig().KubeData,
	)
	if err != nil {
		return fmt.Errorf("failed to init k8s service: %w", err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	nsList, err := serv.Set.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
	if err != nil {
		return fmt.Errorf("failed to list namespaces: %w", err)
	}
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	peerID := myself.GetID()
	for _, ns := range nsList.Items {
		executionsID := ns.Name
		// Only namespaces named like an execution UUID are candidates.
		if !uuidNsPattern.MatchString(executionsID) {
			continue
		}
		// Skip namespaces already being deleted by a previous teardown.
		if ns.Status.Phase == v1.NamespaceTerminating {
			continue
		}
		exec := findTerminalExecution(executionsID, peerID)
		if exec == nil {
			continue
		}
		logger.Info().Msgf("InfraWatchdog: orphaned infra detected for execution %s (state=%v) → teardown",
			executionsID, exec.State)
		// Fire-and-forget: TeardownForExecution logs its own failures.
		go s.TeardownForExecution(exec.GetID())
	}
	return nil
}
// scanOrphanedMinio scans LIVE_STORAGE bookings for executions that are in a
// terminal state and triggers Minio teardown for each unique executionsID found.
// This covers the case where the Kubernetes namespace is already gone (manual
// deletion, prior partial teardown) but Minio SA and bucket were never revoked.
func (s *KubernetesService) scanOrphanedMinio() error {
	logger := oclib.GetLogger()
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	peerID := myself.GetID()
	res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
		Search(&dbs.Filters{
			And: map[string][]dbs.Filter{
				"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
			},
		}, "", false)
	if res.Err != "" {
		return fmt.Errorf("failed to search LIVE_STORAGE bookings: %s", res.Err)
	}
	// Collect unique executionsIDs to avoid redundant teardowns.
	seen := map[string]bool{}
	ctx := context.Background()
	for _, dbo := range res.Data {
		b, ok := dbo.(*bookingmodel.Booking)
		if !ok || seen[b.ExecutionsID] {
			continue
		}
		exec := findTerminalExecution(b.ExecutionsID, peerID)
		if exec == nil {
			continue
		}
		seen[b.ExecutionsID] = true
		minio := storage.NewMinioSetter(b.ExecutionsID, b.ResourceID)
		// Determine this peer's role and call the appropriate teardown.
		if b.DestPeerID == peerID {
			logger.Info().Msgf("InfraWatchdog(minio): orphaned target resources for exec %s → TeardownAsTarget", b.ExecutionsID)
			// NOTE(review): SourcePeerID is set to b.DestPeerID, which in this
			// branch equals the local peerID — confirm the event should not
			// carry the remote (source) peer ID here instead.
			event := storage.MinioDeleteEvent{
				ExecutionsID: b.ExecutionsID,
				MinioID:      b.ResourceID,
				SourcePeerID: b.DestPeerID,
				DestPeerID:   peerID,
			}
			go minio.TeardownAsTarget(ctx, event)
		} else {
			logger.Info().Msgf("InfraWatchdog(minio): orphaned source resources for exec %s → TeardownAsSource", b.ExecutionsID)
			event := storage.MinioDeleteEvent{
				ExecutionsID: b.ExecutionsID,
				MinioID:      b.ResourceID,
				SourcePeerID: peerID,
				DestPeerID:   b.DestPeerID,
			}
			go minio.TeardownAsSource(ctx, event)
		}
	}
	return nil
}
// scanOrphanedAdmiraltyNodes lists all Kubernetes nodes, identifies Admiralty
// virtual nodes (name prefix "admiralty-{UUID}-") that are NotReady, and
// explicitly deletes them when their WorkflowExecution is in a terminal state.
//
// This covers the gap where the namespace is already gone (or Terminating) but
// the virtual node was never cleaned up by the Admiralty controller — which can
// happen when the node goes NotReady before the AdmiraltyTarget CRD is deleted.
func (s *KubernetesService) scanOrphanedAdmiraltyNodes() error {
	logger := oclib.GetLogger()
	serv, err := tools.NewKubernetesService(
		conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort,
		conf.GetConfig().KubeCA,
		conf.GetConfig().KubeCert,
		conf.GetConfig().KubeData,
	)
	if err != nil {
		return fmt.Errorf("failed to init k8s service: %w", err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	nodeList, err := serv.Set.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return fmt.Errorf("failed to list nodes: %w", err)
	}
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	peerID := myself.GetID()
	for _, node := range nodeList.Items {
		// Admiralty virtual nodes are named: admiralty-{executionID}-target-{...}
		rest := strings.TrimPrefix(node.Name, "admiralty-")
		if rest == node.Name {
			continue // not an admiralty node
		}
		// An execution UUID is exactly 36 chars: 8-4-4-4-12.
		if len(rest) < 36 {
			continue
		}
		executionsID := rest[:36]
		if !uuidNsPattern.MatchString(executionsID) {
			continue
		}
		// Only act on NotReady nodes — a Ready virtual node may still be in use.
		ready := false
		for _, cond := range node.Status.Conditions {
			if cond.Type == v1.NodeReady {
				ready = cond.Status == v1.ConditionTrue
				break
			}
		}
		if ready {
			continue
		}
		exec := findTerminalExecution(executionsID, peerID)
		if exec == nil {
			continue
		}
		logger.Info().Msgf("InfraWatchdog(admiralty-nodes): NotReady orphaned node %s for terminal execution %s → deleting",
			node.Name, executionsID)
		// Deletion failures are logged but do not abort the scan of other nodes.
		if delErr := serv.Set.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); delErr != nil {
			logger.Error().Msgf("InfraWatchdog(admiralty-nodes): failed to delete node %s: %v", node.Name, delErr)
		}
	}
	return nil
}
// scanOrphanedPVC scans LIVE_STORAGE bookings for executions that are in a
// terminal state and triggers PVC teardown for each one where this peer holds
// the local storage. This covers the case where the Kubernetes namespace was
// already deleted (or its teardown was partial) but the PersistentVolume
// (cluster-scoped) was never reclaimed.
//
// A LIVE_STORAGE booking is treated as a local PVC only when ResolveStorageName
// returns a non-empty name — the same guard used by teardownPVCForExecution.
func (s *KubernetesService) scanOrphanedPVC() error {
	logger := oclib.GetLogger()
	myself, err := oclib.GetMySelf()
	if err != nil {
		return fmt.Errorf("could not resolve local peer: %w", err)
	}
	peerID := myself.GetID()
	res := oclib.NewRequest(oclib.LibDataEnum(oclib.BOOKING), "", peerID, []string{}, nil).
		Search(&dbs.Filters{
			And: map[string][]dbs.Filter{
				"resource_type": {{Operator: dbs.EQUAL.String(), Value: tools.LIVE_STORAGE.EnumIndex()}},
			},
		}, "", false)
	if res.Err != "" {
		return fmt.Errorf("failed to search LIVE_STORAGE bookings: %s", res.Err)
	}
	// Deduplicate per (executionsID, resourceID) pair — unlike the Minio scan,
	// one execution may hold several distinct storages.
	seen := map[string]bool{}
	ctx := context.Background()
	for _, dbo := range res.Data {
		b, ok := dbo.(*bookingmodel.Booking)
		if !ok || seen[b.ExecutionsID+b.ResourceID] {
			continue
		}
		storageName := storage.ResolveStorageName(b.ResourceID, peerID)
		if storageName == "" {
			continue // not a local PVC booking
		}
		exec := findTerminalExecution(b.ExecutionsID, peerID)
		if exec == nil {
			continue
		}
		seen[b.ExecutionsID+b.ResourceID] = true
		logger.Info().Msgf("InfraWatchdog(pvc): orphaned PVC for exec %s storage %s → TeardownAsSource",
			b.ExecutionsID, b.ResourceID)
		event := storage.PVCDeleteEvent{
			ExecutionsID: b.ExecutionsID,
			StorageID:    b.ResourceID,
			StorageName:  storageName,
			SourcePeerID: peerID,
			DestPeerID:   b.DestPeerID,
		}
		go storage.NewPVCSetter(b.ExecutionsID, b.ResourceID).TeardownAsSource(ctx, event)
	}
	return nil
}
// findTerminalExecution returns the WorkflowExecution for the given
// executionsID if it exists in the DB and its state is in the terminal
// (closing) set, otherwise nil.
func findTerminalExecution(executionsID string, peerID string) *workflow_execution.WorkflowExecution {
	res := oclib.NewRequest(oclib.LibDataEnum(oclib.WORKFLOW_EXECUTION), "", peerID, []string{}, nil).
		Search(&dbs.Filters{
			And: map[string][]dbs.Filter{
				"executions_id": {{Operator: dbs.EQUAL.String(), Value: executionsID}},
			},
		}, "", false)
	if res.Err != "" || len(res.Data) == 0 {
		return nil
	}
	// NOTE(review): only the first matching record is inspected — confirm an
	// executionsID can never map to several WorkflowExecution documents.
	exec, ok := res.Data[0].(*workflow_execution.WorkflowExecution)
	if !ok {
		return nil
	}
	// ClosingStates is the set of states considered terminal by this package.
	if !infrastructure.ClosingStates[exec.State] {
		return nil
	}
	return exec
}
// TeardownForExecution handles infrastructure cleanup when a workflow
// terminates. oc-datacenter is responsible only for infra here —
// booking/execution state is managed by oc-scheduler.
func (s *KubernetesService) TeardownForExecution(executionID string) {
	logger := oclib.GetLogger()
	myself, err := oclib.GetMySelf()
	if err != nil || myself == nil {
		return
	}
	selfPeerID := myself.GetID()
	adminReq := &tools.APIRequest{Admin: true}
	res, _, loadErr := workflow_execution.NewAccessor(adminReq).LoadOne(executionID)
	if loadErr != nil || res == nil {
		logger.Warn().Msgf("teardownInfraForExecution: execution %s not found", executionID)
		return
	}
	exec := res.(*workflow_execution.WorkflowExecution)
	ctx := context.Background()
	// NOTE(review): the execution is loaded by the executionID argument but
	// the teardowns below use s.ExecutionsID — confirm callers always build
	// the service with the matching executions ID.
	admiralty.NewAdmiraltySetter(s.ExecutionsID).TeardownIfRemote(exec, selfPeerID)
	storage.NewMinioSetter(s.ExecutionsID, "").TeardownForExecution(ctx, selfPeerID)
	storage.NewPVCSetter(s.ExecutionsID, "").TeardownForExecution(ctx, selfPeerID)
	s.CleanupImages(ctx)
}