oc-datacenter: enforce allowed resources and pre-pull images for efficient processing
This commit is contained in:
323
infrastructure/kubernetes/kubernetes.go
Normal file
323
infrastructure/kubernetes/kubernetes.go
Normal file
@@ -0,0 +1,323 @@
|
||||
package kubernetes
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"oc-datacenter/conf"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/dbs"
|
||||
"cloud.o-forge.io/core/oc-lib/models/allowed_image"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
batchv1 "k8s.io/api/batch/v1"
|
||||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
"k8s.io/client-go/rest"
|
||||
)
|
||||
|
||||
// KubernetesService groups the per-execution Kubernetes operations
// (image pre-pull, post-run image cleanup, namespace provisioning) for
// a single workflow run.
type KubernetesService struct {
	// ExecutionsID identifies the run and doubles as the Kubernetes
	// namespace in which pre-pull Jobs are created.
	ExecutionsID string
}
|
||||
|
||||
func NewKubernetesService(executionsID string) *KubernetesService {
|
||||
return &KubernetesService{
|
||||
ExecutionsID: executionsID,
|
||||
}
|
||||
}
|
||||
|
||||
// prepullRegistry maps executionsID → images pre-pulled for that run.
// Consumed by CleanupImages after WORKFLOW_DONE_EVENT.
var prepullRegistry sync.Map
|
||||
|
||||
// RunPrepull crée un Job k8s dans le namespace executionsID qui pre-pull chaque
|
||||
// image de la liste (imagePullPolicy: IfNotPresent). Bloque jusqu'à la complétion
|
||||
// du Job ou timeout (5 min). Enregistre les images pour le cleanup post-exec.
|
||||
func (s *KubernetesService) RunPrepull(ctx context.Context, images []string) error {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
// Toujours stocker pour le cleanup, même si le pull échoue.
|
||||
prepullRegistry.Store(s.ExecutionsID, images)
|
||||
|
||||
if len(images) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
cs, err := s.newClientset()
|
||||
if err != nil {
|
||||
return fmt.Errorf("RunPrepull: failed to build clientset: %w", err)
|
||||
}
|
||||
|
||||
// Un container par image — ils tournent tous en parallèle dans le même pod.
|
||||
containers := make([]corev1.Container, 0, len(images))
|
||||
for i, img := range images {
|
||||
containers = append(containers, corev1.Container{
|
||||
Name: fmt.Sprintf("prepull-%d", i),
|
||||
Image: img,
|
||||
ImagePullPolicy: corev1.PullIfNotPresent,
|
||||
Command: []string{"true"},
|
||||
})
|
||||
}
|
||||
|
||||
var backoff int32 = 0
|
||||
jobName := "prepull-" + s.ExecutionsID
|
||||
job := &batchv1.Job{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: jobName,
|
||||
Namespace: s.ExecutionsID,
|
||||
},
|
||||
Spec: batchv1.JobSpec{
|
||||
BackoffLimit: &backoff,
|
||||
Template: corev1.PodTemplateSpec{
|
||||
Spec: corev1.PodSpec{
|
||||
RestartPolicy: corev1.RestartPolicyNever,
|
||||
Containers: containers,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if _, err := cs.BatchV1().Jobs(s.ExecutionsID).Create(ctx, job, metav1.CreateOptions{}); err != nil {
|
||||
return fmt.Errorf("RunPrepull: failed to create job: %w", err)
|
||||
}
|
||||
|
||||
timeout := int64(300) // 5 min, cohérent avec waitForConsiders
|
||||
watcher, err := cs.BatchV1().Jobs(s.ExecutionsID).Watch(ctx, metav1.ListOptions{
|
||||
FieldSelector: "metadata.name=" + jobName,
|
||||
TimeoutSeconds: &timeout,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("RunPrepull: failed to watch job: %w", err)
|
||||
}
|
||||
defer watcher.Stop()
|
||||
|
||||
for event := range watcher.ResultChan() {
|
||||
j, ok := event.Object.(*batchv1.Job)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
for _, cond := range j.Status.Conditions {
|
||||
if cond.Type == batchv1.JobComplete && cond.Status == corev1.ConditionTrue {
|
||||
logger.Info().Msgf("RunPrepull: job %s completed for ns %s", jobName, s.ExecutionsID)
|
||||
return nil
|
||||
}
|
||||
if cond.Type == batchv1.JobFailed && cond.Status == corev1.ConditionTrue {
|
||||
return fmt.Errorf("RunPrepull: job %s failed for ns %s", jobName, s.ExecutionsID)
|
||||
}
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("RunPrepull: timeout waiting for job %s", jobName)
|
||||
}
|
||||
|
||||
// CleanupImages récupère les images pre-pullées pour ce run, filtre celles
|
||||
// absentes de AllowedImages, et planifie leur suppression via un DaemonSet
|
||||
// privilégié (crictl rmi) sur tous les nœuds du cluster.
|
||||
// Appelé depuis teardownInfraForExecution au WORKFLOW_DONE_EVENT.
|
||||
func (s *KubernetesService) CleanupImages(ctx context.Context) {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
raw, ok := prepullRegistry.LoadAndDelete(s.ExecutionsID)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
images := raw.([]string)
|
||||
if len(images) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
toRemove := s.filterNonAllowed(images)
|
||||
if len(toRemove) == 0 {
|
||||
logger.Info().Msgf("CleanupImages: all images for %s are in AllowedImages, keeping", s.ExecutionsID)
|
||||
return
|
||||
}
|
||||
|
||||
logger.Info().Msgf("CleanupImages: scheduling removal of %d image(s) for %s: %v",
|
||||
len(toRemove), s.ExecutionsID, toRemove)
|
||||
go s.scheduleImageRemoval(ctx, toRemove)
|
||||
}
|
||||
|
||||
// filterNonAllowed retourne les images non présentes dans AllowedImages.
|
||||
func (s *KubernetesService) filterNonAllowed(images []string) []string {
|
||||
var toRemove []string
|
||||
for _, img := range images {
|
||||
registry, name, tag := s.parseImage(img)
|
||||
res := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.ALLOWED_IMAGE), nil).Search(
|
||||
&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"image": {{Operator: dbs.EQUAL.String(), Value: name}},
|
||||
},
|
||||
}, "", false)
|
||||
|
||||
if len(res.Data) == 0 {
|
||||
toRemove = append(toRemove, img)
|
||||
continue
|
||||
}
|
||||
|
||||
allowed := false
|
||||
for _, d := range res.Data {
|
||||
a, ok := d.(*allowed_image.AllowedImage)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if a.Registry != "" && a.Registry != registry {
|
||||
continue
|
||||
}
|
||||
if s.matchesTagConstraint(a.TagConstraint, tag) {
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
toRemove = append(toRemove, img)
|
||||
}
|
||||
}
|
||||
return toRemove
|
||||
}
|
||||
|
||||
// scheduleImageRemoval crée un DaemonSet privilégié sur tous les nœuds du cluster
|
||||
// qui exécute "crictl rmi" pour chaque image à supprimer, puis supprime le DaemonSet.
|
||||
func (s *KubernetesService) scheduleImageRemoval(ctx context.Context, images []string) {
|
||||
logger := oclib.GetLogger()
|
||||
|
||||
cs, err := s.newClientset()
|
||||
if err != nil {
|
||||
logger.Error().Msgf("scheduleImageRemoval: failed to build clientset: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Commande shell : crictl rmi image1 image2 ... || true (best-effort)
|
||||
args := strings.Join(images, " ")
|
||||
cmd := fmt.Sprintf("crictl rmi %s || true", args)
|
||||
|
||||
privileged := true
|
||||
dsName := "oc-cleanup-" + s.ExecutionsID
|
||||
|
||||
ds := &appsv1.DaemonSet{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: dsName,
|
||||
Namespace: "default",
|
||||
Labels: map[string]string{"app": dsName},
|
||||
},
|
||||
Spec: appsv1.DaemonSetSpec{
|
||||
Selector: &metav1.LabelSelector{
|
||||
MatchLabels: map[string]string{"app": dsName},
|
||||
},
|
||||
Template: corev1.PodTemplateSpec{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Labels: map[string]string{"app": dsName},
|
||||
},
|
||||
Spec: corev1.PodSpec{
|
||||
// Tolère tous les taints pour atteindre tous les nœuds.
|
||||
Tolerations: []corev1.Toleration{
|
||||
{Operator: corev1.TolerationOpExists},
|
||||
},
|
||||
HostPID: true,
|
||||
Containers: []corev1.Container{{
|
||||
Name: "cleanup",
|
||||
Image: "alpine:3",
|
||||
// nsenter entre dans le namespace mount du host (PID 1)
|
||||
// pour accéder au crictl installé sur le nœud.
|
||||
Command: []string{"sh", "-c",
|
||||
"nsenter -t 1 -m -u -i -n -- sh -c '" + cmd + "'"},
|
||||
SecurityContext: &corev1.SecurityContext{
|
||||
Privileged: &privileged,
|
||||
},
|
||||
}},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if _, err := cs.AppsV1().DaemonSets("default").Create(ctx, ds, metav1.CreateOptions{}); err != nil {
|
||||
logger.Error().Msgf("scheduleImageRemoval: failed to create DaemonSet: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// Laisse le temps au DaemonSet de tourner sur tous les nœuds.
|
||||
time.Sleep(30 * time.Second)
|
||||
|
||||
if err := cs.AppsV1().DaemonSets("default").Delete(ctx, dsName, metav1.DeleteOptions{}); err != nil {
|
||||
logger.Error().Msgf("scheduleImageRemoval: failed to delete DaemonSet: %v", err)
|
||||
}
|
||||
logger.Info().Msgf("scheduleImageRemoval: completed for %s", s.ExecutionsID)
|
||||
}
|
||||
|
||||
// parseImage décompose "registry/name:tag" en ses trois composants.
|
||||
// registry vide si aucun composant ressemblant à un hostname n'est détecté.
|
||||
func (s *KubernetesService) parseImage(image string) (registry, name, tag string) {
|
||||
parts := strings.SplitN(image, ":", 2)
|
||||
nameWithRegistry := parts[0]
|
||||
if len(parts) == 2 {
|
||||
tag = parts[1]
|
||||
} else {
|
||||
tag = "latest"
|
||||
}
|
||||
|
||||
slashIdx := strings.Index(nameWithRegistry, "/")
|
||||
if slashIdx == -1 {
|
||||
return "", nameWithRegistry, tag
|
||||
}
|
||||
prefix := nameWithRegistry[:slashIdx]
|
||||
// Présence d'un "." ou ":" ou "localhost" → c'est un hostname de registry.
|
||||
if strings.ContainsAny(prefix, ".:") || prefix == "localhost" {
|
||||
return prefix, nameWithRegistry[slashIdx+1:], tag
|
||||
}
|
||||
return "", nameWithRegistry, tag
|
||||
}
|
||||
|
||||
// matchesTagConstraint vérifie si tag satisfait la contrainte.
|
||||
// Vide = toutes versions. Supporte exact et glob suffixe ("3.*").
|
||||
func (s *KubernetesService) matchesTagConstraint(constraint, tag string) bool {
|
||||
if constraint == "" {
|
||||
return true
|
||||
}
|
||||
if strings.HasSuffix(constraint, "*") {
|
||||
return strings.HasPrefix(tag, strings.TrimSuffix(constraint, "*"))
|
||||
}
|
||||
return constraint == tag
|
||||
}
|
||||
|
||||
// newClientset construit un client k8s depuis les credentials base64 en conf.
|
||||
func (s *KubernetesService) newClientset() (*kubernetes.Clientset, error) {
|
||||
caData, err := base64.StdEncoding.DecodeString(conf.GetConfig().KubeCA)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("newClientset: invalid KubeCA: %w", err)
|
||||
}
|
||||
certData, err := base64.StdEncoding.DecodeString(conf.GetConfig().KubeCert)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("newClientset: invalid KubeCert: %w", err)
|
||||
}
|
||||
keyData, err := base64.StdEncoding.DecodeString(conf.GetConfig().KubeData)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("newClientset: invalid KubeData: %w", err)
|
||||
}
|
||||
cfg := &rest.Config{
|
||||
Host: "https://" + conf.GetConfig().KubeHost + ":" + conf.GetConfig().KubePort,
|
||||
TLSClientConfig: rest.TLSClientConfig{
|
||||
CAData: caData,
|
||||
CertData: certData,
|
||||
KeyData: keyData,
|
||||
},
|
||||
}
|
||||
return kubernetes.NewForConfig(cfg)
|
||||
}
|
||||
|
||||
func (s *KubernetesService) CreateNamespace() error {
|
||||
logger := oclib.GetLogger()
|
||||
serv, err := tools.NewKubernetesService(
|
||||
conf.GetConfig().KubeHost+":"+conf.GetConfig().KubePort, conf.GetConfig().KubeCA,
|
||||
conf.GetConfig().KubeCert, conf.GetConfig().KubeData)
|
||||
if err != nil {
|
||||
logger.Error().Msg("CreateNamespace: failed to init k8s service: " + err.Error())
|
||||
return err
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
return serv.ProvisionExecutionNamespace(ctx, s.ExecutionsID)
|
||||
}
|
||||
Reference in New Issue
Block a user