package main

import (
	"context"
	"fmt"
	"log/slog"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/google/uuid"
)

// ociWorkloadLaunchInput holds the validated parameters of an OCI workload
// launch task. It is produced by parseOCIWorkloadLaunchInput and consumed by
// runOCIWorkloadLaunch.
type ociWorkloadLaunchInput struct {
	// WorkloadInstanceID and AllocationID are UUIDs in canonical string form;
	// both are attached to the container as gpuaas.* labels.
	WorkloadInstanceID string
	AllocationID       string
	// Username is the node user whose identity is looked up and passed to the
	// runtime via --user.
	Username           string
	// ImageDigestRef is the image reference; it must end in @sha256:<64 hex>.
	ImageDigestRef     string
	// ContainerName always equals generatedOCIContainerName(WorkloadInstanceID).
	ContainerName      string
	// Env is passed to the container as --env KEY=VALUE, in sorted key order.
	Env                map[string]string
	// Mounts are bind mounts requested by the task.
	Mounts             []ociWorkloadMount
	// Endpoints are the container ports to expose and optionally publish.
	Endpoints          []ociWorkloadEndpoint
	// PullCredentialRef is optional; when non-empty a registry login is
	// performed before the image is used.
	PullCredentialRef  string
	// SupplementalGIDs is sorted and de-duplicated; each becomes a --group-add.
	SupplementalGIDs   []int
	// GPUCount is 0 for no GPUs, -1 for all GPUs, or a positive device count.
	GPUCount           int
	// CPUCores and MemoryGiB are limits; 0 means no limit flag is passed.
	CPUCores           int
	MemoryGiB          int
	// PythonPackages are pip-installed inside the container once it is ready.
	PythonPackages     []string
	// CommandArgs are appended after the image reference on the run command.
	CommandArgs        []string
}

// ociWorkloadMount describes one bind mount passed to the runtime as
// --mount type=bind,src=Source,dst=Target[,readonly].
type ociWorkloadMount struct {
	// Source is the cleaned absolute host path.
	Source   string
	// Target is the cleaned absolute path inside the container.
	Target   string
	// ReadOnly appends ",readonly" to the mount specification.
	ReadOnly bool
}

// ociWorkloadEndpoint describes one network endpoint of a workload container.
type ociWorkloadEndpoint struct {
	// Name is the caller-supplied label, trimmed but otherwise unvalidated.
	Name          string
	// Protocol is "tcp" or "http" (lower-cased; defaults to "tcp").
	Protocol      string
	// ExposureMode is lower-cased. "platform_proxy" publishes on 0.0.0.0;
	// every other value publishes on 127.0.0.1.
	ExposureMode  string
	// ContainerPort is the port inside the container (1-65535).
	ContainerPort int
	// HostPort is the published host port; 0 means the port is exposed but
	// not published.
	HostPort      int
}

// ociWorkloadControlInput holds the validated parameters of a control task.
type ociWorkloadControlInput struct {
	// WorkloadInstanceID is a UUID in canonical string form.
	WorkloadInstanceID string
	// Action is one of "start", "stop", "restart", or "status".
	Action             string
	// Runtime is an optional approved runtime name; "" means no preference.
	Runtime            string
}

// ociWorkloadContainerState is a snapshot of a container's state as returned
// by inspectOCIWorkloadState and waitOCIWorkloadReady. All fields are strings
// as reported by the runtime.
type ociWorkloadContainerState struct {
	// Status is the container status; "exited" and "dead" are treated as
	// failures after a Python package install.
	Status string
	// Health is the health-check status; "unhealthy" is treated as a failure.
	Health string
	// Exit is the exit code, reported to callers as "exit_code".
	Exit   string
	// Error and OOM are not read in this part of the file.
	// NOTE(review): presumably the runtime's error text and OOM-killed flag — confirm.
	Error  string
	OOM    string
}

// ociWorkloadRemoveInput holds the validated parameters of a remove task.
type ociWorkloadRemoveInput struct {
	// WorkloadInstanceID is a UUID in canonical string form.
	WorkloadInstanceID string
	// RemoveScratch requests that ociWorkloadRemoveScratch be called after the
	// container has been removed.
	RemoveScratch      bool
	// Runtime is an optional approved runtime name; "" means no preference.
	Runtime            string
}

// ociWorkloadPullCredential is a registry login read from a credential file.
type ociWorkloadPullCredential struct {
	// RegistryHost is the registry to log in to; it contains no '/' or
	// whitespace.
	RegistryHost string
	// Username is passed to the runtime's login command via -u.
	Username     string
	// Password is written to the login command's stdin, never to its argv.
	Password     string
}

// Validation and redaction patterns, compiled once at package initialisation.
var (
	// Matches a full container name: "gpuaas-oci-" plus 32 lowercase hex digits.
	ociContainerNamePattern = regexp.MustCompile(`^gpuaas-oci-[a-f0-9]{32}$`)
	// Matches a sha256 digest suffix. Only the end of the string is anchored,
	// so the part before '@' is not constrained by this pattern.
	ociImageDigestPattern   = regexp.MustCompile(`@sha256:[a-f0-9]{64}$`)
	// Matches an environment variable name: uppercase letters, digits and
	// underscores, not starting with a digit, at most 128 characters.
	ociEnvKeyPattern        = regexp.MustCompile(`^[A-Z_][A-Z0-9_]{0,127}$`)
	// Matches a non-empty string of decimal digits.
	ociNumericIDPattern     = regexp.MustCompile(`^[0-9]+$`)
	// Matches "token=<value>" (case-insensitive) up to the next '&' or
	// whitespace; group 1 is the "token=" prefix.
	ociURLTokenPattern      = regexp.MustCompile(`(?i)(token=)[^&\s]+`)
	// Matches "--ServerApp.token=<value>" (case-insensitive) up to the next
	// whitespace; group 1 is the flag prefix.
	ociServerTokenPattern   = regexp.MustCompile(`(?i)(--ServerApp\.token=)[^\s]+`)
)

// Indirections for host-touching helpers. The launch, remove and preflight
// code calls these variables rather than the functions directly.
// NOTE(review): presumably so tests can substitute fakes — confirm.
var (
	ociWorkloadUserIdentityLookup     = lookupOCIWorkloadUserIdentity
	ociWorkloadPrepareMounts          = prepareOCIWorkloadMounts
	ociWorkloadPrepareIdentityFiles   = prepareOCIWorkloadIdentityFiles
	ociWorkloadRemoveScratch          = removeOCIWorkloadScratch
	ociWorkloadCheckHostPortAvailable = checkOCIWorkloadHostPortAvailable
	ociWorkloadHostCPUCount           = runtime.NumCPU
	ociWorkloadHostMemoryGiB          = detectOCIWorkloadHostMemoryGiB
)

// Host layout settings.
var (
	// ociWorkloadHomeRoot is the directory that holds user home directories.
	ociWorkloadHomeRoot = "/home"
	// ociWorkloadCredentialRootEnv names the environment variable holding a
	// comma-separated list of directories pull credential files may live in.
	ociWorkloadCredentialRootEnv = "GPUAAS_OCI_PULL_CREDENTIAL_ROOTS"
)

// Upper bounds on individual runtime operations.
var (
	ociWorkloadImageInspectTimeout = 15 * time.Second
	ociWorkloadImagePullTimeout    = 10 * time.Minute
	ociWorkloadReadyTimeout        = 30 * time.Second
)

// handleOCIWorkloadLaunchTask is the node task entry point for launching an
// OCI workload. It validates task.Params and, when they are acceptable, hands
// the parsed input to runOCIWorkloadLaunch. A validation error is returned
// unchanged with a nil result.
func handleOCIWorkloadLaunchTask(ctx context.Context, task nodeTask) (map[string]any, error) {
	launchInput, parseErr := parseOCIWorkloadLaunchInput(task.Params)
	if parseErr == nil {
		return runOCIWorkloadLaunch(ctx, launchInput)
	}
	return nil, parseErr
}

// handleOCIWorkloadControlTask is the node task entry point for start, stop,
// restart and status requests. It validates task.Params and, when they are
// acceptable, hands the parsed input to runOCIWorkloadControl. A validation
// error is returned unchanged with a nil result.
func handleOCIWorkloadControlTask(ctx context.Context, task nodeTask) (map[string]any, error) {
	controlInput, parseErr := parseOCIWorkloadControlInput(task.Params)
	if parseErr == nil {
		return runOCIWorkloadControl(ctx, controlInput)
	}
	return nil, parseErr
}

// handleOCIWorkloadRemoveTask is the node task entry point for removing a
// workload container. It validates task.Params and, when they are acceptable,
// hands the parsed input to runOCIWorkloadRemove. A validation error is
// returned unchanged with a nil result.
func handleOCIWorkloadRemoveTask(ctx context.Context, task nodeTask) (map[string]any, error) {
	removeInput, parseErr := parseOCIWorkloadRemoveInput(task.Params)
	if parseErr == nil {
		return runOCIWorkloadRemove(ctx, removeInput)
	}
	return nil, parseErr
}

// parseOCIWorkloadLaunchInput validates the raw launch task parameters and
// converts them into an ociWorkloadLaunchInput.
//
// Required parameters are workload_instance_id and allocation_id (UUIDs),
// username, and image_digest_ref. container_name is optional but, when given,
// must equal the name derived from workload_instance_id. The remaining
// parameters (env, mounts, python_packages, endpoints, gpu_request,
// resource_limits, container_supplemental_gids, command_args,
// pull_credential_ref) are optional and delegated to their own parsers.
//
// The first validation failure is returned together with a zero-value input.
// On success a summary of the parsed input is logged; argument values are
// never logged, only counts and the presence of selected Jupyter flags.
func parseOCIWorkloadLaunchInput(params map[string]any) (ociWorkloadLaunchInput, error) {
	workloadID, err := requiredUUIDParam(params, "workload_instance_id")
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	allocationID, err := requiredUUIDParam(params, "allocation_id")
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	username, _ := params["username"].(string)
	username = strings.TrimSpace(username)
	if username == "" {
		return ociWorkloadLaunchInput{}, fmt.Errorf("missing username")
	}
	if !usernameOnNodePattern.MatchString(username) {
		return ociWorkloadLaunchInput{}, fmt.Errorf("invalid username")
	}
	imageDigestRef, _ := params["image_digest_ref"].(string)
	imageDigestRef = strings.TrimSpace(imageDigestRef)
	if imageDigestRef == "" {
		return ociWorkloadLaunchInput{}, fmt.Errorf("missing image_digest_ref")
	}
	// The reference is later passed to the runtime CLI as a positional
	// argument, and ociImageDigestPattern only anchors the digest suffix. A
	// value such as "--privileged@sha256:<hex>" would therefore be parsed by
	// the CLI as an option rather than as an image, so option-like and
	// whitespace-containing references are rejected here.
	if strings.HasPrefix(imageDigestRef, "-") || strings.ContainsAny(imageDigestRef, " \t\r\n\x00") {
		return ociWorkloadLaunchInput{}, fmt.Errorf("invalid image_digest_ref")
	}
	if !ociImageDigestPattern.MatchString(imageDigestRef) {
		return ociWorkloadLaunchInput{}, fmt.Errorf("image_digest_ref must be digest-pinned with sha256")
	}
	containerName, _ := params["container_name"].(string)
	containerName = strings.TrimSpace(containerName)
	if containerName == "" {
		containerName = generatedOCIContainerName(workloadID)
	}
	if !ociContainerNamePattern.MatchString(containerName) {
		return ociWorkloadLaunchInput{}, fmt.Errorf("invalid container_name")
	}
	// Callers may not pick an arbitrary name: it has to be the one derived
	// from the workload ID, which is also what control and remove tasks use.
	if containerName != generatedOCIContainerName(workloadID) {
		return ociWorkloadLaunchInput{}, fmt.Errorf("container_name must match workload_instance_id")
	}
	pullCredentialRef, _ := params["pull_credential_ref"].(string)
	pullCredentialRef = strings.TrimSpace(pullCredentialRef)
	env, err := parseOCIWorkloadEnv(params["env"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	mounts, err := parseOCIWorkloadMounts(params["mounts"], username, workloadID)
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	pythonPackages, err := parseOCIWorkloadPythonPackages(params["python_packages"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	endpoints, err := parseOCIWorkloadEndpoints(params["endpoints"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	gpuCount, err := parseOCIWorkloadGPURequest(params["gpu_request"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	cpuCores, memoryGiB, err := parseOCIWorkloadResourceLimits(params["resource_limits"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	supplementalGIDs, err := parseOCIWorkloadSupplementalGIDs(params["container_supplemental_gids"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	commandArgs, err := parseOCIWorkloadCommandArgs(params["command_args"])
	if err != nil {
		return ociWorkloadLaunchInput{}, err
	}
	slog.Info("node agent oci launch input parsed",
		"workload_instance_id", workloadID,
		"container_name", containerName,
		"has_command_args", len(commandArgs) > 0,
		"command_arg_count", len(commandArgs),
		"jupyter_base_url_present", ociWorkloadArgsHavePrefix(commandArgs, "--ServerApp.base_url="),
		"jupyter_trust_xheaders_present", ociWorkloadArgsHave(commandArgs, "--ServerApp.trust_xheaders=True"),
		"jupyter_token_flag_present", ociWorkloadArgsHavePrefix(commandArgs, "--ServerApp.token="),
	)
	return ociWorkloadLaunchInput{
		WorkloadInstanceID: workloadID,
		AllocationID:       allocationID,
		Username:           username,
		ImageDigestRef:     imageDigestRef,
		ContainerName:      containerName,
		Env:                env,
		Mounts:             mounts,
		Endpoints:          endpoints,
		PullCredentialRef:  pullCredentialRef,
		SupplementalGIDs:   supplementalGIDs,
		GPUCount:           gpuCount,
		CPUCores:           cpuCores,
		MemoryGiB:          memoryGiB,
		PythonPackages:     pythonPackages,
		CommandArgs:        commandArgs,
	}, nil
}

// ociWorkloadArgsHave reports whether any element of args equals want once
// surrounding whitespace has been trimmed from the element.
func ociWorkloadArgsHave(args []string, want string) bool {
	for i := 0; i < len(args); i++ {
		if strings.TrimSpace(args[i]) != want {
			continue
		}
		return true
	}
	return false
}

// ociWorkloadArgsHavePrefix reports whether any element of args starts with
// prefix once leading and trailing whitespace has been trimmed from the
// element.
func ociWorkloadArgsHavePrefix(args []string, prefix string) bool {
	for i := 0; i < len(args); i++ {
		candidate := strings.TrimSpace(args[i])
		if !strings.HasPrefix(candidate, prefix) {
			continue
		}
		return true
	}
	return false
}

// parseOCIWorkloadControlInput validates the parameters of a control task.
//
// workload_instance_id must be a UUID, action must be one of start, stop,
// restart or status (case-insensitive, surrounding whitespace ignored), and
// runtime, when present, must name an approved runtime. The first failure is
// returned together with a zero-value input.
func parseOCIWorkloadControlInput(params map[string]any) (ociWorkloadControlInput, error) {
	id, err := requiredUUIDParam(params, "workload_instance_id")
	if err != nil {
		return ociWorkloadControlInput{}, err
	}

	rawAction, _ := params["action"].(string)
	verb := strings.TrimSpace(strings.ToLower(rawAction))
	if verb != "start" && verb != "stop" && verb != "restart" && verb != "status" {
		return ociWorkloadControlInput{}, fmt.Errorf("invalid action")
	}

	runtimeName, err := optionalApprovedOCIRuntimeParam(params, "runtime")
	if err != nil {
		return ociWorkloadControlInput{}, err
	}

	var parsed ociWorkloadControlInput
	parsed.WorkloadInstanceID = id
	parsed.Action = verb
	parsed.Runtime = runtimeName
	return parsed, nil
}

// parseOCIWorkloadRemoveInput validates the parameters of a remove task.
//
// workload_instance_id must be a UUID and runtime, when present, must name an
// approved runtime. remove_scratch is honoured only when it is a boolean; any
// other type is treated as false.
func parseOCIWorkloadRemoveInput(params map[string]any) (ociWorkloadRemoveInput, error) {
	id, err := requiredUUIDParam(params, "workload_instance_id")
	if err != nil {
		return ociWorkloadRemoveInput{}, err
	}
	runtimeName, err := optionalApprovedOCIRuntimeParam(params, "runtime")
	if err != nil {
		return ociWorkloadRemoveInput{}, err
	}

	var parsed ociWorkloadRemoveInput
	parsed.WorkloadInstanceID = id
	parsed.Runtime = runtimeName
	if flag, isBool := params["remove_scratch"].(bool); isBool {
		parsed.RemoveScratch = flag
	}
	return parsed, nil
}

// optionalApprovedOCIRuntimeParam reads params[name] as an optional runtime
// name. The value is lower-cased and trimmed. A missing, non-string or blank
// value yields "" with no error; a value matching the Name of an entry in
// approvedOCIRuntimeProbes is returned as is; anything else is an error.
func optionalApprovedOCIRuntimeParam(params map[string]any, name string) (string, error) {
	raw, _ := params[name].(string)
	requested := strings.TrimSpace(strings.ToLower(raw))
	if requested == "" {
		return "", nil
	}
	approved := false
	for _, probe := range approvedOCIRuntimeProbes {
		if probe.Name == requested {
			approved = true
			break
		}
	}
	if !approved {
		return "", fmt.Errorf("invalid %s", name)
	}
	return requested, nil
}

// requiredUUIDParam reads params[name] as a mandatory UUID and returns it in
// the canonical form produced by uuid.UUID.String. A missing, non-string or
// blank value yields "missing <name>"; a value uuid.Parse rejects yields
// "invalid <name>".
func requiredUUIDParam(params map[string]any, name string) (string, error) {
	raw, _ := params[name].(string)
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return "", fmt.Errorf("missing %s", name)
	}
	id, parseErr := uuid.Parse(trimmed)
	if parseErr == nil {
		return id.String(), nil
	}
	return "", fmt.Errorf("invalid %s", name)
}

// parseOCIWorkloadEnv validates the "env" launch parameter.
//
// A nil value yields an empty, non-nil map. Otherwise raw must be an object
// whose keys (after trimming) match ociEnvKeyPattern and whose values are
// strings of at most 4096 bytes containing no NUL byte.
func parseOCIWorkloadEnv(raw any) (map[string]string, error) {
	if raw == nil {
		return map[string]string{}, nil
	}
	entries, isObject := raw.(map[string]any)
	if !isObject {
		return nil, fmt.Errorf("env must be an object")
	}
	parsed := make(map[string]string, len(entries))
	for rawKey, rawValue := range entries {
		name := strings.TrimSpace(rawKey)
		if !ociEnvKeyPattern.MatchString(name) {
			return nil, fmt.Errorf("invalid env key")
		}
		str, isString := rawValue.(string)
		switch {
		case !isString:
			return nil, fmt.Errorf("env values must be strings")
		case strings.ContainsRune(str, '\x00'), len(str) > 4096:
			return nil, fmt.Errorf("invalid env value")
		}
		parsed[name] = str
	}
	return parsed, nil
}

// parseOCIWorkloadMounts validates the "mounts" launch parameter.
//
// A nil value yields (nil, nil). Otherwise raw must be an array of objects
// with "source", "target" and an optional boolean "read_only". After cleaning:
//   - source must be an absolute path strictly below
//     /home/<username>/.gpuaas/workloads/<workloadID>/;
//   - target must be /workspace or a path below it;
//   - neither path may contain a comma, a double quote, a line break or a
//     NUL byte.
//
// The character restriction exists because the paths are later concatenated
// into a comma-separated "--mount type=bind,src=...,dst=..." specification.
// Without it, a target such as "/workspace/x,src=/etc" would add extra fields
// to that specification and could replace the validated source.
//
// NOTE(review): the allowed prefix is built from a literal "/home" — confirm
// this is the intended root on every node.
func parseOCIWorkloadMounts(raw any, username, workloadID string) ([]ociWorkloadMount, error) {
	if raw == nil {
		return nil, nil
	}
	items, ok := raw.([]any)
	if !ok {
		return nil, fmt.Errorf("mounts must be an array")
	}
	// Field separator, CSV quote, and characters that can never be part of a
	// well-formed single-line mount specification.
	const unsafeMountChars = ",\"\r\n\x00"
	mounts := make([]ociWorkloadMount, 0, len(items))
	// The trailing separator makes the prefix test reject sibling directories
	// such as ".../workloads/<id>-other".
	allowedPrefix := filepath.Join("/home", username, ".gpuaas", "workloads", workloadID) + string(filepath.Separator)
	for _, item := range items {
		rawMount, ok := item.(map[string]any)
		if !ok {
			return nil, fmt.Errorf("mount entries must be objects")
		}
		source, _ := rawMount["source"].(string)
		// Clean first so that ".." segments cannot step out of the prefix.
		source = filepath.Clean(strings.TrimSpace(source))
		if source == "." || !filepath.IsAbs(source) || !strings.HasPrefix(source, allowedPrefix) {
			return nil, fmt.Errorf("mount source must be workload-scoped")
		}
		if strings.ContainsAny(source, unsafeMountChars) {
			return nil, fmt.Errorf("mount source contains unsupported characters")
		}
		target, _ := rawMount["target"].(string)
		target = filepath.Clean(strings.TrimSpace(target))
		if target == "." || !filepath.IsAbs(target) || (target != "/workspace" && !strings.HasPrefix(target, "/workspace/")) {
			return nil, fmt.Errorf("mount target must be under /workspace")
		}
		if strings.ContainsAny(target, unsafeMountChars) {
			return nil, fmt.Errorf("mount target contains unsupported characters")
		}
		readOnly, _ := rawMount["read_only"].(bool)
		mounts = append(mounts, ociWorkloadMount{Source: source, Target: target, ReadOnly: readOnly})
	}
	return mounts, nil
}

// parseOCIWorkloadPythonPackages validates the "python_packages" launch
// parameter.
//
// A nil value yields (nil, nil). Otherwise raw must be an array of strings.
// Entries are trimmed; blank entries and repeats are dropped while first-seen
// order is kept. An entry is rejected when it starts with '-', is longer than
// 128 bytes, or contains whitespace or one of the listed shell
// metacharacters. More than 100 distinct entries is an error.
func parseOCIWorkloadPythonPackages(raw any) ([]string, error) {
	if raw == nil {
		return nil, nil
	}
	entries, isArray := raw.([]any)
	if !isArray {
		return nil, fmt.Errorf("python_packages must be an array")
	}
	const forbidden = " \t\r\n;&|`$\\\"'(){}"
	unique := make([]string, 0, len(entries))
	known := map[string]struct{}{}
	for _, entry := range entries {
		spec, isString := entry.(string)
		if !isString {
			return nil, fmt.Errorf("python_packages entries must be strings")
		}
		spec = strings.TrimSpace(spec)
		if spec == "" {
			continue
		}
		switch {
		case strings.HasPrefix(spec, "-"), len(spec) > 128, strings.ContainsAny(spec, forbidden):
			return nil, fmt.Errorf("invalid python package specifier")
		}
		if _, repeated := known[spec]; !repeated {
			known[spec] = struct{}{}
			unique = append(unique, spec)
		}
	}
	if len(unique) > 100 {
		return nil, fmt.Errorf("too many python package specifiers")
	}
	return unique, nil
}

// parseOCIWorkloadCommandArgs validates the "command_args" launch parameter.
//
// A nil value yields (nil, nil). Otherwise raw must be an array of at most 64
// strings. Each entry is trimmed and must then be non-empty, at most 512
// bytes long and free of NUL bytes. Order and duplicates are preserved.
func parseOCIWorkloadCommandArgs(raw any) ([]string, error) {
	if raw == nil {
		return nil, nil
	}
	entries, isArray := raw.([]any)
	switch {
	case !isArray:
		return nil, fmt.Errorf("command_args must be an array")
	case len(entries) > 64:
		return nil, fmt.Errorf("too many command_args")
	}
	parsed := make([]string, 0, len(entries))
	for i := range entries {
		text, isString := entries[i].(string)
		if !isString {
			return nil, fmt.Errorf("command_args entries must be strings")
		}
		text = strings.TrimSpace(text)
		acceptable := text != "" && len(text) <= 512 && !strings.ContainsRune(text, '\x00')
		if !acceptable {
			return nil, fmt.Errorf("invalid command_args entry")
		}
		parsed = append(parsed, text)
	}
	return parsed, nil
}

// parseOCIWorkloadSupplementalGIDs validates the
// "container_supplemental_gids" launch parameter.
//
// A nil value yields (nil, nil). Otherwise the value is decoded through
// intSliceParam, may hold at most 16 entries before de-duplication, and every
// entry must lie in 1..65535. The result is de-duplicated and sorted
// ascending.
func parseOCIWorkloadSupplementalGIDs(raw any) ([]int, error) {
	if raw == nil {
		return nil, nil
	}
	const paramName = "container_supplemental_gids"
	gids, err := intSliceParam(map[string]any{paramName: raw}, paramName)
	if err != nil {
		return nil, err
	}
	if len(gids) > 16 {
		return nil, fmt.Errorf("too many container_supplemental_gids")
	}
	distinct := make([]int, 0, len(gids))
	present := make(map[int]struct{}, len(gids))
	for i := range gids {
		gid := gids[i]
		if gid < 1 || gid > 65535 {
			return nil, fmt.Errorf("container_supplemental_gids out of bounds")
		}
		if _, repeated := present[gid]; repeated {
			continue
		}
		present[gid] = struct{}{}
		distinct = append(distinct, gid)
	}
	sort.Ints(distinct)
	return distinct, nil
}

// parseOCIWorkloadEndpoints validates the "endpoints" launch parameter.
//
// A nil value yields (nil, nil). Otherwise raw must be an array of objects.
// For each entry:
//   - name is trimmed and otherwise unchecked;
//   - protocol is lower-cased, defaults to "tcp", and must be "tcp" or "http";
//   - container_port is required and must lie in 1..65535;
//   - host_port is optional and, when present, must lie in 1..65535;
//   - exposure_mode is lower-cased; when empty it becomes "private" if a host
//     port was given and "platform_proxy" otherwise;
//   - a "platform_proxy" endpoint without a host port is published on the
//     same number as its container port.
func parseOCIWorkloadEndpoints(raw any) ([]ociWorkloadEndpoint, error) {
	if raw == nil {
		return nil, nil
	}
	entries, isArray := raw.([]any)
	if !isArray {
		return nil, fmt.Errorf("endpoints must be an array")
	}
	parsed := make([]ociWorkloadEndpoint, 0, len(entries))
	for _, entry := range entries {
		fields, isObject := entry.(map[string]any)
		if !isObject {
			return nil, fmt.Errorf("endpoint entries must be objects")
		}
		var ep ociWorkloadEndpoint

		rawName, _ := fields["name"].(string)
		ep.Name = strings.TrimSpace(rawName)

		rawProtocol, _ := fields["protocol"].(string)
		ep.Protocol = strings.TrimSpace(strings.ToLower(rawProtocol))
		if ep.Protocol == "" {
			ep.Protocol = "tcp"
		}
		if ep.Protocol != "tcp" && ep.Protocol != "http" {
			return nil, fmt.Errorf("invalid endpoint protocol")
		}

		rawMode, _ := fields["exposure_mode"].(string)
		ep.ExposureMode = strings.TrimSpace(strings.ToLower(rawMode))

		port, err := ociIntParam(fields["container_port"])
		if err != nil || port < 1 || port > 65535 {
			return nil, fmt.Errorf("invalid endpoint container_port")
		}
		ep.ContainerPort = port

		if rawHostPort := fields["host_port"]; rawHostPort != nil {
			port, err = ociIntParam(rawHostPort)
			if err != nil || port < 1 || port > 65535 {
				return nil, fmt.Errorf("invalid endpoint host_port")
			}
			ep.HostPort = port
		}

		if ep.ExposureMode == "" {
			ep.ExposureMode = "platform_proxy"
			if ep.HostPort > 0 {
				ep.ExposureMode = "private"
			}
		}
		if ep.ExposureMode == "platform_proxy" && ep.HostPort == 0 {
			ep.HostPort = ep.ContainerPort
		}
		parsed = append(parsed, ep)
	}
	return parsed, nil
}

// parseOCIWorkloadGPURequest validates the "gpu_request" launch parameter and
// returns the GPU count to request.
//
// The result is 0 for no GPUs (nil input, or kind "" / "none"), -1 for kind
// "all", and the positive "count" value for kind "count". The kind is
// compared case-insensitively after trimming. Any other kind, a non-object
// value, or a count below 1 is an error.
func parseOCIWorkloadGPURequest(raw any) (int, error) {
	if raw == nil {
		return 0, nil
	}
	spec, isObject := raw.(map[string]any)
	if !isObject {
		return 0, fmt.Errorf("gpu_request must be an object")
	}
	rawKind, _ := spec["kind"].(string)
	kind := strings.TrimSpace(strings.ToLower(rawKind))
	if kind == "" || kind == "none" {
		return 0, nil
	}
	if kind == "all" {
		return -1, nil
	}
	if kind != "count" {
		return 0, fmt.Errorf("unsupported gpu_request kind")
	}
	requested, err := ociIntParam(spec["count"])
	if err != nil || requested < 1 {
		return 0, fmt.Errorf("gpu_request count must be a positive integer")
	}
	return requested, nil
}

// parseOCIWorkloadResourceLimits validates the "resource_limits" launch
// parameter and returns (cpuCores, memoryGiB).
//
// A nil value yields (0, 0). Each limit is optional and reported as 0 when
// absent. When present, cpu_cores must lie in 1..1024 and memory_gib in
// 1..1048576. cpu_cores is checked before memory_gib.
func parseOCIWorkloadResourceLimits(raw any) (int, int, error) {
	if raw == nil {
		return 0, 0, nil
	}
	limits, isObject := raw.(map[string]any)
	if !isObject {
		return 0, 0, fmt.Errorf("resource_limits must be an object")
	}
	// boundedInt reads an optional limit; valid is false only when the key is
	// present but not an integer in 1..upper.
	boundedInt := func(key string, upper int) (value int, valid bool) {
		if limits[key] == nil {
			return 0, true
		}
		n, err := ociIntParam(limits[key])
		return n, err == nil && n >= 1 && n <= upper
	}

	cores, valid := boundedInt("cpu_cores", 1024)
	if !valid {
		return 0, 0, fmt.Errorf("invalid resource_limits cpu_cores")
	}
	memory, valid := boundedInt("memory_gib", 1048576)
	if !valid {
		return 0, 0, fmt.Errorf("invalid resource_limits memory_gib")
	}
	return cores, memory, nil
}

// ociIntParam converts a loosely typed task parameter into an int.
//
// Accepted inputs are int, int32, int64, float64 holding an integral value
// (the form JSON numbers decode to), and strings holding a decimal integer
// with optional surrounding whitespace. Everything else, including nil,
// booleans, fractional, NaN and infinite floats, is an error.
//
// Floats are only accepted within ±2^53, the range in which every integer is
// exactly representable as a float64. This check comes before the conversion
// because converting an out-of-range float to an integer is
// implementation-defined in Go and would otherwise let some huge values pass
// on some platforms. Values that do not fit in int on 32-bit builds are
// rejected instead of being truncated.
func ociIntParam(raw any) (int, error) {
	switch v := raw.(type) {
	case int:
		return v, nil
	case int32:
		return int(v), nil
	case int64:
		if int64(int(v)) != v {
			return 0, fmt.Errorf("integer out of range")
		}
		return int(v), nil
	case float64:
		const maxExactFloatInt = 1 << 53
		// Written as a negated conjunction so that NaN, for which every
		// comparison is false, is rejected too.
		if !(v >= -maxExactFloatInt && v <= maxExactFloatInt) {
			return 0, fmt.Errorf("integer out of range")
		}
		whole := int64(v)
		if float64(whole) != v {
			return 0, fmt.Errorf("not an integer")
		}
		if int64(int(whole)) != whole {
			return 0, fmt.Errorf("integer out of range")
		}
		return int(whole), nil
	case string:
		return strconv.Atoi(strings.TrimSpace(v))
	default:
		return 0, fmt.Errorf("not an integer")
	}
}

// runOCIWorkloadLaunch starts a workload container from validated input and
// returns a description of the running container.
//
// Steps, in order:
//  1. pick the first reachable approved runtime;
//  2. resolve the user's identity, prepare mounts and identity files;
//  3. preflight: host ports, CPU/memory capacity, runtime feature support
//     (GPU requests and Python package installs require docker), disk;
//  4. optional registry login, and for docker an image inspect/pull;
//  5. "run --detach" with restart policy unless-stopped;
//  6. wait for the container to become ready;
//  7. optionally pip-install the requested Python packages and re-inspect.
//
// All runtime feature checks happen in step 3, before anything is started, so
// a request the selected runtime cannot satisfy never leaves a container
// behind. Failures after step 5 do not remove the container; when available,
// diagnostic detail from ociWorkloadFailureDetail is appended to the error.
func runOCIWorkloadLaunch(ctx context.Context, in ociWorkloadLaunchInput) (map[string]any, error) {
	// Named runtimeName rather than runtime to avoid shadowing the imported
	// runtime package inside this function.
	runtimeName, path, err := selectReachableOCIRuntime(ctx)
	if err != nil {
		return nil, err
	}
	userIdentity, err := ociWorkloadUserIdentityLookup(in.Username)
	if err != nil {
		return nil, err
	}
	if err := ociWorkloadPrepareMounts(in.Mounts, userIdentity); err != nil {
		return nil, err
	}
	identityMounts, err := ociWorkloadPrepareIdentityFiles(in.Username, in.WorkloadInstanceID, userIdentity)
	if err != nil {
		return nil, err
	}
	if err := checkOCIWorkloadEndpointHostPorts(in.Endpoints); err != nil {
		return nil, err
	}
	if err := checkOCIWorkloadResourceCapacity(in.CPUCores, in.MemoryGiB); err != nil {
		return nil, err
	}
	if in.GPUCount != 0 && runtimeName != "docker" {
		return nil, fmt.Errorf("gpu_request is currently supported only with docker")
	}
	// Checked before launch: failing after "run" would return an error while
	// leaving a started container with an unless-stopped restart policy.
	if len(in.PythonPackages) > 0 && runtimeName != "docker" {
		return nil, fmt.Errorf("python package install is currently supported only with docker")
	}
	if err := checkNodeDiskPreflight(); err != nil {
		return nil, err
	}
	if in.PullCredentialRef != "" {
		if err := loginOCIWorkloadPullCredential(ctx, path, in.PullCredentialRef); err != nil {
			return nil, err
		}
	}
	if runtimeName == "docker" {
		if err := ensureOCIWorkloadImageAvailable(ctx, path, in.ImageDigestRef); err != nil {
			return nil, err
		}
	}
	args := []string{
		"run", "--detach",
		"--name", in.ContainerName,
		"--label", "gpuaas.workload_instance_id=" + in.WorkloadInstanceID,
		"--label", "gpuaas.allocation_id=" + in.AllocationID,
		"--user", userIdentity,
		"--restart", "unless-stopped",
	}
	if workdir := ociWorkloadWorkingDir(in.Mounts); workdir != "" {
		args = append(args, "--workdir", workdir)
	}
	for _, gid := range in.SupplementalGIDs {
		args = append(args, "--group-add", strconv.Itoa(gid))
	}
	// GPUCount < 0 is the parser's encoding of "all GPUs".
	if in.GPUCount < 0 {
		args = append(args, "--gpus", "all")
	} else if in.GPUCount > 0 {
		args = append(args, "--gpus", strconv.Itoa(in.GPUCount))
	}
	if in.CPUCores > 0 {
		args = append(args, "--cpus", strconv.Itoa(in.CPUCores))
	}
	if in.MemoryGiB > 0 {
		args = append(args, "--memory", strconv.Itoa(in.MemoryGiB)+"g")
	}
	// Map iteration order is random; sort so the command line is stable.
	envKeys := make([]string, 0, len(in.Env))
	for key := range in.Env {
		envKeys = append(envKeys, key)
	}
	sort.Strings(envKeys)
	for _, key := range envKeys {
		args = append(args, "--env", key+"="+in.Env[key])
	}
	for _, mount := range in.Mounts {
		spec := "type=bind,src=" + mount.Source + ",dst=" + mount.Target
		if mount.ReadOnly {
			spec += ",readonly"
		}
		args = append(args, "--mount", spec)
	}
	// Identity files are always mounted read-only.
	for _, mount := range identityMounts {
		spec := "type=bind,src=" + mount.Source + ",dst=" + mount.Target + ",readonly"
		args = append(args, "--mount", spec)
	}
	for _, endpoint := range in.Endpoints {
		args = append(args, "--expose", fmt.Sprintf("%d/%s", endpoint.ContainerPort, "tcp"))
		if endpoint.HostPort > 0 {
			args = append(args, "--publish", ociWorkloadEndpointPublishSpec(endpoint))
		}
	}
	// The image reference ends the runtime's own options; everything after it
	// is the container command.
	args = append(args, in.ImageDigestRef)
	args = append(args, in.CommandArgs...)
	out, err := ociRuntimeCommandContext(ctx, path, args...).CombinedOutput()
	if err != nil {
		detail := ociWorkloadFailureDetail(ctx, path, in.ContainerName)
		if detail != "" {
			return nil, fmt.Errorf("launch OCI workload with %s: %s; %s", runtimeName, compactProbeOutput(out, err), detail)
		}
		return nil, fmt.Errorf("launch OCI workload with %s: %s", runtimeName, compactProbeOutput(out, err))
	}
	state, err := waitOCIWorkloadReady(ctx, path, in.ContainerName)
	if err != nil {
		detail := ociWorkloadFailureDetail(ctx, path, in.ContainerName)
		if detail != "" {
			return nil, fmt.Errorf("launch OCI workload with %s: %w; %s", runtimeName, err, detail)
		}
		return nil, fmt.Errorf("launch OCI workload with %s: %w", runtimeName, err)
	}
	pythonPackageInstallLog := ""
	if len(in.PythonPackages) > 0 {
		pythonPackageInstallLog, err = installOCIWorkloadPythonPackages(ctx, path, in.ContainerName, in.PythonPackages)
		if err != nil {
			return nil, fmt.Errorf("install Python packages: %w", err)
		}
		// Re-inspect so the reported state reflects the container after the
		// install rather than before it.
		state, err = inspectOCIWorkloadState(ctx, path, in.ContainerName)
		if err != nil {
			return nil, fmt.Errorf("inspect OCI workload state after Python package install: %w", err)
		}
		switch {
		case state.Status == "exited" || state.Status == "dead":
			return nil, fmt.Errorf("container exited after Python package install: status=%s health=%s exit_code=%s", state.Status, state.Health, state.Exit)
		case state.Health == "unhealthy":
			return nil, fmt.Errorf("container became unhealthy after Python package install: status=%s health=%s exit_code=%s", state.Status, state.Health, state.Exit)
		}
	}
	output := map[string]any{
		"launched":                  true,
		"runtime":                   runtimeName,
		"container_name":            in.ContainerName,
		"workload_instance_id":      in.WorkloadInstanceID,
		"endpoints":                 ociWorkloadEndpointOutput(in.Endpoints),
		"container_id":              compactProbeOutput(out, nil),
		"container_status":          state.Status,
		"health_status":             state.Health,
		"exit_code":                 state.Exit,
		"python_packages_installed": len(in.PythonPackages),
	}
	if pythonPackageInstallLog != "" {
		output["python_package_install_log"] = pythonPackageInstallLog
	}
	return output, nil
}

func ensureOCIWorkloadImageAvailable(ctx context.Context, runtimePath, imageDigestRef string) error {
	inspectCtx, inspectCancel := context.WithTimeout(ctx, ociWorkloadImageInspectTimeout)
	out, err := ociRuntimeCommandContext(inspectCtx, runtimePath, "image", "inspect", imageDigestRef).CombinedOutput()
	inspectErr := inspectCtx.Err()
	inspectCancel()
	if err == nil {
		return nil
	}
	if inspectErr != nil {
		return fmt.Errorf("inspect OCI workload image cache timed out after %s: %s", ociWorkloadImageInspectTimeout, compactProbeOutput(out, err))
	}

	pullCtx, pullCancel := context.WithTimeout(ctx, ociWorkloadImagePullTimeout)
	out, err = ociRuntimeCommandContext(pullCtx, runtimePath, "pull", imageDigestRef).CombinedOutput()
	pullErr := pullCtx.Err()
	pullCancel()
	if err == nil {
		return nil
	}
	if pullErr != nil {
		return fmt.Errorf("pull OCI workload image timed out after %s: %s", ociWorkloadImagePullTimeout, compactProbeOutput(out, err))
	}
	return fmt.Errorf("pull OCI workload image: %s", compactProbeOutput(out, err))
}

// loginOCIWorkloadPullCredential resolves credentialRef to a registry
// credential and runs the runtime's "login" command with it. The password is
// supplied on stdin (followed by a newline) via --password-stdin, so it never
// appears in the command's argument list.
func loginOCIWorkloadPullCredential(ctx context.Context, runtimePath, credentialRef string) error {
	cred, err := resolveOCIWorkloadPullCredential(credentialRef)
	if err != nil {
		return err
	}
	loginArgs := []string{"login", cred.RegistryHost, "-u", cred.Username, "--password-stdin"}
	login := ociRuntimeCommandContext(ctx, runtimePath, loginArgs...)
	login.Stdin = strings.NewReader(cred.Password + "\n")
	if output, runErr := login.CombinedOutput(); runErr != nil {
		return fmt.Errorf("login OCI registry for pull credential: %s", compactProbeOutput(output, runErr))
	}
	return nil
}

// resolveOCIWorkloadPullCredential loads the registry credential that
// credentialRef points to.
//
// Only the "file:<absolute path>" scheme is supported. The cleaned path must
// lie inside one of the allowed credential roots, must refer to a regular
// file (symbolic links are followed), and must carry no group or world
// permission bits. The file content is parsed by
// parseOCIWorkloadPullCredentialEnv.
//
// Requiring a regular file, rather than merely "not a directory", keeps FIFOs,
// sockets and device nodes away from os.ReadFile, where reading could block
// indefinitely or never terminate.
func resolveOCIWorkloadPullCredential(credentialRef string) (ociWorkloadPullCredential, error) {
	credentialRef = strings.TrimSpace(credentialRef)
	if credentialRef == "" {
		return ociWorkloadPullCredential{}, fmt.Errorf("missing pull_credential_ref")
	}
	const filePrefix = "file:"
	if !strings.HasPrefix(credentialRef, filePrefix) {
		return ociWorkloadPullCredential{}, fmt.Errorf("unsupported pull_credential_ref scheme")
	}
	path := strings.TrimSpace(strings.TrimPrefix(credentialRef, filePrefix))
	if path == "" {
		return ociWorkloadPullCredential{}, fmt.Errorf("empty pull credential file path")
	}
	// Clean before the root check so ".." segments cannot escape a root.
	path = filepath.Clean(path)
	if !filepath.IsAbs(path) {
		return ociWorkloadPullCredential{}, fmt.Errorf("pull credential file path must be absolute")
	}
	if !ociWorkloadCredentialPathAllowed(path) {
		return ociWorkloadPullCredential{}, fmt.Errorf("pull credential file path is outside allowed roots")
	}
	info, err := os.Stat(path)
	if err != nil {
		return ociWorkloadPullCredential{}, fmt.Errorf("stat pull credential file: %w", err)
	}
	if !info.Mode().IsRegular() {
		return ociWorkloadPullCredential{}, fmt.Errorf("pull credential path must be a file")
	}
	// Any group/world bit (read, write or execute) is refused.
	if info.Mode().Perm()&0o077 != 0 {
		return ociWorkloadPullCredential{}, fmt.Errorf("pull credential file permissions must not be group/world readable")
	}
	data, err := os.ReadFile(path)
	if err != nil {
		return ociWorkloadPullCredential{}, fmt.Errorf("read pull credential file: %w", err)
	}
	credential, err := parseOCIWorkloadPullCredentialEnv(string(data))
	if err != nil {
		return ociWorkloadPullCredential{}, err
	}
	return credential, nil
}

// ociWorkloadCredentialPathAllowed reports whether path equals, or lies
// below, one of the configured credential roots. Roots are cleaned first;
// roots that are empty or not absolute are ignored. The comparison is purely
// lexical, so path is expected to be cleaned by the caller.
func ociWorkloadCredentialPathAllowed(path string) bool {
	separator := string(os.PathSeparator)
	for _, candidate := range ociWorkloadCredentialRoots() {
		cleaned := filepath.Clean(candidate)
		if cleaned == "." || !filepath.IsAbs(cleaned) {
			continue
		}
		switch {
		case path == cleaned, strings.HasPrefix(path, cleaned+separator):
			return true
		}
	}
	return false
}

// ociWorkloadCredentialRoots returns the directories pull credential files
// may be read from. The list comes from the environment variable named by
// ociWorkloadCredentialRootEnv (comma-separated; entries are trimmed and
// blank entries dropped). When the variable is unset or blank a built-in
// default list is returned. A variable holding only separators yields an
// empty list, not the defaults.
func ociWorkloadCredentialRoots() []string {
	configured := strings.TrimSpace(os.Getenv(ociWorkloadCredentialRootEnv))
	if configured == "" {
		return []string{"/etc/gpuaas", "/run/gpuaas", "/var/lib/gpuaas/credentials"}
	}
	entries := strings.Split(configured, ",")
	roots := make([]string, 0, len(entries))
	for i := range entries {
		if trimmed := strings.TrimSpace(entries[i]); trimmed != "" {
			roots = append(roots, trimmed)
		}
	}
	return roots
}

// parseOCIWorkloadPullCredentialEnv extracts a registry credential from the
// text of an env-style file.
//
// Blank lines, lines starting with '#', and lines without '=' are skipped. A
// leading "export " is ignored. Only the three GPUAAS_NODE_AGENT_REGISTRY_*
// keys are read; when a key occurs more than once the last occurrence wins.
// Values are trimmed and unquoted via parseOCIWorkloadCredentialValue. Host
// and username are trimmed once more after unquoting; the password is kept
// exactly as unquoted.
//
// All three values must be non-empty, and the host must contain no '/' or
// whitespace.
func parseOCIWorkloadPullCredentialEnv(data string) (ociWorkloadPullCredential, error) {
	var host, user, password string
	for _, entry := range strings.Split(data, "\n") {
		entry = strings.TrimSpace(entry)
		if entry == "" || strings.HasPrefix(entry, "#") {
			continue
		}
		name, rawValue, hasValue := strings.Cut(strings.TrimPrefix(entry, "export "), "=")
		if !hasValue {
			continue
		}
		switch strings.TrimSpace(name) {
		case "GPUAAS_NODE_AGENT_REGISTRY_HOST":
			host = parseOCIWorkloadCredentialValue(strings.TrimSpace(rawValue))
		case "GPUAAS_NODE_AGENT_REGISTRY_USERNAME":
			user = parseOCIWorkloadCredentialValue(strings.TrimSpace(rawValue))
		case "GPUAAS_NODE_AGENT_REGISTRY_PASSWORD":
			password = parseOCIWorkloadCredentialValue(strings.TrimSpace(rawValue))
		}
	}
	parsed := ociWorkloadPullCredential{
		RegistryHost: strings.TrimSpace(host),
		Username:     strings.TrimSpace(user),
		Password:     password,
	}
	if parsed.RegistryHost == "" || parsed.Username == "" || parsed.Password == "" {
		return ociWorkloadPullCredential{}, fmt.Errorf("pull credential file missing registry host, username, or password")
	}
	if strings.ContainsAny(parsed.RegistryHost, "/ \t\r\n") {
		return ociWorkloadPullCredential{}, fmt.Errorf("pull credential registry host is invalid")
	}
	return parsed, nil
}

// parseOCIWorkloadCredentialValue strips one layer of shell-style quoting
// from value. A value wrapped in single quotes has the quotes removed and
// each '\'' sequence replaced by a single quote; a value wrapped in double
// quotes has the quotes removed and each \" replaced by a double quote. Any
// other value, including one shorter than two bytes or with mismatched
// quotes, is returned unchanged.
func parseOCIWorkloadCredentialValue(value string) string {
	if len(value) < 2 {
		return value
	}
	opening, closing := value[0], value[len(value)-1]
	inner := value[1 : len(value)-1]
	switch {
	case opening == '\'' && closing == '\'':
		return strings.ReplaceAll(inner, `'\''`, `'`)
	case opening == '"' && closing == '"':
		return strings.ReplaceAll(inner, `\"`, `"`)
	}
	return value
}

// ociWorkloadWorkingDir returns "/workspace" when one of the mounts targets
// exactly that path (ignoring surrounding whitespace), and "" otherwise.
func ociWorkloadWorkingDir(mounts []ociWorkloadMount) string {
	const workspace = "/workspace"
	for i := range mounts {
		if strings.TrimSpace(mounts[i].Target) != workspace {
			continue
		}
		return workspace
	}
	return ""
}

// ociWorkloadEndpointOutput converts endpoints into the map form reported in
// task output. Every entry carries name, protocol, exposure_mode and
// container_port; host_port is included only when it is greater than zero.
// The result is never nil.
func ociWorkloadEndpointOutput(endpoints []ociWorkloadEndpoint) []map[string]any {
	result := make([]map[string]any, len(endpoints))
	for i, ep := range endpoints {
		entry := map[string]any{
			"name":           ep.Name,
			"protocol":       ep.Protocol,
			"exposure_mode":  ep.ExposureMode,
			"container_port": ep.ContainerPort,
		}
		if ep.HostPort > 0 {
			entry["host_port"] = ep.HostPort
		}
		result[i] = entry
	}
	return result
}

// ociWorkloadEndpointPublishSpec builds the --publish value for an endpoint
// in the form "<bind address>:<host port>:<container port>/tcp". Endpoints
// whose exposure mode is "platform_proxy" (case-insensitive, trimmed) bind to
// all interfaces; every other mode binds to loopback only.
func ociWorkloadEndpointPublishSpec(endpoint ociWorkloadEndpoint) string {
	bindAddress := "127.0.0.1"
	mode := strings.TrimSpace(strings.ToLower(endpoint.ExposureMode))
	if mode == "platform_proxy" {
		bindAddress = "0.0.0.0"
	}
	return bindAddress + ":" + strconv.Itoa(endpoint.HostPort) + ":" + strconv.Itoa(endpoint.ContainerPort) + "/tcp"
}

// installOCIWorkloadPythonPackages pip-installs packages inside the running
// container and returns the compacted command output. With no packages it
// does nothing and returns "". On failure the error text is the compacted
// command output.
//
// The install runs as user 0:0 and targets the image's active Python
// environment: many curated notebook images run Python from a virtualenv,
// where pip rejects --user installs.
func installOCIWorkloadPythonPackages(ctx context.Context, runtimePath, containerName string, packages []string) (string, error) {
	if len(packages) == 0 {
		return "", nil
	}
	pipCommand := []string{"exec", "--user", "0:0", containerName, "python", "-m", "pip", "install", "--no-cache-dir"}
	execArgs := make([]string, 0, len(pipCommand)+len(packages))
	execArgs = append(execArgs, pipCommand...)
	execArgs = append(execArgs, packages...)

	output, runErr := ociRuntimeCommandContext(ctx, runtimePath, execArgs...).CombinedOutput()
	if runErr == nil {
		return compactProbeOutput(output, nil), nil
	}
	return "", fmt.Errorf("%s", compactProbeOutput(output, runErr))
}

// checkOCIWorkloadEndpointHostPorts verifies, in order, that the host port of
// every endpoint that publishes one passes ociWorkloadCheckHostPortAvailable.
// Endpoints without a host port are skipped. The first failure is returned,
// wrapped with the port number.
func checkOCIWorkloadEndpointHostPorts(endpoints []ociWorkloadEndpoint) error {
	for i := range endpoints {
		port := endpoints[i].HostPort
		if port > 0 {
			if err := ociWorkloadCheckHostPortAvailable(port); err != nil {
				return fmt.Errorf("host port %d unavailable: %w", port, err)
			}
		}
	}
	return nil
}

// checkOCIWorkloadResourceCapacity rejects CPU or memory requests that exceed
// what the host reports. A request of zero or less is not checked, and a host
// value of zero or less (capacity unknown) disables the corresponding check.
// CPU is checked before memory.
func checkOCIWorkloadResourceCapacity(cpuCores, memoryGiB int) error {
	if cpuCores > 0 {
		if available := ociWorkloadHostCPUCount(); available > 0 && available < cpuCores {
			return fmt.Errorf("requested OCI workload cpu_cores=%d exceeds worker capacity cpu_cores=%d", cpuCores, available)
		}
	}
	if memoryGiB > 0 {
		if available := ociWorkloadHostMemoryGiB(); available > 0 && available < memoryGiB {
			return fmt.Errorf("requested OCI workload memory_gib=%d exceeds worker capacity memory_gib=%d", memoryGiB, available)
		}
	}
	return nil
}

// detectOCIWorkloadHostMemoryGiB reads the MemTotal line of /proc/meminfo and
// returns its value converted to whole GiB, rounded down. It returns 0 when
// the file cannot be read, has no MemTotal line, or the value is not a
// positive integer.
func detectOCIWorkloadHostMemoryGiB() int {
	contents, err := os.ReadFile("/proc/meminfo")
	if err != nil {
		return 0
	}
	// The value on the MemTotal line is treated as KiB.
	const kibPerGiB = 1024 * 1024
	for _, entry := range strings.Split(string(contents), "\n") {
		columns := strings.Fields(entry)
		if len(columns) >= 2 && columns[0] == "MemTotal:" {
			totalKiB, convErr := strconv.Atoi(columns[1])
			if convErr != nil || totalKiB <= 0 {
				return 0
			}
			return totalKiB / kibPerGiB
		}
	}
	return 0
}

func checkOCIWorkloadHostPortAvailable(hostPort int) error {
	listener, err := net.Listen("tcp", fmt.Sprintf("127.0.0.1:%d", hostPort))
	if err != nil {
		return err
	}
	return listener.Close()
}

// runOCIWorkloadControl applies a start, stop, restart or status action to
// the container belonging to in.WorkloadInstanceID.
//
// For start and restart on docker the restart policy is first set to
// unless-stopped. The action is then executed; "status" is implemented as an
// inspect of .State.Status. Afterwards the container state is collected: for
// start and restart by waiting for readiness, otherwise from a plain inspect.
// A failure to collect the state is not an error; the state fields are simply
// left out of the result.
func runOCIWorkloadControl(ctx context.Context, in ociWorkloadControlInput) (map[string]any, error) {
	name := generatedOCIContainerName(in.WorkloadInstanceID)
	engine, enginePath, err := selectOCIWorkloadRuntimeForContainer(ctx, name, in.Runtime)
	if err != nil {
		return nil, err
	}
	startsContainer := in.Action == "start" || in.Action == "restart"
	reportsStatus := in.Action == "status"

	if startsContainer && engine == "docker" {
		policyOut, policyErr := ociRuntimeCommandContext(ctx, enginePath, "update", "--restart", "unless-stopped", name).CombinedOutput()
		if policyErr != nil {
			return nil, fmt.Errorf("prepare OCI workload restart policy with %s: %s", engine, compactProbeOutput(policyOut, policyErr))
		}
	}

	var command []string
	if reportsStatus {
		command = []string{"inspect", "--format", "{{.State.Status}}", name}
	} else {
		command = []string{in.Action, name}
	}
	commandOut, err := ociRuntimeCommandContext(ctx, enginePath, command...).CombinedOutput()
	if err != nil {
		return nil, fmt.Errorf("control OCI workload with %s: %s", engine, compactProbeOutput(commandOut, err))
	}

	// The plain inspect always runs; for start and restart its result is then
	// replaced by the outcome of waiting for readiness.
	state, stateErr := inspectOCIWorkloadState(ctx, enginePath, name)
	if startsContainer {
		state, stateErr = waitOCIWorkloadReady(ctx, enginePath, name)
	}

	result := map[string]any{
		"runtime":              engine,
		"container_name":       name,
		"workload_instance_id": in.WorkloadInstanceID,
		"action":               in.Action,
	}
	switch {
	case reportsStatus:
		result["status"] = compactProbeOutput(commandOut, nil)
	default:
		result["applied"] = true
	}
	if stateErr != nil {
		return result, nil
	}
	result["container_status"] = state.Status
	result["health_status"] = state.Health
	result["exit_code"] = state.Exit
	return result, nil
}

// runOCIWorkloadRemove force-removes the container that belongs to
// in.WorkloadInstanceID and, when in.RemoveScratch is set, also deletes the
// workload's scratch directories through ociWorkloadRemoveScratch.
//
// It returns a summary map (removed, runtime, container_name,
// workload_instance_id, remove_scratch, scratch_removed). Any failure from
// runtime selection, the rm command or the scratch cleanup is returned as an
// error with a nil map.
func runOCIWorkloadRemove(ctx context.Context, in ociWorkloadRemoveInput) (map[string]any, error) {
	containerName := generatedOCIContainerName(in.WorkloadInstanceID)
	runtimeName, runtimePath, err := selectOCIWorkloadRuntimeForContainer(ctx, containerName, in.Runtime)
	if err != nil {
		return nil, err
	}

	if rmOut, rmErr := ociRuntimeCommandContext(ctx, runtimePath, "rm", "--force", containerName).CombinedOutput(); rmErr != nil {
		return nil, fmt.Errorf("remove OCI workload with %s: %s", runtimeName, compactProbeOutput(rmOut, rmErr))
	}

	var scratchRemoved int
	if in.RemoveScratch {
		count, scratchErr := ociWorkloadRemoveScratch(in.WorkloadInstanceID)
		if scratchErr != nil {
			return nil, scratchErr
		}
		scratchRemoved = count
	}

	result := map[string]any{
		"removed":              true,
		"runtime":              runtimeName,
		"container_name":       containerName,
		"workload_instance_id": in.WorkloadInstanceID,
		"remove_scratch":       in.RemoveScratch,
		"scratch_removed":      scratchRemoved,
	}
	return result, nil
}

// selectReachableOCIRuntime walks approvedOCIRuntimeProbes in order and
// returns the name and binary path of the first runtime whose probe reports
// reachable == true together with a non-blank "path" string. When no probe
// qualifies an error is returned.
func selectReachableOCIRuntime(ctx context.Context) (name, path string, err error) {
	for _, probe := range approvedOCIRuntimeProbes {
		status := probeOCIRuntime(ctx, probe)
		if status["reachable"] != true {
			continue
		}
		// A missing or non-string "path" leaves runtimePath empty, which is
		// rejected by the blank check below.
		runtimePath, _ := status["path"].(string)
		if strings.TrimSpace(runtimePath) != "" {
			return probe.Name, runtimePath, nil
		}
	}
	return "", "", fmt.Errorf("no approved OCI runtime is available")
}

// selectOCIWorkloadRuntimeForContainer picks the runtime CLI that should be
// used to operate on containerName. It tries three strategies in order:
//
//  1. If preferredRuntime (trimmed, lower-cased) names an approved runtime
//     whose binary resolves through ociRuntimeLookPath, that runtime is
//     returned without probing the container.
//  2. Otherwise every approved runtime with a resolvable binary is asked to
//     inspect the container, each call capped at five seconds; the first one
//     that succeeds with non-empty output is returned.
//  3. Failing that, the result of selectReachableOCIRuntime is returned.
func selectOCIWorkloadRuntimeForContainer(ctx context.Context, containerName, preferredRuntime string) (name, path string, err error) {
	wanted := strings.TrimSpace(strings.ToLower(preferredRuntime))
	if wanted != "" {
		for _, probe := range approvedOCIRuntimeProbes {
			if probe.Name == wanted {
				if binary, lookErr := ociRuntimeLookPath(probe.Name); lookErr == nil && strings.TrimSpace(binary) != "" {
					return probe.Name, binary, nil
				}
				// The preferred runtime is approved but not installed; fall
				// through to container-based detection.
				break
			}
		}
	}

	for _, probe := range approvedOCIRuntimeProbes {
		binary, lookErr := ociRuntimeLookPath(probe.Name)
		if lookErr == nil && strings.TrimSpace(binary) != "" {
			probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
			out, inspectErr := ociRuntimeCommandContext(probeCtx, binary, "inspect", "--format", "{{.State.Status}}", containerName).CombinedOutput()
			cancel()
			if inspectErr == nil && strings.TrimSpace(string(out)) != "" {
				return probe.Name, binary, nil
			}
		}
	}

	return selectReachableOCIRuntime(ctx)
}

// generatedOCIContainerName derives the container name for a workload: the
// ID is trimmed, lower-cased, stripped of every "-" and prefixed with
// "gpuaas-oci-".
func generatedOCIContainerName(workloadID string) string {
	const prefix = "gpuaas-oci-"
	normalized := strings.ToLower(strings.TrimSpace(workloadID))
	// Splitting on "-" and re-joining with nothing drops every dash.
	return prefix + strings.Join(strings.Split(normalized, "-"), "")
}

// waitOCIWorkloadReady polls the container state roughly once per second
// until the container is ready, has failed, the context is cancelled, or
// ociWorkloadReadyTimeout has elapsed.
//
// Outcomes:
//   - status "running" with health "", "none" or "healthy": the state is
//     returned with a nil error.
//   - status "exited" or "dead", or health "unhealthy": the state is returned
//     together with an error describing it.
//   - deadline reached: if the most recent inspect failed, that error is
//     returned; if the container is running with health "starting" it is
//     accepted as ready; anything else is reported as not ready.
//   - context cancelled while waiting: ctx.Err() is returned.
//
// The returned state is always the last one that was inspected successfully
// (the zero value if none was).
func waitOCIWorkloadReady(ctx context.Context, runtimePath, containerName string) (ociWorkloadContainerState, error) {
	deadline := time.Now().Add(ociWorkloadReadyTimeout)
	var last ociWorkloadContainerState
	var lastErr error
	for {
		state, err := inspectOCIWorkloadState(ctx, runtimePath, containerName)
		if err != nil {
			lastErr = err
		} else {
			// A successful inspect supersedes any earlier transient failure;
			// without this reset a single early error would be reported at the
			// deadline even though later inspections worked.
			last, lastErr = state, nil
			switch {
			case state.Status == "running" && (state.Health == "" || state.Health == "none" || state.Health == "healthy"):
				return state, nil
			case state.Status == "exited" || state.Status == "dead":
				return state, fmt.Errorf("container exited before readiness: status=%s health=%s exit_code=%s", state.Status, state.Health, state.Exit)
			case state.Health == "unhealthy":
				return state, fmt.Errorf("container became unhealthy before readiness: status=%s health=%s exit_code=%s", state.Status, state.Health, state.Exit)
			}
		}
		if time.Now().After(deadline) {
			if lastErr != nil {
				return last, lastErr
			}
			// Still running with its health check in progress: treated as
			// ready rather than failing the caller.
			if last.Status == "running" && last.Health == "starting" {
				return last, nil
			}
			return last, fmt.Errorf("container did not become ready: status=%s health=%s exit_code=%s", last.Status, last.Health, last.Exit)
		}
		select {
		case <-ctx.Done():
			return last, ctx.Err()
		case <-time.After(time.Second):
		}
	}
}

// inspectOCIWorkloadState asks the runtime at runtimePath to inspect
// containerName and decodes the answer into an ociWorkloadContainerState.
//
// The inspect template emits five "|"-separated fields: status, health status
// ("none" when the container has no health section), exit code, runtime error
// text and the OOM-killed flag. Only the error text is free-form, so the first
// three fields are split off from the left and the OOM flag from the right.
// A "|" inside the error message therefore stays part of the message instead
// of making the whole output look malformed. The error text is passed through
// sanitizeOCIWorkloadLogTail before it is stored.
//
// An error is returned when the inspect command fails or when fewer than five
// fields come back.
func inspectOCIWorkloadState(ctx context.Context, runtimePath, containerName string) (ociWorkloadContainerState, error) {
	format := "{{.State.Status}}|{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}|{{.State.ExitCode}}|{{.State.Error}}|{{.State.OOMKilled}}"
	out, err := ociRuntimeCommandContext(ctx, runtimePath, "inspect", "--format", format, containerName).CombinedOutput()
	if err != nil {
		return ociWorkloadContainerState{}, fmt.Errorf("inspect OCI workload state: %s", compactProbeOutput(out, err))
	}
	// fields[3] holds "<error text>|<oom flag>"; the error text may itself
	// contain "|" characters.
	fields := strings.SplitN(strings.TrimSpace(string(out)), "|", 4)
	if len(fields) != 4 {
		return ociWorkloadContainerState{}, fmt.Errorf("inspect OCI workload state returned invalid output")
	}
	oomSep := strings.LastIndex(fields[3], "|")
	if oomSep < 0 {
		return ociWorkloadContainerState{}, fmt.Errorf("inspect OCI workload state returned invalid output")
	}
	return ociWorkloadContainerState{
		Status: strings.TrimSpace(fields[0]),
		Health: strings.TrimSpace(fields[1]),
		Exit:   strings.TrimSpace(fields[2]),
		Error:  sanitizeOCIWorkloadLogTail([]byte(fields[3][:oomSep])),
		OOM:    strings.TrimSpace(fields[3][oomSep+1:]),
	}, nil
}

// ociWorkloadFailureDetail assembles a one-line, "; "-separated diagnostic
// string for a container: its inspected state, the runtime error text and
// OOM flag when present, and the sanitized last 40 log lines. Every piece is
// best-effort: a failing inspect or logs command simply leaves that piece
// out, so the result may be empty.
func ociWorkloadFailureDetail(ctx context.Context, runtimePath, containerName string) string {
	details := make([]string, 0, 4)

	state, stateErr := inspectOCIWorkloadState(ctx, runtimePath, containerName)
	if stateErr == nil {
		details = append(details, fmt.Sprintf("container_state=status=%s health=%s exit_code=%s", state.Status, state.Health, state.Exit))
		if state.Error != "" {
			details = append(details, "container_error="+state.Error)
		}
		switch state.OOM {
		case "", "false":
			// Nothing to report.
		default:
			details = append(details, "container_oom_killed="+state.OOM)
		}
	}

	logOut, logErr := ociRuntimeCommandContext(ctx, runtimePath, "logs", "--tail", "40", containerName).CombinedOutput()
	if logErr == nil {
		if tail := sanitizeOCIWorkloadLogTail(logOut); tail != "" {
			details = append(details, "container_logs="+tail)
		}
	}

	return strings.Join(details, "; ")
}

// sanitizeOCIWorkloadLogTail turns raw container output into a short,
// single-line string that is safe to embed in an error message.
//
// It trims the input, replaces the values matched by ociURLTokenPattern and
// ociServerTokenPattern with "[REDACTED]", collapses every run of whitespace
// (including newlines) into one space, and caps the result at 800 bytes. The
// cap never splits a multi-byte UTF-8 character: when the 800-byte mark falls
// inside one, the whole character is dropped. Blank input yields "".
func sanitizeOCIWorkloadLogTail(out []byte) string {
	const maxBytes = 800

	text := strings.TrimSpace(string(out))
	if text == "" {
		return ""
	}
	text = ociURLTokenPattern.ReplaceAllString(text, `${1}[REDACTED]`)
	text = ociServerTokenPattern.ReplaceAllString(text, `${1}[REDACTED]`)
	text = strings.Join(strings.Fields(text), " ")
	if len(text) > maxBytes {
		cut := maxBytes
		// text[cut] is the first byte that would be dropped. If it is a UTF-8
		// continuation byte (10xxxxxx) the cut is mid-character, so step back
		// to the start of that character. A character has at most three
		// continuation bytes, which bounds the walk even for invalid input.
		for back := 0; back < 3 && cut > 0 && text[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		text = text[:cut]
	}
	return text
}

// lookupOCIWorkloadUserIdentity resolves username to a "uid:gid" string by
// running `id -u` and `id -g` for that user.
//
// The username is passed after a "--" end-of-options marker so that a value
// beginning with "-" is always treated as a user name and never as an option
// to id (for example "-r" would otherwise make id report the calling
// process's own IDs).
//
// It returns an error when either command fails or when the output is not a
// plain decimal number.
func lookupOCIWorkloadUserIdentity(username string) (string, error) {
	uidOut, err := exec.Command("id", "-u", "--", username).Output()
	if err != nil {
		return "", fmt.Errorf("lookup user uid: %w", err)
	}
	gidOut, err := exec.Command("id", "-g", "--", username).Output()
	if err != nil {
		return "", fmt.Errorf("lookup user gid: %w", err)
	}
	uid := strings.TrimSpace(string(uidOut))
	gid := strings.TrimSpace(string(gidOut))
	if !ociNumericIDPattern.MatchString(uid) || !ociNumericIDPattern.MatchString(gid) {
		return "", fmt.Errorf("lookup user identity returned invalid uid or gid")
	}
	return uid + ":" + gid, nil
}

// prepareOCIWorkloadMounts makes sure every mount source exists as a
// directory (created with mode 0700 if missing) and is owned by the uid:gid
// encoded in userIdentity. A mount whose target is "/workspace" additionally
// gets its Jupyter sub-directories created and chowned. Processing stops at
// the first failure.
func prepareOCIWorkloadMounts(mounts []ociWorkloadMount, userIdentity string) error {
	uid, gid, err := parseOCIWorkloadNumericUserIdentity(userIdentity)
	if err != nil {
		return err
	}
	for i := range mounts {
		source := mounts[i].Source
		isWorkspace := mounts[i].Target == "/workspace"

		if mkErr := os.MkdirAll(source, 0o700); mkErr != nil {
			return fmt.Errorf("prepare OCI workload mount: %w", mkErr)
		}
		if isWorkspace {
			if dirErr := prepareOCIWorkloadWorkspaceDirs(source); dirErr != nil {
				return dirErr
			}
		}

		// Ownership is applied only after all directories exist.
		if chErr := os.Chown(source, uid, gid); chErr != nil {
			return fmt.Errorf("chown OCI workload mount: %w", chErr)
		}
		if isWorkspace {
			if dirErr := chownOCIWorkloadWorkspaceDirs(source, uid, gid); dirErr != nil {
				return dirErr
			}
		}
	}
	return nil
}

// prepareOCIWorkloadWorkspaceDirs creates the Jupyter directory tree
// (.jupyter/runtime, config, data, lab/user-settings, lab/workspaces) below
// source. Missing directories are created with mode 0700; existing ones are
// left alone. The first failure aborts and is returned wrapped.
func prepareOCIWorkloadWorkspaceDirs(source string) error {
	subdirs := [...]string{
		".jupyter/runtime",
		".jupyter/config",
		".jupyter/data",
		".jupyter/lab/user-settings",
		".jupyter/lab/workspaces",
	}
	for i := range subdirs {
		dir := filepath.Join(source, subdirs[i])
		if err := os.MkdirAll(dir, 0o700); err != nil {
			return fmt.Errorf("prepare OCI workload workspace dir: %w", err)
		}
	}
	return nil
}

// chownOCIWorkloadWorkspaceDirs hands ownership of the Jupyter directory tree
// below source to uid:gid. Every level is listed explicitly, including the
// intermediate .jupyter and .jupyter/lab directories. The first chown that
// fails aborts the walk and is returned wrapped.
func chownOCIWorkloadWorkspaceDirs(source string, uid, gid int) error {
	subdirs := [...]string{
		".jupyter",
		".jupyter/runtime",
		".jupyter/config",
		".jupyter/data",
		".jupyter/lab",
		".jupyter/lab/user-settings",
		".jupyter/lab/workspaces",
	}
	for i := range subdirs {
		dir := filepath.Join(source, subdirs[i])
		if err := os.Chown(dir, uid, gid); err != nil {
			return fmt.Errorf("chown OCI workload workspace dir: %w", err)
		}
	}
	return nil
}

// prepareOCIWorkloadIdentityFiles writes minimal passwd and group files for a
// workload and returns them as read-only mounts targeting /etc/passwd and
// /etc/group.
//
// The files live in <home root>/<username>/.gpuaas/workloads/<workloadID>/identity
// (directory mode 0700, files mode 0644) and contain fixed root, jovyan and
// nobody/nogroup entries plus one entry for username with the uid:gid parsed
// from userIdentity.
//
// username is trimmed before it is validated, and the trimmed value is the
// one used for the path and the file contents, so surrounding whitespace or
// a trailing newline cannot leak into the passwd/group lines.
//
// An error is returned for an invalid username, a workloadID that is not a
// UUID, a malformed userIdentity, an unusable home root, or any file system
// failure.
func prepareOCIWorkloadIdentityFiles(username, workloadID, userIdentity string) ([]ociWorkloadMount, error) {
	username = strings.TrimSpace(username)
	if !usernameOnNodePattern.MatchString(username) {
		return nil, fmt.Errorf("invalid OCI workload username")
	}
	if _, err := uuid.Parse(workloadID); err != nil {
		return nil, fmt.Errorf("invalid OCI workload identity id")
	}
	uid, gid, err := parseOCIWorkloadNumericUserIdentity(userIdentity)
	if err != nil {
		return nil, err
	}
	// Refuse empty, relative or root-level home roots so files are never
	// written directly under "/" or the working directory.
	homeRoot := filepath.Clean(strings.TrimSpace(ociWorkloadHomeRoot))
	if homeRoot == "" || homeRoot == "." || homeRoot == string(filepath.Separator) || !filepath.IsAbs(homeRoot) {
		return nil, fmt.Errorf("invalid OCI workload home root")
	}
	identityDir := filepath.Join(homeRoot, username, ".gpuaas", "workloads", workloadID, "identity")
	if err := os.MkdirAll(identityDir, 0o700); err != nil {
		return nil, fmt.Errorf("prepare OCI workload identity dir: %w", err)
	}
	passwdPath := filepath.Join(identityDir, "passwd")
	groupPath := filepath.Join(identityDir, "group")
	passwd := fmt.Sprintf(
		"root:x:0:0:root:/root:/bin/bash\njovyan:x:1000:100:Jupyter User:/home/jovyan:/bin/bash\n%s:x:%d:%d:GPUaaS allocation user:/workspace:/bin/bash\nnobody:x:65534:65534:nobody:/nonexistent:/usr/sbin/nologin\n",
		username,
		uid,
		gid,
	)
	group := fmt.Sprintf("root:x:0:\nusers:x:100:\n%s:x:%d:%s\nnogroup:x:65534:\n", username, gid, username)
	// NOTE(review): os.WriteFile follows symlinks. If the directories below
	// the user's home can be modified by that user, confirm a pre-planted
	// symlink at these paths cannot redirect the write.
	if err := os.WriteFile(passwdPath, []byte(passwd), 0o644); err != nil {
		return nil, fmt.Errorf("write OCI workload passwd: %w", err)
	}
	if err := os.WriteFile(groupPath, []byte(group), 0o644); err != nil {
		return nil, fmt.Errorf("write OCI workload group: %w", err)
	}
	return []ociWorkloadMount{
		{Source: passwdPath, Target: "/etc/passwd", ReadOnly: true},
		{Source: groupPath, Target: "/etc/group", ReadOnly: true},
	}, nil
}

// parseOCIWorkloadNumericUserIdentity splits a "uid:gid" string (surrounding
// whitespace ignored) into its two integers. The input must consist of
// exactly two parts that each match ociNumericIDPattern; anything else, or a
// number that does not fit an int, yields an error and zero values.
func parseOCIWorkloadNumericUserIdentity(userIdentity string) (int, int, error) {
	uidText, gidText, found := strings.Cut(strings.TrimSpace(userIdentity), ":")
	// A second ":" left in gidText means the input had more than two parts.
	wellFormed := found &&
		!strings.Contains(gidText, ":") &&
		ociNumericIDPattern.MatchString(uidText) &&
		ociNumericIDPattern.MatchString(gidText)
	if !wellFormed {
		return 0, 0, fmt.Errorf("invalid OCI workload user identity")
	}

	uid, uidErr := strconv.Atoi(uidText)
	if uidErr != nil {
		return 0, 0, fmt.Errorf("invalid OCI workload uid: %w", uidErr)
	}
	gid, gidErr := strconv.Atoi(gidText)
	if gidErr != nil {
		return 0, 0, fmt.Errorf("invalid OCI workload gid: %w", gidErr)
	}
	return uid, gid, nil
}

// removeOCIWorkloadScratch deletes the scratch directory of a workload from
// every user home below ociWorkloadHomeRoot, i.e. every path matching
// <home root>/*/.gpuaas/workloads/<workloadID>, and returns how many were
// removed.
//
// workloadID must parse as a UUID and the home root must be an absolute path
// other than the file system root. Before each removal the match is
// re-checked: its last element must equal workloadID and it must lie below
// the home root. On any failure the count removed so far is returned together
// with the error.
func removeOCIWorkloadScratch(workloadID string) (int, error) {
	if _, parseErr := uuid.Parse(workloadID); parseErr != nil {
		return 0, fmt.Errorf("invalid OCI workload scratch id")
	}

	homeRoot := filepath.Clean(strings.TrimSpace(ociWorkloadHomeRoot))
	switch {
	case homeRoot == "", homeRoot == ".", homeRoot == string(filepath.Separator), !filepath.IsAbs(homeRoot):
		return 0, fmt.Errorf("invalid OCI workload home root")
	}

	pattern := filepath.Join(homeRoot, "*", ".gpuaas", "workloads", workloadID)
	matches, globErr := filepath.Glob(pattern)
	if globErr != nil {
		return 0, fmt.Errorf("find OCI workload scratch: %w", globErr)
	}

	rootPrefix := homeRoot + string(filepath.Separator)
	removed := 0
	for _, match := range matches {
		target := filepath.Clean(match)
		// Defensive re-validation of what the glob returned before anything
		// is deleted.
		switch {
		case filepath.Base(target) != workloadID:
			return removed, fmt.Errorf("refusing to remove non-workload scratch path")
		case !strings.HasPrefix(target, rootPrefix):
			return removed, fmt.Errorf("refusing to remove scratch path outside home root")
		}
		if rmErr := os.RemoveAll(target); rmErr != nil {
			return removed, fmt.Errorf("remove OCI workload scratch: %w", rmErr)
		}
		removed++
	}
	return removed, nil
}
