package main

import (
	"context"
	"fmt"
	"math"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/google/uuid"
	"golang.org/x/sys/execabs"
)

// taskHandler is the signature shared by every entry in taskCatalog: it
// receives the dispatch context and the task, and returns the task's output
// payload or an error.
type taskHandler func(context.Context, nodeTask) (map[string]any, error)

// Task type identifiers used as keys in taskCatalog for the workload (OCI and
// compose), slice, node and storage handlers.
const (
	taskTypeWorkloadOCIRuntimeStatus = "workload.oci_runtime_status"
	taskTypeWorkloadOCILaunch        = "workload.oci_launch"
	taskTypeWorkloadOCIControl       = "workload.oci_control"
	taskTypeWorkloadOCIRemove        = "workload.oci_remove"
	taskTypeWorkloadComposeLaunch    = "workload.compose_launch"
	taskTypeWorkloadComposeControl   = "workload.compose_control"
	taskTypeWorkloadComposeRemove    = "workload.compose_remove"
	taskTypeSliceTopologyDiscover    = "slice.topology_discover"
	taskTypeSliceVMProvision         = "slice.vm_provision"
	taskTypeSliceVMRelease           = "slice.vm_release"
	taskTypeSliceVMStart             = "slice.vm_start"
	taskTypeSliceVMRestart           = "slice.vm_restart"
	taskTypeNodeGPUScrub             = "node.gpu_scrub"
	taskTypeNodeStorageCleanup       = "node.storage_cleanup"
	taskTypeNodeValidateClean        = "node.validate_clean"
	taskTypeStorageMountAttach       = "storage.mount_attach"
	taskTypeStorageMountDetach       = "storage.mount_detach"
	taskTypeStorageMountProbe        = "storage.mount_probe"
)

// taskCatalog maps a task type string to the handler dispatchTask runs for it.
// A task whose (whitespace-trimmed) type is not a key of this map is rejected
// by dispatchTask with the error "unknown_task_type".
var taskCatalog = map[string]taskHandler{
	"allocation.provision_user":                   handleProvisionUserTask,
	"allocation.install_authorized_keys":          handleInstallAuthorizedKeysTask,
	"allocation.reconcile_managed_authorized_key": handleReconcileManagedAuthorizedKeyTask,
	"allocation.revoke_user":                      handleRevokeUserTask,
	"runtime.write_env_file":                      handleRuntimeWriteEnvFileTask,
	"runtime.install_service_unit":                handleRuntimeInstallServiceUnitTask,
	"runtime.service_control":                     handleRuntimeServiceControlTask,
	"terminal.open":                               handleTerminalOpenTask,
	"terminal.close":                              handleTerminalCloseTask,
	"node.drain":                                  handleNodeDrainTask,
	"node.reboot":                                 handleNodeRebootTask,
	"node.uninstall":                              handleNodeUninstallTask,
	"node.self_update":                            handleNodeSelfUpdateTask,
	"node.heartbeat_check":                        handleHeartbeatCheckTask,
	taskTypeWorkloadOCIRuntimeStatus:              handleOCIRuntimeStatusTask,
	taskTypeWorkloadOCILaunch:                     handleOCIWorkloadLaunchTask,
	taskTypeWorkloadOCIControl:                    handleOCIWorkloadControlTask,
	taskTypeWorkloadOCIRemove:                     handleOCIWorkloadRemoveTask,
	taskTypeWorkloadComposeLaunch:                 handleComposeWorkloadLaunchTask,
	taskTypeWorkloadComposeControl:                handleComposeWorkloadControlTask,
	taskTypeWorkloadComposeRemove:                 handleComposeWorkloadRemoveTask,
	taskTypeSliceTopologyDiscover:                 handleSliceTopologyDiscoverTask,
	taskTypeSliceVMProvision:                      handleSliceVMProvisionTask,
	taskTypeSliceVMRelease:                        handleSliceVMReleaseTask,
	taskTypeSliceVMStart:                          handleSliceVMStartTask,
	taskTypeSliceVMRestart:                        handleSliceVMRestartTask,
	taskTypeNodeGPUScrub:                          handleNodeGPUScrubTask,
	taskTypeNodeStorageCleanup:                    handleNodeStorageCleanupTask,
	taskTypeNodeValidateClean:                     handleNodeValidateCleanTask,
	taskTypeStorageMountAttach:                    handleStorageMountAttachTask,
	taskTypeStorageMountDetach:                    handleStorageMountDetachTask,
	taskTypeStorageMountProbe:                     handleStorageMountProbeTask,
}

// usernameOnNodePattern accepts a lowercase letter or underscore followed by
// at most 31 characters from [a-z0-9_-].
var usernameOnNodePattern = regexp.MustCompile(`^[a-z_][a-z0-9_-]{0,31}$`)

// sha256HexPattern accepts exactly 64 lowercase hexadecimal characters.
var sha256HexPattern = regexp.MustCompile(`^[a-f0-9]{64}$`)

// hexTokenPattern accepts exactly 64 lowercase hexadecimal characters; the
// expression is identical to sha256HexPattern. It validates the optional
// enrollment_token in parseNodeSelfUpdateInput.
var hexTokenPattern = regexp.MustCompile(`^[a-f0-9]{64}$`)

// debianPackageNamePattern accepts 1 to 128 characters: an ASCII letter or
// digit followed by letters, digits, '+', '.', '_' or '-'.
var debianPackageNamePattern = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9+._-]{0,127}$`)

// provisionUserInput is the parsed parameter set handed to
// provisionUserOpsRunner by handleProvisionUserTask. Username, UID, GID and
// the number of SSHPublicKeys are echoed back in that handler's output.
// NOTE(review): Supplemental presumably holds supplemental group IDs — confirm
// against parseProvisionUserInput, which is not visible here.
type provisionUserInput struct {
	Username      string
	UID           int
	GID           int
	Supplemental  []int
	SSHPublicKeys []string
}

// installAuthorizedKeysInput is the parsed parameter set that
// handleInstallAuthorizedKeysTask passes to installAuthorizedKeysRunner.
type installAuthorizedKeysInput struct {
	Username      string
	SSHPublicKeys []string
}

// reconcileManagedAuthorizedKeyInput is the parsed parameter set that
// handleReconcileManagedAuthorizedKeyTask passes to
// reconcileManagedAuthorizedKeyRunner.
// NOTE(review): PublicKey is a pointer, presumably because it is optional for
// some DesiredState values — confirm against the parser.
type reconcileManagedAuthorizedKeyInput struct {
	Username     string
	ManagedKeyID string
	DesiredState string
	PublicKey    *string
}

// revokeUserInput is the validated input of the "allocation.revoke_user" task.
// parseRevokeUserInput fills Username from the "username_on_node" parameter
// (checked against usernameOnNodePattern) and Reason from "reason", defaulting
// to "allocation_released".
type revokeUserInput struct {
	Username string
	Reason   string
}

// nodeUninstallInput is the validated input of the "node.uninstall" task.
// parseNodeUninstallInput requires every path field to be a cleaned absolute
// path other than "/", and SystemdUnitName to match systemdUnitNamePattern.
// runNodeUninstall removes InstallRoot recursively and deletes the remaining
// paths as individual files.
type nodeUninstallInput struct {
	InstallRoot          string
	EnvFilePath          string
	SystemdUnitName      string
	CertPath             string
	KeyPath              string
	CABundlePath         string
	NodeCertCABundlePath string
	Reason               string
}

// nodeSelfUpdateInput is the validated input of the "node.self_update" task as
// produced by parseNodeSelfUpdateInput. EnrollmentToken, CABundlePath and
// RegistryEnvFile may be empty; the other string fields are always populated,
// either from the task parameters or from the parser's defaults.
type nodeSelfUpdateInput struct {
	PackageURL         string
	PackageSHA256      string
	ExpectedVersion    string
	EnrollmentToken    string
	TaskSigningPubKeys string
	InstallRoot        string
	EnvFilePath        string
	SystemdUnitName    string
	CABundlePath       string
	RegistryEnvFile    string
	InstallContainers  bool
	ContainerPackage   string
	Reason             string
}

// Indirection points for the side-effecting operations behind the handlers.
// The handlers in this file call these variables instead of the underlying
// functions directly.
// NOTE(review): presumably these exist so tests can substitute fakes — confirm.
var provisionUserOpsRunner = runProvisionUserOps
var installAuthorizedKeysRunner = runInstallAuthorizedKeysOps
var reconcileManagedAuthorizedKeyRunner = runReconcileManagedAuthorizedKeyOps
var revokeUserOpsRunner = runRevokeUserOps
var nodeRebootRunner = runNodeReboot
var nodeUninstallRunner = runNodeUninstall
var nodeSelfUpdateRunner = runNodeSelfUpdate
var nodeUserOpsCommandRunner = nodeUserOpsCommand
var lookupProvisionedUserOnNode = userExistsOnNode
var provisionedHomePathExists = pathExists
var ociRuntimeStatusRunner = runOCIRuntimeStatus
var ociRuntimeLookPath = exec.LookPath
var ociRuntimeCommandContext = exec.CommandContext

// dispatchTask runs the handler registered in taskCatalog for the task's
// trimmed type and converts the outcome into a nodeTaskResult:
//
//   - no handler registered: Status "rejected", Error "unknown_task_type";
//   - handler error: Status "failed" with the error text, plus the error's own
//     output() payload when the error value provides one, otherwise an empty map;
//   - otherwise: Status "success" carrying the handler's output.
func dispatchTask(ctx context.Context, task nodeTask) nodeTaskResult {
	// Errors that can carry a structured payload alongside their message.
	type outputCarrier interface {
		error
		output() map[string]any
	}

	run, registered := taskCatalog[strings.TrimSpace(task.TaskType)]
	if !registered {
		return nodeTaskResult{
			Status: "rejected",
			Error:  "unknown_task_type",
			Output: map[string]any{},
		}
	}

	payload, runErr := run(ctx, task)
	switch failure := runErr.(type) {
	case nil:
		return nodeTaskResult{
			Status: "success",
			Output: payload,
		}
	case outputCarrier:
		return nodeTaskResult{
			Status: "failed",
			Error:  failure.Error(),
			Output: failure.output(),
		}
	default:
		return nodeTaskResult{
			Status: "failed",
			Error:  runErr.Error(),
			Output: map[string]any{},
		}
	}
}

// handleOCIRuntimeStatusTask reports OCI runtime availability by delegating to
// ociRuntimeStatusRunner. The task itself carries no inputs that are read here.
func handleOCIRuntimeStatusTask(ctx context.Context, _ nodeTask) (map[string]any, error) {
	report, err := ociRuntimeStatusRunner(ctx)
	return report, err
}

// ociRuntimeProbe describes one container runtime to look for: Name is the
// binary resolved through ociRuntimeLookPath, and Args are the arguments
// probeOCIRuntime passes to it to check that it responds.
type ociRuntimeProbe struct {
	Name string
	Args []string
}

// approvedOCIRuntimeProbes lists the runtimes runOCIRuntimeStatus probes, in
// order. The first one whose probe command succeeds becomes the selected
// runtime, so the order of this slice is a preference order.
var approvedOCIRuntimeProbes = []ociRuntimeProbe{
	{Name: "docker", Args: []string{"ps", "--format", "{{.ID}}"}},
	{Name: "podman", Args: []string{"version"}},
	{Name: "nerdctl", Args: []string{"version"}},
}

// runOCIRuntimeStatus probes every entry of approvedOCIRuntimeProbes and
// summarises the results. The returned map contains:
//
//   - "available": whether any runtime was reachable;
//   - "selected_runtime": name of the first reachable runtime, or "";
//   - "approved_runtime_names": names of all probed runtimes, in probe order;
//   - "runtimes": the per-runtime status maps from probeOCIRuntime.
//
// The error result is always nil.
func runOCIRuntimeStatus(ctx context.Context) (map[string]any, error) {
	total := len(approvedOCIRuntimeProbes)
	names := make([]string, total)
	reports := make([]map[string]any, total)

	var chosen string
	for i := range approvedOCIRuntimeProbes {
		candidate := approvedOCIRuntimeProbes[i]
		names[i] = candidate.Name
		reports[i] = probeOCIRuntime(ctx, candidate)

		// Every runtime is still probed after one has been chosen so the
		// report stays complete; only the first reachable one is selected.
		if chosen != "" {
			continue
		}
		if reports[i]["reachable"] == true {
			chosen, _ = reports[i]["name"].(string)
		}
	}

	summary := map[string]any{
		"available":              chosen != "",
		"selected_runtime":       chosen,
		"approved_runtime_names": names,
		"runtimes":               reports,
	}
	return summary, nil
}

// probeOCIRuntime checks whether a single runtime binary is installed and
// responds. The returned map always has "name", "binary", "installed" and
// "reachable". When the binary is found, "path" is added. A failed probe
// command adds "probe_error"; a successful one sets "reachable" to true and
// adds "version_output" (the first non-empty line of the command's output).
// The probe command is limited to five seconds.
func probeOCIRuntime(ctx context.Context, probe ociRuntimeProbe) map[string]any {
	report := map[string]any{
		"name":      probe.Name,
		"binary":    probe.Name,
		"installed": false,
		"reachable": false,
	}

	binaryPath, lookErr := ociRuntimeLookPath(probe.Name)
	if lookErr != nil {
		return report
	}
	report["installed"] = true
	report["path"] = binaryPath

	boundedCtx, stop := context.WithTimeout(ctx, 5*time.Second)
	defer stop()

	output, runErr := ociRuntimeCommandContext(boundedCtx, binaryPath, probe.Args...).CombinedOutput()
	if runErr == nil {
		report["reachable"] = true
		report["version_output"] = compactProbeOutput(output, nil)
		return report
	}
	report["probe_error"] = compactProbeOutput(output, runErr)
	return report
}

// compactProbeOutput reduces command output to a single sanitized, truncated
// line. When out is blank and err is non-nil, the error text is used instead.
//
// For a successful command (err == nil) the first non-empty line is returned.
// For a failed command the lines are scanned in up to three passes, returning
// the first hit: a line that looks like an actionable error, then a line that
// is not mere progress chatter, then any non-empty line.
func compactProbeOutput(out []byte, err error) string {
	body := strings.TrimSpace(string(out))
	if body == "" && err != nil {
		body = err.Error()
	}
	if body == "" {
		return ""
	}
	rows := strings.Split(body, "\n")

	// firstAccepted returns the first trimmed, non-empty row the filter accepts.
	firstAccepted := func(accept func(string) bool) (string, bool) {
		for _, row := range rows {
			candidate := strings.TrimSpace(row)
			if candidate != "" && accept(candidate) {
				return candidate, true
			}
		}
		return "", false
	}
	anyLine := func(string) bool { return true }

	passes := []func(string) bool{anyLine}
	if err != nil {
		passes = []func(string) bool{
			probeOutputLineLooksActionableError,
			func(s string) bool { return !probeOutputLineLooksNonActionableProgress(s) },
			anyLine,
		}
	}
	for _, accept := range passes {
		if picked, found := firstAccepted(accept); found {
			return truncateProbeLine(picked)
		}
	}
	return ""
}

// probeOutputLineLooksActionableError reports whether line, compared
// case-insensitively, contains one of a fixed set of failure keywords.
// Blank lines and hint lines starting with "run '" never qualify.
func probeOutputLineLooksActionableError(line string) bool {
	normalized := strings.ToLower(strings.TrimSpace(line))
	switch {
	case normalized == "":
		return false
	case strings.HasPrefix(normalized, "run '"):
		return false
	}

	keywords := [...]string{
		"error", "failed", "failure",
		"denied", "unauthorized", "forbidden",
		"not found", "no such",
		"timeout", "timed out",
		"refused", "unavailable", "invalid",
		"cannot", "could not", "unable",
		"exited",
	}
	for i := range keywords {
		if strings.Contains(normalized, keywords[i]) {
			return true
		}
	}
	return false
}

// probeOutputLineLooksNonActionableProgress reports whether line is noise
// rather than a diagnosis: a blank line, a hint starting with "run '", or a
// line of at least two words whose last word is a progress state such as
// "pulling" or "started". The comparison is case-insensitive.
func probeOutputLineLooksNonActionableProgress(line string) bool {
	normalized := strings.ToLower(strings.TrimSpace(line))
	if normalized == "" || strings.HasPrefix(normalized, "run '") {
		return true
	}

	words := strings.Fields(normalized)
	if len(words) < 2 {
		// A lone word is not treated as a "<subject> <state>" progress row.
		return false
	}
	last := words[len(words)-1]

	progressStates := [...]string{
		"pulling", "pulled", "extracting", "downloading",
		"creating", "created", "starting", "started", "waiting",
	}
	for _, state := range progressStates {
		if last == state {
			return true
		}
	}
	return false
}

// Limits applied by truncateProbeLine. A sanitized line longer than
// maxProbeOutputLineBytes is shortened to its first probeOutputLineHead bytes
// and last probeOutputLineTail bytes, joined by " ... ".
const (
	maxProbeOutputLineBytes = 800
	probeOutputLineHead     = 360
	probeOutputLineTail     = 360
)

// truncateProbeLine sanitizes line and, when the result exceeds
// maxProbeOutputLineBytes, keeps roughly the first probeOutputLineHead and the
// last probeOutputLineTail bytes joined by " ... ".
//
// Both cut points are moved onto UTF-8 rune boundaries so a multi-byte
// character straddling a cut is dropped whole instead of being split into
// invalid bytes. For ASCII input the result is exactly head + " ... " + tail.
func truncateProbeLine(line string) string {
	line = sanitizeProbeOutputLine(line)
	if len(line) <= maxProbeOutputLineBytes {
		return line
	}

	// UTF-8 continuation bytes have the bit pattern 10xxxxxx.
	isContinuation := func(b byte) bool { return b&0xC0 == 0x80 }

	// headEnd is the index of the first byte left out of the head. If that
	// byte continues a rune, the head would end mid-character: back up to the
	// start of that rune.
	headEnd := probeOutputLineHead
	for headEnd > 0 && isContinuation(line[headEnd]) {
		headEnd--
	}

	// tailStart is the index of the first byte kept in the tail. If it is a
	// continuation byte, skip forward to the next rune start.
	tailStart := len(line) - probeOutputLineTail
	for tailStart < len(line) && isContinuation(line[tailStart]) {
		tailStart++
	}

	return line[:headEnd] + " ... " + line[tailStart:]
}

// sanitizeProbeOutputLine prepares one output line for reporting: it trims
// the line, replaces whatever ociURLTokenPattern and ociServerTokenPattern
// match after their first capture group with "[REDACTED]", and collapses every
// run of whitespace into a single space. A blank line yields "".
func sanitizeProbeOutputLine(line string) string {
	trimmed := strings.TrimSpace(line)
	if len(trimmed) == 0 {
		return ""
	}

	const masked = `${1}[REDACTED]`
	redacted := ociURLTokenPattern.ReplaceAllString(trimmed, masked)
	redacted = ociServerTokenPattern.ReplaceAllString(redacted, masked)

	words := strings.Fields(redacted)
	return strings.Join(words, " ")
}

// handleProvisionUserTask parses the task parameters, applies the provisioning
// operations through provisionUserOpsRunner, and then verifies the result: the
// user must be resolvable on the node and /home/<username> must exist.
// On success it returns a summary with the username, uid, gid and the number
// of SSH keys supplied; any parse, apply or verification failure is returned
// as an error.
func handleProvisionUserTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseProvisionUserInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	if applyErr := provisionUserOpsRunner(in); applyErr != nil {
		return nil, applyErr
	}

	// Post-apply verification: a successful run is not trusted on its own.
	homeDir := filepath.Join("/home", in.Username)
	switch {
	case !lookupProvisionedUserOnNode(in.Username):
		return nil, fmt.Errorf("provisioned user not resolvable after apply: %s", in.Username)
	case !provisionedHomePathExists(homeDir):
		return nil, fmt.Errorf("provisioned user home missing after apply: %s", homeDir)
	}

	summary := map[string]any{
		"applied":       true,
		"verified":      true,
		"username":      in.Username,
		"uid":           in.UID,
		"gid":           in.GID,
		"ssh_key_count": len(in.SSHPublicKeys),
	}
	return summary, nil
}

// handleInstallAuthorizedKeysTask parses the task parameters and hands them to
// installAuthorizedKeysRunner, returning the runner's output. When either step
// fails, the error is returned with a nil output.
func handleInstallAuthorizedKeysTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseInstallAuthorizedKeysInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	result, runErr := installAuthorizedKeysRunner(in)
	if runErr != nil {
		// Never surface a partial payload together with an error.
		result = nil
	}
	return result, runErr
}

// handleReconcileManagedAuthorizedKeyTask parses the task parameters and hands
// them to reconcileManagedAuthorizedKeyRunner, returning the runner's output.
// When either step fails, the error is returned with a nil output.
func handleReconcileManagedAuthorizedKeyTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseReconcileManagedAuthorizedKeyInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	result, runErr := reconcileManagedAuthorizedKeyRunner(in)
	if runErr != nil {
		// Never surface a partial payload together with an error.
		result = nil
	}
	return result, runErr
}

// handleRevokeUserTask validates the task parameters with parseRevokeUserInput
// and hands them to revokeUserOpsRunner, returning the runner's output. When
// either step fails, the error is returned with a nil output.
func handleRevokeUserTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseRevokeUserInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	result, runErr := revokeUserOpsRunner(in)
	if runErr != nil {
		// Never surface a partial payload together with an error.
		result = nil
	}
	return result, runErr
}

// parseRevokeUserInput extracts and validates the "allocation.revoke_user"
// parameters. "username_on_node" is required and must match
// usernameOnNodePattern after trimming. "reason" is optional and defaults to
// "allocation_released" when absent, blank, or not a string.
func parseRevokeUserInput(params map[string]any) (revokeUserInput, error) {
	var parsed revokeUserInput

	rawName, _ := params["username_on_node"].(string)
	parsed.Username = strings.TrimSpace(rawName)
	switch {
	case parsed.Username == "":
		return revokeUserInput{}, fmt.Errorf("missing username_on_node")
	case !usernameOnNodePattern.MatchString(parsed.Username):
		return revokeUserInput{}, fmt.Errorf("invalid username_on_node")
	}

	rawReason, _ := params["reason"].(string)
	if parsed.Reason = strings.TrimSpace(rawReason); parsed.Reason == "" {
		parsed.Reason = "allocation_released"
	}
	return parsed, nil
}

// runRevokeUserOps removes a provisioned user from the node by running a bash
// script through nodeUserOpsCommandRunner. The script deletes the account with
// `userdel -r`, removes the user's private primary group and any
// "gpuaas-sup-*" supplemental groups once they have no members left, deletes
// the user's sudoers file, and finally removes /home/<username> if it still
// exists.
//
// Most steps inside the script are best-effort (`|| true`), so a non-nil error
// is returned only when the script itself exits non-zero; the error message is
// the script's combined output, or the exec error text when there was none.
// The returned map records whether the user and home directory existed before
// the run and whether they are absent afterwards, so callers can tell a no-op
// from an actual removal.
func runRevokeUserOps(in revokeUserInput) (map[string]any, error) {
	// Snapshot the pre-state before anything is modified.
	userExistedBefore := userExistsOnNode(in.Username)
	homePath := fmt.Sprintf("/home/%s", in.Username)
	homeExistedBefore := pathExists(homePath)
	sudoersPath := allocationUserSudoersPath(in.Username)

	// The script re-validates the username itself, mirroring
	// usernameOnNodePattern, before touching any account state.
	script := fmt.Sprintf(`
set -euo pipefail
username=%q

if ! [[ "$username" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]]; then
  echo "invalid username_on_node" >&2
  exit 1
fi

if id "$username" >/dev/null 2>&1; then
  primary_group="$(id -gn "$username" || true)"
  all_groups="$(id -nG "$username" || true)"
  userdel -r "$username" || true

  # Remove private primary group only when it matches username and is empty.
  if [ -n "$primary_group" ] && [ "$primary_group" = "$username" ] && getent group "$primary_group" >/dev/null 2>&1; then
    members="$(getent group "$primary_group" | awk -F: '{print $4}')"
    if [ -z "$members" ]; then
      groupdel "$primary_group" >/dev/null 2>&1 || true
    fi
  fi

  # Remove managed supplemental groups when no members remain.
  for g in $all_groups; do
    case "$g" in
      gpuaas-sup-*)
        if getent group "$g" >/dev/null 2>&1; then
          members="$(getent group "$g" | awk -F: '{print $4}')"
          if [ -z "$members" ]; then
            groupdel "$g" >/dev/null 2>&1 || true
          fi
        fi
        ;;
    esac
  done
fi

rm -f %q

# Defensive cleanup in case userdel -r skipped home due to prior user state.
if [ -d %q ]; then
  rm -rf %q
fi
`, in.Username, sudoersPath, homePath, homePath)

	cmd := nodeUserOpsCommandRunner(script)
	out, err := cmd.CombinedOutput()
	if err != nil {
		// Prefer the script's own output; fall back to the exec error text.
		msg := strings.TrimSpace(string(out))
		if msg == "" {
			msg = err.Error()
		}
		return nil, fmt.Errorf("apply revoke user operations: %s", msg)
	}

	// Re-check the node so the result reflects what is actually present now.
	userAbsentAfter := !userExistsOnNode(in.Username)
	homeAbsentAfter := !pathExists(homePath)
	return map[string]any{
		"revoked":             true,
		"idempotent":          true,
		"username":            in.Username,
		"reason":              in.Reason,
		"user_existed_before": userExistedBefore,
		"user_absent_after":   userAbsentAfter,
		"home_existed_before": homeExistedBefore,
		"home_absent_after":   homeAbsentAfter,
	}, nil
}

// userExistsOnNode reports whether `id <username>` exits successfully.
// A blank username is reported as not existing without running the command.
func userExistsOnNode(username string) bool {
	if len(strings.TrimSpace(username)) == 0 {
		return false
	}
	lookupErr := exec.Command("id", username).Run()
	return lookupErr == nil
}

// pathExists reports whether os.Stat succeeds for path. A blank path, and any
// stat failure (not only "does not exist"), yields false.
func pathExists(path string) bool {
	if strings.TrimSpace(path) != "" {
		if _, statErr := os.Stat(path); statErr == nil {
			return true
		}
	}
	return false
}

// handleHeartbeatCheckTask answers a heartbeat probe. It reads nothing from
// the context or task and always succeeds with {"ok": true}.
func handleHeartbeatCheckTask(_ context.Context, _ nodeTask) (map[string]any, error) {
	reply := map[string]any{"ok": true}
	return reply, nil
}

// handleNodeDrainTask acknowledges a drain request. It echoes the trimmed
// "node_id" and "reason" parameters (empty when absent or not strings) and
// reports "drained": true. This handler performs no other work itself and
// never returns an error.
func handleNodeDrainTask(_ context.Context, task nodeTask) (map[string]any, error) {
	trimmedParam := func(key string) string {
		raw, _ := task.Params[key].(string)
		return strings.TrimSpace(raw)
	}

	ack := map[string]any{
		"drained": true,
		"node_id": trimmedParam("node_id"),
		"reason":  trimmedParam("reason"),
	}
	return ack, nil
}

// handleNodeRebootTask passes the trimmed "reason" parameter (empty when
// absent or not a string) to nodeRebootRunner and returns its output. On
// failure the error is returned with a nil output.
func handleNodeRebootTask(_ context.Context, task nodeTask) (map[string]any, error) {
	rawReason, _ := task.Params["reason"].(string)
	result, runErr := nodeRebootRunner(strings.TrimSpace(rawReason))
	if runErr != nil {
		// Never surface a partial payload together with an error.
		result = nil
	}
	return result, runErr
}

// handleNodeUninstallTask validates the task parameters with
// parseNodeUninstallInput and hands them to nodeUninstallRunner, returning the
// runner's output. When either step fails, the error is returned with a nil
// output.
func handleNodeUninstallTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseNodeUninstallInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	result, runErr := nodeUninstallRunner(in)
	if runErr != nil {
		// Never surface a partial payload together with an error.
		result = nil
	}
	return result, runErr
}

// handleNodeSelfUpdateTask validates the task parameters with
// parseNodeSelfUpdateInput and hands them to nodeSelfUpdateRunner, returning
// the runner's output. When either step fails, the error is returned with a
// nil output.
func handleNodeSelfUpdateTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseNodeSelfUpdateInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	result, runErr := nodeSelfUpdateRunner(in)
	if runErr != nil {
		// Never surface a partial payload together with an error.
		result = nil
	}
	return result, runErr
}

// runNodeReboot schedules a reboot of the node. It writes a small shell
// "finalizer" script into the temp directory and starts it detached (own
// process group, stdio on /dev/null). The script sleeps five seconds, then
// tries `systemctl reboot`, `shutdown -r now` and `reboot` in turn, and
// finally deletes itself. The delay lets this function return — and the task
// result be reported — before the machine goes down.
//
// reason is only echoed in the result; a blank reason becomes "node_reboot".
// The returned map contains "reboot_scheduled": true and "reason".
//
// An error is returned when the script cannot be written, /dev/null cannot be
// opened, or the script cannot be started. In the latter two cases the script
// that was already written is removed again, so a failed attempt does not
// leave an executable reboot script behind.
func runNodeReboot(reason string) (map[string]any, error) {
	scriptPath := filepath.Join(os.TempDir(), fmt.Sprintf("gpuaas-node-reboot-%d.sh", os.Getpid()))
	if strings.TrimSpace(reason) == "" {
		reason = "node_reboot"
	}
	script := fmt.Sprintf(`#!/bin/sh
sleep 5
systemctl reboot >/dev/null 2>&1 || shutdown -r now >/dev/null 2>&1 || reboot >/dev/null 2>&1
rm -f %q
`, scriptPath)
	if err := writeSecureFile(scriptPath, script, 0o700); err != nil {
		return nil, fmt.Errorf("write reboot finalizer: %w", err)
	}

	// From here on the script exists on disk. Unless it is handed over to a
	// running process (which deletes it itself), clean it up on the way out.
	scheduled := false
	defer func() {
		if !scheduled {
			_ = os.Remove(scriptPath) // best effort; the original error is what matters
		}
	}()

	devNull, err := os.OpenFile(os.DevNull, os.O_RDWR, 0)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", os.DevNull, err)
	}
	defer func() { _ = devNull.Close() }()

	cmd := execabs.Command(scriptPath)
	cmd.Stdin = devNull
	cmd.Stdout = devNull
	cmd.Stderr = devNull
	// Run in a separate process group so the finalizer is not tied to the
	// agent's own group.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("schedule reboot finalizer: %w", err)
	}
	scheduled = true
	if cmd.Process != nil {
		// The child is never waited on; release its resources now.
		_ = cmd.Process.Release()
	}
	return map[string]any{
		"reboot_scheduled": true,
		"reason":           reason,
	}, nil
}

// parseNodeSelfUpdateInput extracts and validates the "node.self_update"
// parameters. Every string parameter is trimmed; a parameter that is absent or
// not a string is treated as empty. Validation runs in a fixed order and the
// first failure is returned.
//
// Required: package_url (http/https with a host, at most 2048 bytes, no NUL or
// line breaks), package_sha256 (64 hex characters, lowercased),
// expected_version (1-64 bytes, no whitespace), task_signing_pubkeys (at most
// 4096 bytes, must yield at least one key from taskSigningPublicKeys).
//
// Optional with defaults: install_root ("/opt/gpuaas/node-agent"),
// env_file_path ("/etc/gpuaas/node-agent.env"), systemd_unit_name
// ("gpuaas-node-agent.service"), install_container_runtime (true),
// container_runtime_package ("docker.io"; "none" is accepted as-is),
// reason ("node_agent_self_update").
//
// Optional without defaults: enrollment_token (64 hex characters when given),
// ca_bundle_path, registry_env_file. All paths must be absolute and are
// returned cleaned; "/" is rejected.
func parseNodeSelfUpdateInput(params map[string]any) (nodeSelfUpdateInput, error) {
	// reject builds the (zero value, error) pair for a fixed message.
	reject := func(msg string) (nodeSelfUpdateInput, error) {
		return nodeSelfUpdateInput{}, fmt.Errorf("%s", msg)
	}
	// text returns the trimmed string parameter, or "" when absent/non-string.
	text := func(key string) string {
		raw, _ := params[key].(string)
		return strings.TrimSpace(raw)
	}
	// pathParam resolves a path parameter, applying fallback when it is empty.
	pathParam := func(key, fallback string, required bool) (string, error) {
		candidate := text(key)
		if candidate == "" {
			candidate = fallback
		}
		switch {
		case candidate == "" && required:
			return "", fmt.Errorf("missing %s", key)
		case candidate == "":
			return "", nil
		case !filepath.IsAbs(candidate):
			return "", fmt.Errorf("%s must be absolute", key)
		}
		if candidate = filepath.Clean(candidate); candidate == "/" {
			return "", fmt.Errorf("%s must not be root", key)
		}
		return candidate, nil
	}

	var in nodeSelfUpdateInput

	// Package location.
	in.PackageURL = text("package_url")
	if in.PackageURL == "" {
		return reject("missing package_url")
	}
	target, urlErr := url.Parse(in.PackageURL)
	if urlErr != nil || target == nil || target.Host == "" {
		return reject("invalid package_url")
	}
	if target.Scheme != "https" && target.Scheme != "http" {
		return reject("package_url scheme must be http or https")
	}
	if len(in.PackageURL) > 2048 || strings.ContainsAny(in.PackageURL, "\x00\r\n") {
		return reject("invalid package_url")
	}

	// Package identity.
	in.PackageSHA256 = strings.ToLower(text("package_sha256"))
	if !sha256HexPattern.MatchString(in.PackageSHA256) {
		return reject("invalid package_sha256")
	}
	in.ExpectedVersion = text("expected_version")
	if in.ExpectedVersion == "" || len(in.ExpectedVersion) > 64 || strings.ContainsAny(in.ExpectedVersion, "\x00\r\n\t ") {
		return reject("invalid expected_version")
	}

	// Credentials written into the agent's env file.
	in.EnrollmentToken = text("enrollment_token")
	if in.EnrollmentToken != "" && !hexTokenPattern.MatchString(in.EnrollmentToken) {
		return reject("invalid enrollment_token")
	}
	in.TaskSigningPubKeys = text("task_signing_pubkeys")
	switch {
	case in.TaskSigningPubKeys == "",
		len(in.TaskSigningPubKeys) > 4096,
		strings.ContainsAny(in.TaskSigningPubKeys, "\x00\r\n"),
		len(taskSigningPublicKeys(in.TaskSigningPubKeys)) == 0:
		return reject("invalid task_signing_pubkeys")
	}

	// Filesystem locations.
	var pathErr error
	if in.InstallRoot, pathErr = pathParam("install_root", "/opt/gpuaas/node-agent", true); pathErr != nil {
		return nodeSelfUpdateInput{}, pathErr
	}
	if in.EnvFilePath, pathErr = pathParam("env_file_path", "/etc/gpuaas/node-agent.env", true); pathErr != nil {
		return nodeSelfUpdateInput{}, pathErr
	}
	if in.CABundlePath, pathErr = pathParam("ca_bundle_path", "", false); pathErr != nil {
		return nodeSelfUpdateInput{}, pathErr
	}
	if in.RegistryEnvFile, pathErr = pathParam("registry_env_file", "", false); pathErr != nil {
		return nodeSelfUpdateInput{}, pathErr
	}

	// Service unit.
	if in.SystemdUnitName = text("systemd_unit_name"); in.SystemdUnitName == "" {
		in.SystemdUnitName = "gpuaas-node-agent.service"
	}
	if !systemdUnitNamePattern.MatchString(in.SystemdUnitName) {
		return reject("invalid systemd_unit_name")
	}

	// Container runtime installation; only a real boolean overrides the default.
	in.InstallContainers = true
	if flag, isBool := params["install_container_runtime"].(bool); isBool {
		in.InstallContainers = flag
	}
	if in.ContainerPackage = text("container_runtime_package"); in.ContainerPackage == "" {
		in.ContainerPackage = "docker.io"
	}
	if in.ContainerPackage != "none" && !debianPackageNamePattern.MatchString(in.ContainerPackage) {
		return reject("invalid container_runtime_package")
	}

	// Free-form reason.
	if in.Reason = text("reason"); in.Reason == "" {
		in.Reason = "node_agent_self_update"
	}
	if len(in.Reason) > 128 || strings.ContainsAny(in.Reason, "\x00\r\n") {
		return reject("invalid reason")
	}

	return in, nil
}

// runNodeSelfUpdate schedules an in-place update of the node agent. It writes
// a shell "finalizer" script into the temp directory and starts it detached
// (own process group, stdio on /dev/null), then returns immediately with
// nodeSelfUpdateScheduledOutput(in).
//
// The script, after a three second delay:
//   - downloads in.PackageURL with curl (using in.CABundlePath when set) and
//     verifies it against in.PackageSHA256 with sha256sum;
//   - unpacks the bundle and checks that the installer and agent binary are
//     executable;
//   - copies the current env file, replaces or appends
//     GPUAAS_ENROLLMENT_TOKEN when a token was supplied, replaces or appends
//     GPUAAS_TASK_SIGNING_PUBKEYS, and drops the GPUAAS_TASK_SIGNING_PUBKEY
//     and GPUAAS_NODE_TASK_SIGNING_PUBKEY lines;
//   - runs the bundled install-node-agent.sh with the install settings passed
//     as environment variables.
//
// The script logs to /var/log/gpuaas-node-agent-self-update.log; when that
// file cannot be created it logs into its work directory instead, which the
// script's exit trap deletes together with the script itself.
//
// Every task-supplied value is embedded as a single-quoted shell word. Inside
// single quotes the shell performs no expansion, so characters such as `$`,
// backticks or double quotes in a value cannot alter the script.
//
// An error is returned when the script cannot be written, /dev/null cannot be
// opened, or the script cannot be started; in the latter two cases the
// already-written script is removed again.
func runNodeSelfUpdate(in nodeSelfUpdateInput) (map[string]any, error) {
	// shellQuote renders value as one single-quoted POSIX shell word. An
	// embedded single quote is written as '\'' (close, escaped quote, reopen).
	shellQuote := func(value string) string {
		return "'" + strings.ReplaceAll(value, "'", `'\''`) + "'"
	}

	scriptPath := filepath.Join(os.TempDir(), fmt.Sprintf("gpuaas-node-agent-self-update-%d.sh", os.Getpid()))
	curlCAArgs := ""
	registryCALine := ""
	if in.CABundlePath != "" {
		curlCAArgs = " --cacert " + shellQuote(in.CABundlePath)
		registryCALine = "  GPUAAS_NODE_AGENT_REGISTRY_CA_BUNDLE=" + shellQuote(in.CABundlePath) + " \\\n"
	}
	registryEnvLine := ""
	if in.RegistryEnvFile != "" {
		registryEnvLine = "  GPUAAS_NODE_AGENT_REGISTRY_ENV_FILE=" + shellQuote(in.RegistryEnvFile) + " \\\n"
	}
	script := fmt.Sprintf(`#!/bin/sh
set -eu
sleep 3
workdir="$(mktemp -d /tmp/gpuaas-node-agent-self-update.XXXXXX)"
log_path="/var/log/gpuaas-node-agent-self-update.log"
if ! touch "$log_path" 2>/dev/null; then
  log_path="$workdir/self-update.log"
fi
exec >>"$log_path" 2>&1
printf '[%%s] starting gpuaas node-agent self-update expected_version=%%s reason=%%s\n' "$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)" %s %s
cleanup() {
  rm -rf "$workdir"
  rm -f %s
}
trap cleanup EXIT
curl -fsSL%s %s -o "$workdir/node-agent-bootstrap.tar.gz"
printf '%%s  %%s\n' %s "$workdir/node-agent-bootstrap.tar.gz" | sha256sum -c -
mkdir -p "$workdir/extract"
tar -xzf "$workdir/node-agent-bootstrap.tar.gz" -C "$workdir/extract"
test -x "$workdir/extract/bundle/install-node-agent.sh"
test -x "$workdir/extract/bundle/bin/gpuaas-node-agent"
cp %s "$workdir/gpuaas-node-agent.env"
enrollment_token=%s
if [ -n "$enrollment_token" ]; then
  awk -v token="$enrollment_token" '
    BEGIN { replaced = 0 }
    /^GPUAAS_ENROLLMENT_TOKEN=/ {
      print "GPUAAS_ENROLLMENT_TOKEN=" token
      replaced = 1
      next
    }
    { print }
    END {
      if (replaced == 0) {
        print "GPUAAS_ENROLLMENT_TOKEN=" token
      }
    }
  ' "$workdir/gpuaas-node-agent.env" > "$workdir/gpuaas-node-agent.env.next"
  mv "$workdir/gpuaas-node-agent.env.next" "$workdir/gpuaas-node-agent.env"
fi
task_signing_pubkeys=%s
awk -v keys="$task_signing_pubkeys" '
  BEGIN { replaced = 0 }
  /^GPUAAS_TASK_SIGNING_PUBKEYS=/ {
    print "GPUAAS_TASK_SIGNING_PUBKEYS=" keys
    replaced = 1
    next
  }
  /^GPUAAS_TASK_SIGNING_PUBKEY=/ { next }
  /^GPUAAS_NODE_TASK_SIGNING_PUBKEY=/ { next }
  { print }
  END {
    if (replaced == 0) {
      print "GPUAAS_TASK_SIGNING_PUBKEYS=" keys
    }
  }
' "$workdir/gpuaas-node-agent.env" > "$workdir/gpuaas-node-agent.env.next"
mv "$workdir/gpuaas-node-agent.env.next" "$workdir/gpuaas-node-agent.env"
env \
  GPUAAS_NODE_AGENT_BINARY_SOURCE="$workdir/extract/bundle/bin/gpuaas-node-agent" \
  GPUAAS_NODE_AGENT_ENV_TEMPLATE="$workdir/gpuaas-node-agent.env" \
  GPUAAS_NODE_AGENT_UNIT_TEMPLATE="$workdir/extract/bundle/systemd/gpuaas-node-agent.service.tmpl" \
  GPUAAS_NODE_AGENT_INSTALL_ROOT=%s \
  GPUAAS_NODE_AGENT_ENV_FILE=%s \
  GPUAAS_NODE_AGENT_SYSTEMD_UNIT=%s \
  GPUAAS_NODE_AGENT_INSTALL_CONTAINER_RUNTIME=%s \
  GPUAAS_NODE_AGENT_CONTAINER_RUNTIME_PACKAGE=%s \
%s%s  "$workdir/extract/bundle/install-node-agent.sh"
echo "[$(date -u +%%Y-%%m-%%dT%%H:%%M:%%SZ)] gpuaas node-agent self-update finalizer completed"
`,
		shellQuote(in.ExpectedVersion),
		shellQuote(in.Reason),
		shellQuote(scriptPath),
		curlCAArgs,
		shellQuote(in.PackageURL),
		shellQuote(in.PackageSHA256),
		shellQuote(in.EnvFilePath),
		shellQuote(in.EnrollmentToken),
		shellQuote(in.TaskSigningPubKeys),
		shellQuote(in.InstallRoot),
		shellQuote(in.EnvFilePath),
		shellQuote(in.SystemdUnitName),
		shellQuote(strconv.FormatBool(in.InstallContainers)),
		shellQuote(in.ContainerPackage),
		registryEnvLine,
		registryCALine,
	)
	if err := writeSecureFile(scriptPath, script, 0o700); err != nil {
		return nil, fmt.Errorf("write self-update finalizer: %w", err)
	}

	// From here on the script exists on disk. Unless it is handed over to a
	// running process (whose exit trap deletes it), clean it up on the way out.
	scheduled := false
	defer func() {
		if !scheduled {
			_ = os.Remove(scriptPath) // best effort; the original error is what matters
		}
	}()

	devNull, err := os.OpenFile(os.DevNull, os.O_RDWR, 0)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", os.DevNull, err)
	}
	defer func() { _ = devNull.Close() }()
	cmd := execabs.Command(scriptPath)
	cmd.Stdin = devNull
	cmd.Stdout = devNull
	cmd.Stderr = devNull
	// Run in a separate process group so the finalizer is not tied to the
	// agent's own group.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("schedule self-update finalizer: %w", err)
	}
	scheduled = true
	if cmd.Process != nil {
		// The child is never waited on; release its resources now.
		_ = cmd.Process.Release()
	}
	return nodeSelfUpdateScheduledOutput(in), nil
}

// nodeSelfUpdateScheduledOutput builds the task output reported once a
// self-update has been scheduled. It echoes the effective settings from in,
// reports whether an enrollment token was supplied and how many keys
// taskSigningPublicKeys extracts from in.TaskSigningPubKeys, and always names
// /var/log/gpuaas-node-agent-self-update.log as the log path.
func nodeSelfUpdateScheduledOutput(in nodeSelfUpdateInput) map[string]any {
	verifierCount := len(taskSigningPublicKeys(in.TaskSigningPubKeys))
	tokenRefreshed := in.EnrollmentToken != ""

	summary := make(map[string]any, 12)
	summary["update_scheduled"] = true
	summary["expected_version"] = in.ExpectedVersion
	summary["enrollment_token_refreshed"] = tokenRefreshed
	summary["task_signing_verifier_count"] = verifierCount
	summary["task_signing_verifier_delivery"] = "node.self_update"
	summary["install_root"] = in.InstallRoot
	summary["env_file_path"] = in.EnvFilePath
	summary["systemd_unit_name"] = in.SystemdUnitName
	summary["install_container_runtime"] = in.InstallContainers
	summary["container_runtime_package"] = in.ContainerPackage
	summary["log_path"] = "/var/log/gpuaas-node-agent-self-update.log"
	summary["reason"] = in.Reason
	return summary
}

// parseNodeUninstallInput extracts and validates the "node.uninstall"
// parameters. The six path parameters (install_root, env_file_path, cert_path,
// key_path, ca_bundle_path, node_cert_ca_bundle_path) are all required, must
// start with "/", and are returned cleaned; a path that cleans to "/" is
// rejected. systemd_unit_name must match systemdUnitNamePattern. reason is
// optional and defaults to "node_removed". Parameters are checked in the order
// listed and the first failure is returned.
func parseNodeUninstallInput(params map[string]any) (nodeUninstallInput, error) {
	var in nodeUninstallInput

	// Each path parameter and the field it populates, in validation order.
	pathFields := []struct {
		key  string
		dest *string
	}{
		{"install_root", &in.InstallRoot},
		{"env_file_path", &in.EnvFilePath},
		{"cert_path", &in.CertPath},
		{"key_path", &in.KeyPath},
		{"ca_bundle_path", &in.CABundlePath},
		{"node_cert_ca_bundle_path", &in.NodeCertCABundlePath},
	}
	for _, field := range pathFields {
		raw, _ := params[field.key].(string)
		candidate := strings.TrimSpace(raw)
		switch {
		case candidate == "":
			return nodeUninstallInput{}, fmt.Errorf("missing %s", field.key)
		case !strings.HasPrefix(candidate, "/"):
			return nodeUninstallInput{}, fmt.Errorf("%s must be absolute", field.key)
		}
		candidate = filepath.Clean(candidate)
		if candidate == "/" {
			return nodeUninstallInput{}, fmt.Errorf("%s must not be root", field.key)
		}
		*field.dest = candidate
	}

	rawUnit, _ := params["systemd_unit_name"].(string)
	in.SystemdUnitName = strings.TrimSpace(rawUnit)
	if !systemdUnitNamePattern.MatchString(in.SystemdUnitName) {
		return nodeUninstallInput{}, fmt.Errorf("invalid systemd_unit_name")
	}

	rawReason, _ := params["reason"].(string)
	if in.Reason = strings.TrimSpace(rawReason); in.Reason == "" {
		in.Reason = "node_removed"
	}
	return in, nil
}

// runNodeUninstall schedules removal of the node agent. It writes a shell
// "finalizer" script into the temp directory and starts it detached (own
// process group, stdio on /dev/null), then returns immediately.
//
// The script, after a five second delay: stops the systemd unit, sends a
// signal to this process with `kill`, disables the unit, deletes the unit
// file under runtimeSystemdUnitDir, reloads systemd, removes in.InstallRoot
// recursively, deletes the env file, certificate, key and both CA bundle
// files, and finally deletes itself. The systemctl and kill steps are best
// effort (`|| true`).
//
// Every path and the unit name are embedded as single-quoted shell words.
// Inside single quotes the shell performs no expansion, so characters such as
// `$` or backticks in a value cannot change what the `rm` commands act on.
//
// The returned map contains "uninstall_scheduled": true and echoes the inputs.
// An error is returned when the script cannot be written, /dev/null cannot be
// opened, or the script cannot be started; in the latter two cases the
// already-written script is removed again.
func runNodeUninstall(in nodeUninstallInput) (map[string]any, error) {
	// shellQuote renders value as one single-quoted POSIX shell word. An
	// embedded single quote is written as '\'' (close, escaped quote, reopen).
	shellQuote := func(value string) string {
		return "'" + strings.ReplaceAll(value, "'", `'\''`) + "'"
	}

	unitPath := filepath.Join(runtimeSystemdUnitDir, in.SystemdUnitName)
	scriptPath := filepath.Join(os.TempDir(), fmt.Sprintf("gpuaas-node-uninstall-%d.sh", os.Getpid()))
	script := fmt.Sprintf(`#!/bin/sh
sleep 5
systemctl stop %s >/dev/null 2>&1 || true
kill %d >/dev/null 2>&1 || true
systemctl disable %s >/dev/null 2>&1 || true
rm -f %s
systemctl daemon-reload >/dev/null 2>&1 || true
rm -rf %s
rm -f %s %s %s %s %s
rm -f %s
`,
		shellQuote(in.SystemdUnitName),
		os.Getpid(),
		shellQuote(in.SystemdUnitName),
		shellQuote(unitPath),
		shellQuote(in.InstallRoot),
		shellQuote(in.EnvFilePath),
		shellQuote(in.CertPath),
		shellQuote(in.KeyPath),
		shellQuote(in.CABundlePath),
		shellQuote(in.NodeCertCABundlePath),
		shellQuote(scriptPath),
	)
	if err := writeSecureFile(scriptPath, script, 0o700); err != nil {
		return nil, fmt.Errorf("write uninstall finalizer: %w", err)
	}

	// From here on the script exists on disk. Unless it is handed over to a
	// running process (which deletes it itself), clean it up on the way out.
	scheduled := false
	defer func() {
		if !scheduled {
			_ = os.Remove(scriptPath) // best effort; the original error is what matters
		}
	}()

	devNull, err := os.OpenFile(os.DevNull, os.O_RDWR, 0)
	if err != nil {
		return nil, fmt.Errorf("open %s: %w", os.DevNull, err)
	}
	defer func() { _ = devNull.Close() }()

	cmd := execabs.Command(scriptPath)
	cmd.Stdin = devNull
	cmd.Stdout = devNull
	cmd.Stderr = devNull
	// Run in a separate process group so the finalizer is not tied to the
	// agent's own group.
	cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("schedule uninstall finalizer: %w", err)
	}
	scheduled = true
	if cmd.Process != nil {
		// The child is never waited on; release its resources now.
		_ = cmd.Process.Release()
	}
	return map[string]any{
		"uninstall_scheduled":      true,
		"reason":                   in.Reason,
		"install_root":             in.InstallRoot,
		"env_file_path":            in.EnvFilePath,
		"systemd_unit_name":        in.SystemdUnitName,
		"cert_path":                in.CertPath,
		"key_path":                 in.KeyPath,
		"ca_bundle_path":           in.CABundlePath,
		"node_cert_ca_bundle_path": in.NodeCertCABundlePath,
	}, nil
}

// handleTerminalOpenTask validates the task parameters with
// parseTerminalOpenInput and opens the session through terminalOpenRunner.
// On success it returns "accepted": true together with the parsed session
// attributes; a parse or open failure is returned as an error.
func handleTerminalOpenTask(_ context.Context, task nodeTask) (map[string]any, error) {
	in, parseErr := parseTerminalOpenInput(task.Params)
	if parseErr != nil {
		return nil, parseErr
	}
	if openErr := terminalOpenRunner(in); openErr != nil {
		return nil, openErr
	}

	accepted := make(map[string]any, 11)
	accepted["accepted"] = true
	accepted["session_id"] = in.SessionID
	accepted["allocation_id"] = in.AllocationID
	accepted["user_id"] = in.UserID
	accepted["username"] = in.Username
	accepted["capacity_shape"] = in.CapacityShape
	accepted["target_host"] = in.TargetHost
	accepted["target_port"] = in.TargetPort
	accepted["session_ttl_seconds"] = in.SessionTTLSeconds
	accepted["cols"] = in.Cols
	accepted["rows"] = in.Rows
	return accepted, nil
}

// handleTerminalCloseTask closes the terminal session that belongs to the
// allocation named in the task parameters.
//
// "allocation_id" is required (after trimming whitespace). "reason" is
// optional and falls back to "normal_close" when absent or blank. The result
// reports whatever closeTerminalSessionByAllocation returned under "closed".
func handleTerminalCloseTask(_ context.Context, task nodeTask) (map[string]any, error) {
	rawAllocation, _ := task.Params["allocation_id"].(string)
	allocationID := strings.TrimSpace(rawAllocation)
	if allocationID == "" {
		return nil, fmt.Errorf("missing allocation_id")
	}

	// A non-blank reason is forwarded exactly as received (not trimmed).
	reason, _ := task.Params["reason"].(string)
	if strings.TrimSpace(reason) == "" {
		reason = "normal_close"
	}

	result := map[string]any{
		"allocation_id": allocationID,
		"reason":        reason,
	}
	result["closed"] = closeTerminalSessionByAllocation(allocationID, reason)
	return result, nil
}

// terminalOpenInput is the validated parameter set of a terminal-open task,
// as produced by parseTerminalOpenInput.
type terminalOpenInput struct {
	// SessionID is the session identifier; the parser requires a UUID.
	SessionID         string
	// UserID is the requesting user's identifier; the parser requires a UUID.
	UserID            string
	// AllocationID is the allocation identifier; the parser requires a UUID.
	AllocationID      string
	// Username is the account name; the parser requires it to match
	// usernameOnNodePattern.
	Username          string
	// CapacityShape is "baremetal" (the parser's default) or "gpu_slice".
	CapacityShape     string
	// TargetHost is mandatory only when CapacityShape is "gpu_slice".
	TargetHost        string
	// TargetPort is range-checked (1-65535) only for "gpu_slice"; it is 0
	// when the parameter was absent.
	TargetPort        int
	// SessionTTLSeconds is accepted by the parser in the range 300-86400.
	SessionTTLSeconds int
	// Cols is the terminal width; the parser accepts 1-512.
	Cols              int
	// Rows is the terminal height; the parser accepts 1-512.
	Rows              int
}

// terminalOpenRunner is the function handleTerminalOpenTask calls with the
// validated input; it defaults to runTerminalOpenSession.
// NOTE(review): presumably a package variable so tests can substitute a stub — confirm.
var terminalOpenRunner = runTerminalOpenSession

// parseTerminalOpenInput validates the raw parameters of a terminal-open task
// and converts them into a terminalOpenInput.
//
// Checks are performed in a fixed order and the first failure is returned:
// session_id, user_id and allocation_id must be UUIDs; username must match
// usernameOnNodePattern; capacity_shape defaults to "baremetal" and may
// otherwise only be "gpu_slice"; target_host and a target_port in 1-65535 are
// mandatory for "gpu_slice"; session_ttl_seconds must be in 300-86400; cols
// and rows must each be in 1-512.
func parseTerminalOpenInput(params map[string]any) (terminalOpenInput, error) {
	var none terminalOpenInput

	// trimmedString reads an optional string parameter; non-strings read as "".
	trimmedString := func(key string) string {
		s, _ := params[key].(string)
		return strings.TrimSpace(s)
	}
	// requiredUUID reads a mandatory parameter that must parse as a UUID.
	requiredUUID := func(key string) (string, error) {
		id := trimmedString(key)
		if id == "" {
			return "", fmt.Errorf("missing %s", key)
		}
		if _, err := uuid.Parse(id); err != nil {
			return "", fmt.Errorf("invalid %s", key)
		}
		return id, nil
	}
	// boundedInt reads a mandatory integer parameter limited to [lo, hi].
	boundedInt := func(key string, lo, hi int) (int, error) {
		n, err := intParam(params, key)
		if err != nil {
			return 0, err
		}
		if n < lo || n > hi {
			return 0, fmt.Errorf("%s out of bounds", key)
		}
		return n, nil
	}

	sessionID, err := requiredUUID("session_id")
	if err != nil {
		return none, err
	}
	userID, err := requiredUUID("user_id")
	if err != nil {
		return none, err
	}
	allocationID, err := requiredUUID("allocation_id")
	if err != nil {
		return none, err
	}

	username := trimmedString("username")
	switch {
	case username == "":
		return none, fmt.Errorf("missing username")
	case !usernameOnNodePattern.MatchString(username):
		return none, fmt.Errorf("invalid username")
	}

	shape := trimmedString("capacity_shape")
	switch shape {
	case "":
		shape = "baremetal"
	case "baremetal", "gpu_slice":
		// accepted as-is
	default:
		return none, fmt.Errorf("invalid capacity_shape")
	}

	host := trimmedString("target_host")

	// target_port is optional: an absent or null value leaves the port at 0.
	var port int
	if rawPort, present := params["target_port"]; present && rawPort != nil {
		port, err = intParam(params, "target_port")
		if err != nil {
			return none, err
		}
	}

	// Only GPU-slice sessions need a usable host/port pair.
	if shape == "gpu_slice" {
		if host == "" {
			return none, fmt.Errorf("missing target_host")
		}
		if port < 1 || port > 65535 {
			return none, fmt.Errorf("target_port out of bounds")
		}
	}

	ttl, err := boundedInt("session_ttl_seconds", 300, 86400)
	if err != nil {
		return none, err
	}
	cols, err := boundedInt("cols", 1, 512)
	if err != nil {
		return none, err
	}
	rows, err := boundedInt("rows", 1, 512)
	if err != nil {
		return none, err
	}

	parsed := terminalOpenInput{
		SessionID:         sessionID,
		UserID:            userID,
		AllocationID:      allocationID,
		Username:          username,
		CapacityShape:     shape,
		TargetHost:        host,
		TargetPort:        port,
		SessionTTLSeconds: ttl,
		Cols:              cols,
		Rows:              rows,
	}
	return parsed, nil
}

// parseProvisionUserInput validates the parameters of a provision-user task.
//
// username_on_node must match usernameOnNodePattern; uid, gid and every entry
// of supplemental_gids must lie in 10000-199999; ssh_public_keys must hold
// 1-20 entries and still hold at least one after trimming, dropping blanks
// and removing duplicates. The returned supplemental GIDs are sorted
// ascending and the keys keep their first-seen order.
func parseProvisionUserInput(params map[string]any) (provisionUserInput, error) {
	var zero provisionUserInput

	const lowestID, highestID = 10000, 199999
	outOfRange := func(id int) bool { return id < lowestID || id > highestID }

	rawName, _ := params["username_on_node"].(string)
	name := strings.TrimSpace(rawName)
	switch {
	case name == "":
		return zero, fmt.Errorf("missing username_on_node")
	case !usernameOnNodePattern.MatchString(name):
		return zero, fmt.Errorf("invalid username_on_node")
	}

	// Both IDs are parsed before either is range-checked, so a malformed gid
	// is reported ahead of an out-of-range uid.
	uid, err := intParam(params, "uid")
	if err != nil {
		return zero, err
	}
	gid, err := intParam(params, "gid")
	if err != nil {
		return zero, err
	}
	switch {
	case outOfRange(uid):
		return zero, fmt.Errorf("uid out of bounds")
	case outOfRange(gid):
		return zero, fmt.Errorf("gid out of bounds")
	}

	extraGIDs, err := intSliceParam(params, "supplemental_gids")
	if err != nil {
		return zero, err
	}
	for i := range extraGIDs {
		if outOfRange(extraGIDs[i]) {
			return zero, fmt.Errorf("supplemental_gids out of bounds")
		}
	}

	rawKeys, err := stringSliceParam(params, "ssh_public_keys")
	if err != nil {
		return zero, err
	}
	if n := len(rawKeys); n < 1 || n > 20 {
		return zero, fmt.Errorf("ssh_public_keys must contain between 1 and 20 keys")
	}

	// Normalise the keys: trim, skip blanks, keep the first occurrence only.
	uniqueKeys := make([]string, 0, len(rawKeys))
	known := map[string]struct{}{}
	for _, candidate := range rawKeys {
		key := strings.TrimSpace(candidate)
		if key == "" {
			continue
		}
		if _, duplicate := known[key]; !duplicate {
			known[key] = struct{}{}
			uniqueKeys = append(uniqueKeys, key)
		}
	}
	if n := len(uniqueKeys); n < 1 || n > 20 {
		return zero, fmt.Errorf("ssh_public_keys must contain between 1 and 20 keys")
	}

	sort.Ints(extraGIDs)
	parsed := provisionUserInput{
		Username:      name,
		UID:           uid,
		GID:           gid,
		Supplemental:  extraGIDs,
		SSHPublicKeys: uniqueKeys,
	}
	return parsed, nil
}

// parseInstallAuthorizedKeysInput validates the parameters of an
// install-authorized-keys task.
//
// username_on_node must match usernameOnNodePattern. ssh_public_keys must
// hold 1-20 entries and still hold at least one after trimming, dropping
// blanks and removing duplicates; the surviving keys keep first-seen order.
func parseInstallAuthorizedKeysInput(params map[string]any) (installAuthorizedKeysInput, error) {
	var zero installAuthorizedKeysInput

	rawName, _ := params["username_on_node"].(string)
	name := strings.TrimSpace(rawName)
	switch {
	case name == "":
		return zero, fmt.Errorf("missing username_on_node")
	case !usernameOnNodePattern.MatchString(name):
		return zero, fmt.Errorf("invalid username_on_node")
	}

	rawKeys, err := stringSliceParam(params, "ssh_public_keys")
	if err != nil {
		return zero, err
	}
	if n := len(rawKeys); n < 1 || n > 20 {
		return zero, fmt.Errorf("ssh_public_keys must contain between 1 and 20 keys")
	}

	// Normalise the keys: trim, skip blanks, keep the first occurrence only.
	uniqueKeys := make([]string, 0, len(rawKeys))
	known := map[string]struct{}{}
	for _, candidate := range rawKeys {
		key := strings.TrimSpace(candidate)
		if key == "" {
			continue
		}
		if _, duplicate := known[key]; !duplicate {
			known[key] = struct{}{}
			uniqueKeys = append(uniqueKeys, key)
		}
	}
	if n := len(uniqueKeys); n < 1 || n > 20 {
		return zero, fmt.Errorf("ssh_public_keys must contain between 1 and 20 keys")
	}

	parsed := installAuthorizedKeysInput{
		Username:      name,
		SSHPublicKeys: uniqueKeys,
	}
	return parsed, nil
}

// parseReconcileManagedAuthorizedKeyInput validates the parameters of a
// reconcile-managed-authorized-key task.
//
// username_on_node must match usernameOnNodePattern, managed_key_id must be
// non-empty, and desired_state (case-insensitive) must be "present" or
// "absent". public_key is required, and only read, when the desired state is
// "present"; PublicKey is nil otherwise.
//
// managed_key_id and public_key are each written as a single line of an
// authorized_keys file, so values that contain CR, LF or NUL are rejected:
// an embedded line break would smuggle extra lines into the file or break the
// BEGIN/END marker block that is keyed on managed_key_id.
func parseReconcileManagedAuthorizedKeyInput(params map[string]any) (reconcileManagedAuthorizedKeyInput, error) {
	// Characters that can never appear inside a single authorized_keys line.
	const lineBreakers = "\r\n\x00"

	username, _ := params["username_on_node"].(string)
	username = strings.TrimSpace(username)
	if username == "" {
		return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("missing username_on_node")
	}
	if !usernameOnNodePattern.MatchString(username) {
		return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("invalid username_on_node")
	}
	managedKeyID, _ := params["managed_key_id"].(string)
	managedKeyID = strings.TrimSpace(managedKeyID)
	if managedKeyID == "" {
		return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("missing managed_key_id")
	}
	if strings.ContainsAny(managedKeyID, lineBreakers) {
		return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("invalid managed_key_id")
	}
	desiredState, _ := params["desired_state"].(string)
	desiredState = strings.TrimSpace(strings.ToLower(desiredState))
	switch desiredState {
	case "present", "absent":
	default:
		return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("invalid desired_state")
	}
	var publicKey *string
	if desiredState == "present" {
		value, _ := params["public_key"].(string)
		value = strings.TrimSpace(value)
		if value == "" {
			return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("missing public_key")
		}
		if strings.ContainsAny(value, lineBreakers) {
			return reconcileManagedAuthorizedKeyInput{}, fmt.Errorf("invalid public_key")
		}
		publicKey = &value
	}
	return reconcileManagedAuthorizedKeyInput{
		Username:     username,
		ManagedKeyID: managedKeyID,
		DesiredState: desiredState,
		PublicKey:    publicKey,
	}, nil
}

// intParam reads params[name] as an int.
//
// Accepted representations are float64 (what encoding/json produces for
// numbers), int, int32, int64, and a decimal string (surrounding whitespace
// is ignored). It returns "missing <name>" when the key is absent and
// "invalid <name>" when the value has an unsupported type, is not a whole
// number, or does not fit in an int.
func intParam(params map[string]any, name string) (int, error) {
	raw, ok := params[name]
	if !ok {
		return 0, fmt.Errorf("missing %s", name)
	}
	switch v := raw.(type) {
	case float64:
		// math.Mod yields NaN for NaN and ±Inf, so those are rejected here too.
		if math.Mod(v, 1) != 0 {
			return 0, fmt.Errorf("invalid %s", name)
		}
		// Converting an out-of-range float to int is implementation-defined
		// and would silently produce a garbage value. -MinInt (2^63 or 2^31)
		// is exactly representable as a float64, unlike MaxInt on 64-bit.
		if v < float64(math.MinInt) || v >= -float64(math.MinInt) {
			return 0, fmt.Errorf("invalid %s", name)
		}
		return int(v), nil
	case int:
		return v, nil
	case int32:
		return int(v), nil
	case int64:
		// Guard against truncation on platforms where int is 32 bits wide.
		if int64(int(v)) != v {
			return 0, fmt.Errorf("invalid %s", name)
		}
		return int(v), nil
	case string:
		parsed, err := strconv.Atoi(strings.TrimSpace(v))
		if err != nil {
			return 0, fmt.Errorf("invalid %s", name)
		}
		return parsed, nil
	default:
		return 0, fmt.Errorf("invalid %s", name)
	}
}

// intSliceParam reads params[name] as a slice of ints.
//
// An absent or nil value yields (nil, nil). A []int is returned as a fresh
// copy. A []any is converted element by element with intParam, and any
// element that fails conversion makes the whole parameter "invalid <name>".
// Every other type is rejected with the same error.
func intSliceParam(params map[string]any, name string) ([]int, error) {
	raw, present := params[name]
	if !present || raw == nil {
		return nil, nil
	}

	if ints, isInts := raw.([]int); isInts {
		// Copy so the caller cannot alias the parameter map's backing array.
		dup := make([]int, len(ints))
		copy(dup, ints)
		return dup, nil
	}

	items, isList := raw.([]any)
	if !isList {
		return nil, fmt.Errorf("invalid %s", name)
	}
	converted := make([]int, 0, len(items))
	for i := range items {
		// Reuse intParam's conversion rules by wrapping the element in a
		// throwaway single-entry map.
		n, err := intParam(map[string]any{"v": items[i]}, "v")
		if err != nil {
			return nil, fmt.Errorf("invalid %s", name)
		}
		converted = append(converted, n)
	}
	return converted, nil
}

// stringSliceParam reads params[name] as a slice of strings.
//
// An absent or nil value yields (nil, nil). A []string is returned as a fresh
// copy. A []any is accepted only when every element is a string. Anything
// else produces "invalid <name>".
func stringSliceParam(params map[string]any, name string) ([]string, error) {
	raw, present := params[name]
	if !present || raw == nil {
		return nil, nil
	}

	if strs, isStrings := raw.([]string); isStrings {
		// Copy so the caller cannot alias the parameter map's backing array.
		dup := make([]string, len(strs))
		copy(dup, strs)
		return dup, nil
	}

	items, isList := raw.([]any)
	if !isList {
		return nil, fmt.Errorf("invalid %s", name)
	}
	collected := make([]string, 0, len(items))
	for i := range items {
		str, isString := items[i].(string)
		if !isString {
			return nil, fmt.Errorf("invalid %s", name)
		}
		collected = append(collected, str)
	}
	return collected, nil
}

// runProvisionUserOps creates or reconciles the allocation's Unix account by
// generating a bash script and running it through nodeUserOpsCommandRunner.
//
// The script, in order:
//   - re-validates the username;
//   - ensures a group with the primary GID exists (created under the
//     username when missing);
//   - creates the user, or adjusts the UID/GID of an existing one, refusing to
//     reuse a UID that belongs to a different account;
//   - ensures each supplemental GID (other than the primary GID) has a group,
//     creating "gpuaas-sup-<gid>" when missing, and sets the user's
//     supplementary groups to that list;
//   - writes /home/<user>/.ssh/authorized_keys with one key per line;
//   - installs a passwordless-sudo drop-in at allocationUserSudoersPath,
//     checked with visudo when available.
//
// On failure the error carries the script's combined output, or the exec
// error text when there was no output.
func runProvisionUserOps(in provisionUserInput) error {
	// shellQuote renders s as a single-quoted bash word. Go's %q must not be
	// used for this: inside bash double quotes "$(...)" and backticks are
	// still expanded (command injection through key comments) and "\n" is
	// kept as two literal characters, which collapses several keys onto one
	// authorized_keys line.
	shellQuote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}

	var script strings.Builder

	// Caller-supplied text is substituted exactly once, here. The fixed parts
	// of the script are appended verbatim below and are never passed through
	// a format function again, so a '%' inside a key comment cannot be
	// reinterpreted as a format verb.
	fmt.Fprintf(&script, `
set -euo pipefail
username=%s
uid=%d
gid=%d
authorized_keys=%s
sudoers_path=%s
`,
		shellQuote(in.Username),
		in.UID,
		in.GID,
		shellQuote(strings.Join(in.SSHPublicKeys, "\n")),
		shellQuote(allocationUserSudoersPath(in.Username)),
	)

	script.WriteString(`
if ! [[ "$username" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]]; then
  echo "invalid username_on_node" >&2
  exit 1
fi

primary_group="$(getent group "$gid" | cut -d: -f1 || true)"
if [ -z "$primary_group" ]; then
  primary_group="$username"
  groupadd -f -g "$gid" "$primary_group"
fi

if id "$username" >/dev/null 2>&1; then
  current_uid="$(id -u "$username")"
  current_gid="$(id -g "$username")"
  if [ "$current_uid" != "$uid" ]; then
    usermod -u "$uid" "$username"
  fi
  if [ "$current_gid" != "$gid" ]; then
    usermod -g "$gid" "$username"
  fi
else
  existing_by_uid="$(getent passwd "$uid" | cut -d: -f1 || true)"
  if [ -n "$existing_by_uid" ] && [ "$existing_by_uid" != "$username" ]; then
    echo "uid already belongs to another user" >&2
    exit 1
  fi
  useradd -m -u "$uid" -g "$gid" -s /bin/bash "$username"
fi
`)

	// The primary GID is already the login group, so it is left out of the
	// supplementary list.
	suppGroupNames := make([]string, 0, len(in.Supplemental))
	for _, sgid := range in.Supplemental {
		if sgid == in.GID {
			continue
		}
		groupName := fmt.Sprintf("gpuaas-sup-%d", sgid)
		suppGroupNames = append(suppGroupNames, groupName)
		fmt.Fprintf(&script, `
if ! getent group %d >/dev/null 2>&1; then
  groupadd -f -g %d %s
fi
`, sgid, sgid, groupName)
	}
	if len(suppGroupNames) > 0 {
		fmt.Fprintf(&script, `
usermod -G %s "$username"
`, shellQuote(strings.Join(suppGroupNames, ",")))
	}

	// The sudoers candidate is staged in a mktemp file rather than a fixed
	// name under /tmp: a predictable path written by root in a world-writable
	// directory can be pre-created or symlinked by a local user. The trap
	// removes the staging file even when a later step fails.
	script.WriteString(`
install -d -m 700 -o "$username" -g "$primary_group" "/home/$username/.ssh"
printf '%s\n' "$authorized_keys" > "/home/$username/.ssh/authorized_keys"
chown "$username:$primary_group" "/home/$username/.ssh/authorized_keys"
chmod 600 "/home/$username/.ssh/authorized_keys"
sudoers_tmp="$(mktemp)"
trap 'rm -f "$sudoers_tmp"' EXIT
cat <<EOF >"$sudoers_tmp"
$username ALL=(ALL) NOPASSWD:ALL
EOF
chmod 440 "$sudoers_tmp"
if command -v visudo >/dev/null 2>&1; then
  visudo -cf "$sudoers_tmp" >/dev/null
fi
install -m 440 "$sudoers_tmp" "$sudoers_path"
rm -f "$sudoers_tmp"
id "$username" >/dev/null 2>&1
getent passwd "$username" >/dev/null 2>&1
`)

	cmd := nodeUserOpsCommandRunner(script.String())
	out, err := cmd.CombinedOutput()
	if err != nil {
		msg := strings.TrimSpace(string(out))
		if msg == "" {
			msg = err.Error()
		}
		return fmt.Errorf("apply provision user operations: %s", msg)
	}
	return nil
}

// runInstallAuthorizedKeysOps replaces the authorized_keys file of an existing
// allocation user with the given keys, one per line, by running a generated
// bash script through nodeUserOpsCommandRunner.
//
// The script fails when the user or its home directory (looked up with
// getent) does not exist. It creates ~/.ssh with mode 700 and leaves
// authorized_keys owned by the user with mode 600.
//
// On success the result reports the username and the number of keys written.
// On failure the error carries the script's combined output, or the exec
// error text when there was no output.
func runInstallAuthorizedKeysOps(in installAuthorizedKeysInput) (map[string]any, error) {
	// shellQuote renders s as a single-quoted bash word. Go's %q must not be
	// used for this: inside bash double quotes "$(...)" and backticks are
	// still expanded (command injection through key comments) and "\n" is
	// kept as two literal characters, which collapses several keys onto one
	// authorized_keys line.
	shellQuote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}

	authorizedKeys := strings.Join(in.SSHPublicKeys, "\n")
	script := fmt.Sprintf(`
set -euo pipefail
username=%s
authorized_keys=%s

if ! [[ "$username" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]]; then
  echo "invalid username_on_node" >&2
  exit 1
fi
if ! id "$username" >/dev/null 2>&1; then
  echo "allocation runtime user missing" >&2
  exit 1
fi

primary_group="$(id -gn "$username")"
home_dir="$(getent passwd "$username" | cut -d: -f6 || true)"
if [ -z "$home_dir" ] || [ ! -d "$home_dir" ]; then
  echo "allocation runtime user home missing" >&2
  exit 1
fi

install -d -m 700 -o "$username" -g "$primary_group" "$home_dir/.ssh"
printf '%%s\n' "$authorized_keys" > "$home_dir/.ssh/authorized_keys"
chown "$username:$primary_group" "$home_dir/.ssh/authorized_keys"
chmod 600 "$home_dir/.ssh/authorized_keys"
`, shellQuote(in.Username), shellQuote(authorizedKeys))

	cmd := nodeUserOpsCommandRunner(script)
	out, err := cmd.CombinedOutput()
	if err != nil {
		msg := strings.TrimSpace(string(out))
		if msg == "" {
			msg = err.Error()
		}
		return nil, fmt.Errorf("apply authorized key install operations: %s", msg)
	}
	return map[string]any{
		"applied":       true,
		"verified":      true,
		"username":      in.Username,
		"ssh_key_count": len(in.SSHPublicKeys),
	}, nil
}

// runReconcileManagedAuthorizedKeyOps adds or removes one managed SSH key in
// an existing allocation user's authorized_keys file by running a generated
// bash script through nodeUserOpsCommandRunner.
//
// A managed key is stored as a three-line block: a
// "# BEGIN GPUAAS MANAGED SSH KEY: <id>" marker, the key, and the matching
// "# END ..." marker. The script always strips the block for this
// managed_key_id first. For desired_state "present" it additionally drops any
// other managed block that holds the same key text and appends a fresh block;
// for "absent" it writes the stripped file back. Lines outside managed blocks
// are preserved.
//
// managed_key_id and the public key are rejected when they contain CR, LF or
// NUL, because each must occupy exactly one line of the file.
//
// On failure the error carries the script's combined output, or the exec
// error text when there was no output.
func runReconcileManagedAuthorizedKeyOps(in reconcileManagedAuthorizedKeyInput) (map[string]any, error) {
	// shellQuote renders s as a single-quoted bash word. Go's %q must not be
	// used for this: inside bash double quotes "$(...)" and backticks are
	// still expanded, so an unvalidated managed_key_id or a key comment could
	// run arbitrary commands with the script's privileges.
	shellQuote := func(s string) string {
		return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'"
	}

	publicKey := ""
	if in.PublicKey != nil {
		publicKey = strings.TrimSpace(*in.PublicKey)
	}

	// With real shell quoting an embedded line break would reach the file
	// verbatim and could inject extra authorized_keys lines or split the
	// marker block, so refuse it before building the script.
	const lineBreakers = "\r\n\x00"
	if strings.ContainsAny(in.ManagedKeyID, lineBreakers) {
		return nil, fmt.Errorf("invalid managed_key_id")
	}
	if strings.ContainsAny(publicKey, lineBreakers) {
		return nil, fmt.Errorf("invalid public_key")
	}

	script := fmt.Sprintf(`
set -euo pipefail
username=%s
managed_key_id=%s
desired_state=%s
public_key=%s

if ! [[ "$username" =~ ^[a-z_][a-z0-9_-]{0,31}$ ]]; then
  echo "invalid username_on_node" >&2
  exit 1
fi
if [ -z "$managed_key_id" ]; then
  echo "invalid managed_key_id" >&2
  exit 1
fi
if ! id "$username" >/dev/null 2>&1; then
  echo "allocation runtime user missing" >&2
  exit 1
fi

primary_group="$(id -gn "$username")"
home_dir="$(getent passwd "$username" | cut -d: -f6 || true)"
if [ -z "$home_dir" ] || [ ! -d "$home_dir" ]; then
  echo "allocation runtime user home missing" >&2
  exit 1
fi

managed_start="# BEGIN GPUAAS MANAGED SSH KEY: ${managed_key_id}"
managed_end="# END GPUAAS MANAGED SSH KEY: ${managed_key_id}"
authorized_keys_path="$home_dir/.ssh/authorized_keys"
tmp_authorized_keys="$(mktemp)"
# Remove the scratch files even when a later step fails.
trap 'rm -f "$tmp_authorized_keys" "${tmp_authorized_keys}.dedup"' EXIT

install -d -m 700 -o "$username" -g "$primary_group" "$home_dir/.ssh"
touch "$authorized_keys_path"
chown "$username:$primary_group" "$authorized_keys_path"
chmod 600 "$authorized_keys_path"

awk -v start="$managed_start" -v end="$managed_end" '
  $0 == start { skip=1; next }
  $0 == end { skip=0; next }
  skip != 1 { print }
' "$authorized_keys_path" > "$tmp_authorized_keys"

case "$desired_state" in
  present)
    if [ -z "$public_key" ]; then
      echo "public_key required when desired_state=present" >&2
      exit 1
    fi
    awk -v key="$public_key" '
      BEGIN {
        in_block = 0
        block_count = 0
        block_matches_key = 0
      }
      /^# BEGIN GPUAAS MANAGED SSH KEY: / {
        in_block = 1
        block_count = 1
        block[block_count] = $0
        block_matches_key = 0
        next
      }
      in_block == 1 {
        block_count++
        block[block_count] = $0
        if ($0 == key) {
          block_matches_key = 1
        }
        if ($0 ~ /^# END GPUAAS MANAGED SSH KEY: /) {
          if (block_matches_key != 1) {
            for (i = 1; i <= block_count; i++) {
              print block[i]
            }
          }
          for (i = 1; i <= block_count; i++) {
            delete block[i]
          }
          in_block = 0
          block_count = 0
          block_matches_key = 0
        }
        next
      }
      { print }
      END {
        if (in_block == 1 && block_matches_key != 1) {
          for (i = 1; i <= block_count; i++) {
            print block[i]
          }
        }
      }
    ' "$tmp_authorized_keys" > "${tmp_authorized_keys}.dedup"
    mv "${tmp_authorized_keys}.dedup" "$tmp_authorized_keys"
    {
      cat "$tmp_authorized_keys"
      printf '%%s\n' "$managed_start"
      printf '%%s\n' "$public_key"
      printf '%%s\n' "$managed_end"
    } > "$authorized_keys_path"
    ;;
  absent)
    cat "$tmp_authorized_keys" > "$authorized_keys_path"
    ;;
  *)
    echo "invalid desired_state" >&2
    exit 1
    ;;
esac

rm -f "$tmp_authorized_keys"
chown "$username:$primary_group" "$authorized_keys_path"
chmod 600 "$authorized_keys_path"
`, shellQuote(in.Username), shellQuote(in.ManagedKeyID), shellQuote(in.DesiredState), shellQuote(publicKey))

	cmd := nodeUserOpsCommandRunner(script)
	out, err := cmd.CombinedOutput()
	if err != nil {
		msg := strings.TrimSpace(string(out))
		if msg == "" {
			msg = err.Error()
		}
		return nil, fmt.Errorf("apply managed authorized key operations: %s", msg)
	}
	return map[string]any{
		"applied":        true,
		"verified":       true,
		"username":       in.Username,
		"managed_key_id": in.ManagedKeyID,
		"desired_state":  in.DesiredState,
	}, nil
}

// nodeUserOpsCommand builds the command that runs script in a bash login
// shell ("bash -lc"), optionally through non-interactive sudo ("sudo -n").
//
// The GPUAAS_NODE_USER_OPS_SUDO environment variable (trimmed,
// case-insensitive) selects the mode: "never" runs bash directly, "always"
// goes through sudo, and any other value uses sudo only when the effective
// UID is not 0.
func nodeUserOpsCommand(script string) *exec.Cmd {
	mode := strings.ToLower(strings.TrimSpace(os.Getenv("GPUAAS_NODE_USER_OPS_SUDO")))

	var viaSudo bool
	switch mode {
	case "never":
		viaSudo = false
	case "always":
		viaSudo = true
	default: // auto
		viaSudo = os.Geteuid() != 0
	}

	if viaSudo {
		return exec.Command("sudo", "-n", "bash", "-lc", script)
	}
	return exec.Command("bash", "-lc", script)
}

// allocationUserSudoersPath returns the sudoers drop-in path used for the
// given allocation user: a fixed prefix under /etc/sudoers.d followed by the
// username.
func allocationUserSudoersPath(username string) string {
	const dropInPrefix = "/etc/sudoers.d/80-gpuaas-allocation-"
	// Both operands are strings, so Sprint joins them without a separator.
	return fmt.Sprint(dropInPrefix, username)
}
