package main

import (
	"bytes"
	"context"
	"crypto/tls"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"net"
	"net/http"
	"os"
	"os/exec"
	"os/signal"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/google/uuid"
	"github.com/gpuaas/platform/packages/shared/controllerretry"
	"golang.org/x/crypto/ssh"
)

// Package-level indirections and timing tunables.
//
// runRemoteSudoCaptureFunc and runBootstrapScriptFunc alias runRemoteSudoCapture
// and runBootstrapScript; the reconcile code in this file invokes remote
// commands and bootstrap scripts through these variables rather than calling
// the functions directly (presumably so tests can swap in fakes — confirm
// against the test files).
//
// bootstrapRuntimeProbeTimeout and bootstrapRuntimeProbeInterval bound and pace
// the polling loop in waitForServerRuntimeDetail. The bootstrapSSHTrust* values
// are consumed outside this part of the file; by name they pace
// waitForBootstrapSSHTrust — TODO confirm.
var (
	runRemoteSudoCaptureFunc      = runRemoteSudoCapture
	runBootstrapScriptFunc        = runBootstrapScript
	bootstrapSSHTrustWaitTimeout  = 2 * time.Minute
	bootstrapSSHTrustPollInterval = 5 * time.Second
	bootstrapRuntimeProbeTimeout  = 5 * time.Minute
	bootstrapRuntimeProbeInterval = 10 * time.Second
)

// config holds the controller settings that loadConfig reads from the
// RKE2_CONTROLLER_* environment variables.
//
// Authentication uses either BearerToken or the ServiceAccountID /
// ServiceAccountKeyID / ServiceAccountSecret triple; when BearerToken is empty
// reconcileWithTokenCache mints a token and stores it in its own copy of the
// struct. AppInstanceID selects a single instance; when it is empty the
// instances returned by listAppInstances are reconciled one by one.
// AccessCredentialID and SSHPrivateKeyPath are overwritten per instance by
// reconcileInstance with the resolved values. A ReconcileInterval <= 0 makes
// main run a single pass instead of looping. HTTPHostOverride and
// TLSServerName are consumed outside this part of the file (presumably by
// buildHTTPClient — confirm).
type config struct {
	APIBaseURL           string
	BearerToken          string
	ServiceAccountID     string
	ServiceAccountKeyID  string
	ServiceAccountSecret string
	ProjectID            string
	AppInstanceID        string
	AppSlug              string
	AccessCredentialID   string
	SSHPrivateKeyPath    string
	ServerBootstrapPath  string
	AgentBootstrapPath   string
	HTTPHostOverride     string
	TLSServerName        string
	ReconcileInterval    time.Duration
}

// appInstance carries the JSON fields of an app instance record (see the json
// tags for the wire names).
//
// Within this file reconcileInstance requires RuntimeBackend to be "rke2" and
// uses Status to decide whether and how to act, AppVersion is normalized into
// the RKE2 version handed to the bootstrap scripts, and RuntimeState is
// compared against freshly observed values by rke2ServerRunningAlreadyReported.
type appInstance struct {
	ID                     string         `json:"id"`
	ResourceName           string         `json:"resource_name"`
	ProjectID              string         `json:"project_id"`
	AppSlug                string         `json:"app_slug"`
	AppVersion             string         `json:"app_version"`
	DisplayName            string         `json:"display_name"`
	RuntimeBackend         string         `json:"runtime_backend"`
	Status                 string         `json:"status"`
	OperatorServiceAccount *string        `json:"operator_service_account_id"`
	PlacementIntent        map[string]any `json:"placement_intent"`
	RuntimeState           map[string]any `json:"runtime_state"`
}

// appInstanceListResponse is the JSON envelope for a list of app instances;
// the records sit under the "items" key.
type appInstanceListResponse struct {
	Items []appInstance `json:"items"`
}

// member carries the JSON fields of one app instance member (see the json
// tags for the wire names).
//
// In this file AdapterDetail is read for keys such as "allocation_id", "role"
// and "phase" and is cloned as the base of every status report; Status "ready"
// makes reconcileServer try a refresh before re-bootstrapping; and
// BoundNodeResourceName supplies the RKE2 node name for the server bootstrap.
// The pointer fields are nil when the corresponding JSON value is null or
// absent.
type member struct {
	ID                    string         `json:"id"`
	AppInstanceID         string         `json:"app_instance_id"`
	ComponentKey          string         `json:"component_key"`
	Status                string         `json:"status"`
	BoundNodeID           *string        `json:"bound_node_id"`
	BoundNodeResourceName *string        `json:"bound_node_resource_name"`
	AdapterDetail         map[string]any `json:"adapter_detail"`
}

// memberListResponse is the JSON envelope for a list of members; the records
// sit under the "items" key.
type memberListResponse struct {
	Items []member `json:"items"`
}

// memberOperation carries the JSON fields of a requested member operation.
//
// reconcileAgentOperation only acts on operations whose ComponentKey is
// "agent" and whose Status is "accepted" or "in_progress", dispatching on
// Action ("add", "drain" or "remove"). TargetMemberID, when set, identifies the
// member the operation applies to; reconcileAgentAddOperation generates a new
// UUID when it is empty. AllocationIntent is read for its "allocation_id" key.
type memberOperation struct {
	ID               string         `json:"id"`
	AppInstanceID    string         `json:"app_instance_id"`
	Action           string         `json:"action"`
	ComponentKey     *string        `json:"component_key"`
	TargetMemberID   *string        `json:"target_member_id"`
	AllocationIntent map[string]any `json:"allocation_intent"`
	Status           string         `json:"status"`
}

// operationListResponse is the JSON envelope for a list of member operations;
// the records sit under the "items" key.
type operationListResponse struct {
	Items []memberOperation `json:"items"`
}

// allocation carries the JSON fields of an allocation record together with its
// nested "connection" object (host, hostname, port and node username). Every
// connection field is a pointer and is nil when the JSON value is null or
// absent.
//
// NOTE(review): no code in this part of the file reads this type; presumably
// it is what memberSSHTarget uses to resolve an SSH host and user — confirm.
type allocation struct {
	ID         string  `json:"id"`
	NodeID     *string `json:"node_id"`
	Connection struct {
		Host           *string `json:"host"`
		Hostname       *string `json:"hostname"`
		Port           *int    `json:"port"`
		UsernameOnNode *string `json:"username_on_node"`
	} `json:"connection"`
}

// projectAccessCredentialListResponse is the JSON envelope for a list of
// project access credentials; the records sit under the "items" key.
type projectAccessCredentialListResponse struct {
	Items []projectAccessCredential `json:"items"`
}

// projectAccessCredential carries the JSON fields of a project access
// credential, including the list of resource bindings attached to it.
// ResourceType and ResourceID are nil when the JSON value is null or absent.
//
// NOTE(review): the code that selects a credential lives outside this part of
// the file (presumably resolveAccessCredentialID — confirm).
type projectAccessCredential struct {
	ID             string                           `json:"id"`
	ResourceType   *string                          `json:"resource_type"`
	ResourceID     *string                          `json:"resource_id"`
	Bindings       []projectAccessCredentialBinding `json:"bindings"`
	CredentialKind string                           `json:"credential_kind"`
	DeliveryMode   string                           `json:"delivery_mode"`
	Status         string                           `json:"status"`
}

// projectAccessCredentialBinding is one entry of a credential's "bindings"
// array: the resource type and ID the credential is bound to, plus a purpose
// string.
type projectAccessCredentialBinding struct {
	ResourceType string `json:"resource_type"`
	ResourceID   string `json:"resource_id"`
	Purpose      string `json:"purpose"`
}

// deliveredAccessCredentialResponse is the JSON shape of a credential delivery
// response: a "credential" summary and a "delivery" object whose wrapped_token
// and unwrap_url are optional (nil when null or absent).
//
// NOTE(review): the consumer is outside this part of the file (presumably
// ensureSSHPrivateKeyPath — confirm).
type deliveredAccessCredentialResponse struct {
	Credential struct {
		ID             string `json:"id"`
		CredentialKind string `json:"credential_kind"`
		DeliveryMode   string `json:"delivery_mode"`
	} `json:"credential"`
	Delivery struct {
		Mode         string  `json:"mode"`
		WrappedToken *string `json:"wrapped_token"`
		UnwrapURL    *string `json:"unwrap_url"`
	} `json:"delivery"`
}

// controllerToken is the JSON shape of a token response: the access token, its
// lifetime in seconds and the token type.
//
// NOTE(review): presumably decoded by the service account token minting
// helpers defined elsewhere in the file — confirm.
type controllerToken struct {
	AccessToken      string `json:"access_token"`
	ExpiresInSeconds int    `json:"expires_in_seconds"`
	TokenType        string `json:"token_type"`
}

// serviceAccountTokenCache remembers the most recently minted service account
// token so that consecutive reconcile passes can reuse it.
//
// Token returns the cached token while now()+refreshBefore is still before
// expiresAt and mints a new one otherwise. now is the clock source; the
// constructor sets it to time.Now. The struct carries no lock, and main uses
// it from a single goroutine.
type serviceAccountTokenCache struct {
	token         string
	expiresAt     time.Time
	refreshBefore time.Duration
	now           func() time.Time
}

// main sets up JSON logging, loads the configuration and then either performs
// a single reconcile pass (ReconcileInterval <= 0) or reconciles in a loop
// until the process receives an interrupt or SIGTERM.
//
// In loop mode a failed pass delays the next one by the exponential backoff
// value; a successful pass resets the backoff and waits ReconcileInterval.
func main() {
	logger := slog.New(slog.NewJSONHandler(os.Stdout, nil)).With("service", "rke2-self-managed-controller")
	slog.SetDefault(logger)

	cfg, err := loadConfig()
	if err != nil {
		logger.Error("config error", "error", err)
		os.Exit(1)
	}

	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	// Upper bound applied to every reconcile pass, one-shot or looped.
	const passTimeout = 10 * time.Minute
	// Attributes that identify what this controller process is reconciling.
	scope := []any{"project_id", cfg.ProjectID, "app_instance_id", cfg.AppInstanceID, "app_slug", cfg.AppSlug}

	if cfg.ReconcileInterval <= 0 {
		passCtx, cancel := context.WithTimeout(ctx, passTimeout)
		defer cancel()
		if err := reconcile(passCtx, logger, cfg); err != nil {
			logger.Error("reconcile failed", "error", err)
			os.Exit(1)
		}
		logger.Info("reconcile complete", scope...)
		return
	}

	startArgs := append([]any{}, scope...)
	startArgs = append(startArgs, "interval", cfg.ReconcileInterval.String())
	logger.Info("starting continuous reconcile loop", startArgs...)

	tokens := newServiceAccountTokenCache()
	backoff := controllerretry.NewExponentialBackoff(cfg.ReconcileInterval, 5*time.Minute)
	for {
		passCtx, cancel := context.WithTimeout(ctx, passTimeout)
		passErr := reconcileWithTokenCache(passCtx, logger, cfg, tokens)
		cancel()

		wait := cfg.ReconcileInterval
		if passErr == nil {
			backoff.Reset()
			logger.Info("reconcile iteration complete", scope...)
		} else {
			wait = backoff.RecordFailure()
			failureArgs := append([]any{"error", passErr}, scope...)
			failureArgs = append(failureArgs, "retry_failures", backoff.Failures(), "next_retry_delay", wait.String())
			logger.Error("reconcile iteration failed", failureArgs...)
		}

		select {
		case <-time.After(wait):
		case <-ctx.Done():
			logger.Info("controller exiting", "reason", ctx.Err())
			return
		}
	}
}

// loadConfig builds the controller configuration from RKE2_CONTROLLER_*
// environment variables and validates it.
//
// It returns an error when neither a bearer token nor service account
// credentials are supplied, when the service account triple is only partly
// set, when the project ID is missing, when neither an app instance ID nor an
// app slug is available, or when the reconcile interval is not a non-negative
// whole number of seconds. An unset or zero interval leaves ReconcileInterval
// at 0, which main treats as "run once".
func loadConfig() (config, error) {
	cfg := config{
		APIBaseURL:           strings.TrimRight(strings.TrimSpace(getenvDefault("RKE2_CONTROLLER_API_BASE_URL", "https://api.gpuaas.localhost")), "/"),
		BearerToken:          strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_BEARER_TOKEN")),
		ServiceAccountID:     strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_SERVICE_ACCOUNT_ID")),
		ServiceAccountKeyID:  strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_SERVICE_ACCOUNT_KEY_ID")),
		ServiceAccountSecret: strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_SERVICE_ACCOUNT_SECRET")),
		ProjectID:            strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_PROJECT_ID")),
		AppInstanceID:        strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_APP_INSTANCE_ID")),
		AppSlug:              strings.TrimSpace(getenvDefault("RKE2_CONTROLLER_APP_SLUG", "rke2-self-managed")),
		AccessCredentialID:   strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_ACCESS_CREDENTIAL_ID")),
		SSHPrivateKeyPath:    strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_SSH_PRIVATE_KEY_PATH")),
		ServerBootstrapPath:  strings.TrimSpace(getenvDefault("RKE2_CONTROLLER_SERVER_BOOTSTRAP_SCRIPT_PATH", "/opt/rke2-self-managed/bootstrap/bootstrap-server.sh")),
		AgentBootstrapPath:   strings.TrimSpace(getenvDefault("RKE2_CONTROLLER_AGENT_BOOTSTRAP_SCRIPT_PATH", "/opt/rke2-self-managed/bootstrap/bootstrap-agent.sh")),
		HTTPHostOverride:     strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_HTTP_HOST")),
		TLSServerName:        strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_TLS_SERVER_NAME")),
	}
	hasBearerToken := cfg.BearerToken != ""
	hasServiceAccountCredential := cfg.ServiceAccountID != "" || cfg.ServiceAccountKeyID != "" || cfg.ServiceAccountSecret != ""
	if !hasBearerToken && !hasServiceAccountCredential {
		return config{}, errors.New("either RKE2_CONTROLLER_BEARER_TOKEN or service account credential env vars are required")
	}
	if hasServiceAccountCredential && (cfg.ServiceAccountID == "" || cfg.ServiceAccountKeyID == "" || cfg.ServiceAccountSecret == "") {
		return config{}, errors.New("RKE2_CONTROLLER_SERVICE_ACCOUNT_ID, RKE2_CONTROLLER_SERVICE_ACCOUNT_KEY_ID, and RKE2_CONTROLLER_SERVICE_ACCOUNT_SECRET must all be set together")
	}
	if cfg.ProjectID == "" {
		return config{}, errors.New("RKE2_CONTROLLER_PROJECT_ID is required")
	}
	if cfg.AppInstanceID == "" && cfg.AppSlug == "" {
		return config{}, errors.New("either RKE2_CONTROLLER_APP_INSTANCE_ID or RKE2_CONTROLLER_APP_SLUG is required")
	}
	if intervalRaw := strings.TrimSpace(os.Getenv("RKE2_CONTROLLER_RECONCILE_INTERVAL_SECONDS")); intervalRaw != "" {
		// Parse as a plain integer. Appending "s" and using time.ParseDuration
		// would read "5m" as 5ms and would accept fractions and negative
		// values, none of which match the documented contract.
		const maxIntervalSeconds = int64(1<<63-1) / int64(time.Second)
		seconds, err := strconv.ParseInt(intervalRaw, 10, 64)
		if err != nil || seconds < 0 || seconds > maxIntervalSeconds {
			return config{}, errors.New("RKE2_CONTROLLER_RECONCILE_INTERVAL_SECONDS must be a non-negative integer")
		}
		cfg.ReconcileInterval = time.Duration(seconds) * time.Second
	}
	return cfg, nil
}

// reconcile performs one reconcile pass using a brand-new token cache, so a
// service account token is minted whenever no bearer token is configured.
func reconcile(ctx context.Context, log *slog.Logger, cfg config) error {
	freshCache := newServiceAccountTokenCache()
	return reconcileWithTokenCache(ctx, log, cfg, freshCache)
}

// reconcileWithTokenCache performs one reconcile pass.
//
// When no bearer token is configured it obtains one from tokenCache and uses
// it for this pass only (cfg is a copy). With AppInstanceID set it reconciles
// just that instance; otherwise it reconciles every instance returned by
// listAppInstances, keeps going after individual failures, and returns them
// joined into a single error (nil when all succeeded).
func reconcileWithTokenCache(ctx context.Context, log *slog.Logger, cfg config, tokenCache *serviceAccountTokenCache) error {
	httpClient := buildHTTPClient(cfg)

	if cfg.BearerToken == "" {
		bearer, freshlyMinted, err := tokenCache.Token(ctx, httpClient, cfg)
		if err != nil {
			return err
		}
		cfg.BearerToken = bearer
		if freshlyMinted {
			log.Info("minted service account token", "service_account_id", cfg.ServiceAccountID)
		}
	}

	if cfg.AppInstanceID != "" {
		return reconcileInstance(ctx, log, httpClient, cfg)
	}

	instances, err := listAppInstances(ctx, httpClient, cfg)
	if err != nil {
		return err
	}

	var failures []error
	for _, inst := range instances {
		perInstance := cfg
		perInstance.AppInstanceID = inst.ID
		instErr := reconcileInstance(ctx, log, httpClient, perInstance)
		if instErr == nil {
			continue
		}
		log.Error("app instance reconcile failed; continuing with remaining rke2 instances", "error", instErr, "app_instance_id", inst.ID, "app_slug", inst.AppSlug)
		failures = append(failures, fmt.Errorf("%s: %w", inst.ID, instErr))
	}
	return errors.Join(failures...)
}

// newServiceAccountTokenCache returns an empty cache that reads the wall clock
// through time.Now and treats a token as stale once it is within one minute of
// its expiry.
func newServiceAccountTokenCache() *serviceAccountTokenCache {
	cache := new(serviceAccountTokenCache)
	cache.now = time.Now
	cache.refreshBefore = time.Minute
	return cache
}

// Token returns a service account bearer token, reusing the cached one while
// it is not within refreshBefore of its expiry.
//
// The boolean result reports whether a token was minted by this call. A nil
// receiver is allowed and always mints. When the mint response carries no
// positive lifetime, 900 seconds is assumed.
func (c *serviceAccountTokenCache) Token(ctx context.Context, client *http.Client, cfg config) (string, bool, error) {
	if c == nil {
		minted, err := mintServiceAccountToken(ctx, client, cfg)
		return minted, true, err
	}

	current := c.now().UTC()
	haveToken := strings.TrimSpace(c.token) != ""
	if haveToken && c.expiresAt.After(current.Add(c.refreshBefore)) {
		return c.token, false, nil
	}

	minted, expiresIn, err := mintServiceAccountTokenWithExpiry(ctx, client, cfg)
	if err != nil {
		return "", false, err
	}
	lifetime := 900 * time.Second
	if expiresIn > 0 {
		lifetime = time.Duration(expiresIn) * time.Second
	}
	c.token = minted
	c.expiresAt = current.Add(lifetime)
	return c.token, true, nil
}

// reconcileInstance reconciles the single app instance named by
// cfg.AppInstanceID.
//
// It rejects instances whose runtime backend is not "rke2" and skips (with an
// info log) any status outside the reconcilable set. It then resolves the
// access credential and SSH key, picks the server member, and either runs the
// stop/start/restart lifecycle handling or reconciles the server followed by
// every listed member operation. The first failing operation aborts the pass.
func reconcileInstance(ctx context.Context, log *slog.Logger, client *http.Client, cfg config) error {
	instance, err := getAppInstance(ctx, client, cfg)
	if err != nil {
		return err
	}
	if backend := strings.TrimSpace(instance.RuntimeBackend); backend != "rke2" {
		return fmt.Errorf("app instance %s is not rke2-backed", instance.ID)
	}

	reconcilable := false
	switch instance.Status {
	case "requested", "deploying", "running", "stopping", "starting", "restarting", "upgrading", "rolling_back":
		reconcilable = true
	}
	if !reconcilable {
		log.Info("instance not reconciled in current state", "app_instance_id", instance.ID, "status", instance.Status)
		return nil
	}

	// Both values are per-instance, so they are written into this call's copy
	// of cfg before any helper that needs SSH access runs.
	credentialID, err := resolveAccessCredentialID(ctx, client, cfg, instance)
	if err != nil {
		return err
	}
	cfg.AccessCredentialID = credentialID
	privateKeyPath, err := ensureSSHPrivateKeyPath(ctx, client, cfg)
	if err != nil {
		return err
	}
	cfg.SSHPrivateKeyPath = privateKeyPath

	members, err := listMembers(ctx, client, cfg)
	if err != nil {
		return err
	}
	server, err := pickServer(members)
	if err != nil {
		return err
	}

	if isRKE2InstanceLifecycleStatus(instance.Status) {
		return reconcileRKE2InstanceLifecycle(ctx, log, client, cfg, instance, server)
	}

	serverDetail, err := reconcileServer(ctx, log, client, cfg, instance, server)
	if err != nil {
		return err
	}

	operations, err := listMemberOperations(ctx, client, cfg)
	if err != nil {
		return err
	}
	for _, op := range operations {
		opErr := reconcileAgentOperation(ctx, log, client, cfg, instance, members, server, serverDetail, op)
		if opErr != nil {
			return opErr
		}
	}
	return nil
}

// reconcileServer brings the instance's RKE2 server member to a reported
// running state and returns the runtime detail observed on it.
//
// Flow:
//  1. Require an allocation_id in the member's adapter detail, resolve the SSH
//     target and ensure the bootstrap SSH trust is present.
//  2. If the member is already "ready", ensure the Headlamp endpoint and try to
//     read the runtime detail; on success report running and return.
//  3. Otherwise wait for SSH trust, report the member as "reconciling" and run
//     the server bootstrap script.
//  4. If the script fails, poll the runtime before giving up: a reachable
//     runtime is reported as running, an unreachable one marks member and
//     instance as failed and returns the bootstrap error.
//  5. After a clean bootstrap, read the runtime detail and report the member
//     "ready" and the instance "running".
//
// Status reports made while already on a failure path are best effort: a
// report error is logged and the original failure is what gets returned.
func reconcileServer(ctx context.Context, log *slog.Logger, client *http.Client, cfg config, instance *appInstance, server *member) (*serverRuntimeDetail, error) {
	allocationID := strings.TrimSpace(stringAny(server.AdapterDetail["allocation_id"]))
	if allocationID == "" {
		return nil, errors.New("server member missing allocation_id")
	}
	host, user, err := memberSSHTarget(ctx, client, cfg, server)
	if err != nil {
		return nil, err
	}
	if err := reconcileBootstrapSSHTrust(ctx, client, cfg, allocationID, user, cfg.SSHPrivateKeyPath, "present"); err != nil {
		return nil, err
	}
	if server.Status == "ready" {
		if err := ensureServerHeadlampEndpoint(ctx, cfg, host, user); err != nil {
			return nil, err
		}
		detail, err := fetchServerRuntimeDetail(ctx, cfg, host, user)
		if err == nil {
			if err := reportRKE2ServerRunning(ctx, client, cfg, instance, server, host, user, detail, "reconcile"); err != nil {
				return nil, err
			}
			return detail, nil
		}
		// A ready member whose runtime cannot be read falls through to a
		// fresh bootstrap below.
		log.Warn("refreshing server runtime detail from bootstrap", "app_instance_id", instance.ID, "error", err)
	}
	if err := waitForBootstrapSSHTrust(ctx, cfg.SSHPrivateKeyPath, user, host); err != nil {
		return nil, err
	}

	log.Info("bootstrapping rke2 server", "app_instance_id", instance.ID, "member_id", server.ID, "host", host, "user", user)
	inProgressDetail := cloneMap(server.AdapterDetail)
	inProgressDetail["role"] = "server"
	inProgressDetail["phase"] = "bootstrap_in_progress"
	inProgressDetail["allocation_id"] = allocationID
	inProgressDetail["host"] = host
	inProgressDetail["username_on_node"] = user
	inProgressDetail["last_runtime_observed_at"] = time.Now().UTC().Format(time.RFC3339)
	if err := reportMemberStatus(ctx, client, cfg, server, "reconciling", inProgressDetail); err != nil {
		return nil, err
	}

	clusterName := clusterNameForInstance(instance)
	if err := runBootstrapScriptFunc(ctx, cfg.ServerBootstrapPath, cfg.SSHPrivateKeyPath, user, host, map[string]string{
		"GPUAAS_RKE2_CLUSTER_NAME": clusterName,
		"GPUAAS_RKE2_SERVER_HOST":  host,
		"GPUAAS_RKE2_NODE_NAME":    strings.TrimSpace(stringValue(server.BoundNodeResourceName)),
		"GPUAAS_RKE2_VERSION":      normalizeRKE2InstallVersion(instance.AppVersion),
	}); err != nil {
		bootstrapErr := err
		log.Warn("rke2 bootstrap command returned an error; probing runtime before failing instance", "app_instance_id", instance.ID, "error", bootstrapErr)
		detail, probeErr := waitForServerRuntimeDetail(ctx, cfg, host, user)
		if probeErr == nil {
			// The script reported an error but the runtime is reachable, so
			// the server is treated as running.
			if err := reportRKE2ServerRunning(ctx, client, cfg, instance, server, host, user, detail, "reconcile"); err != nil {
				return nil, err
			}
			return detail, nil
		}
		message := bootstrapErr.Error()
		if strings.TrimSpace(probeErr.Error()) != "" {
			message = fmt.Sprintf("%s; runtime probe after bootstrap error: %s", message, probeErr.Error())
		}
		failedDetail := cloneMap(inProgressDetail)
		failedDetail["phase"] = "server_bootstrap_failed"
		failedDetail["last_adapter_error"] = message
		// Best effort: the bootstrap error is the one returned, but a failed
		// report is logged rather than silently dropped.
		if reportErr := reportMemberStatus(ctx, client, cfg, server, "failed", failedDetail); reportErr != nil {
			log.Warn("could not report server bootstrap failure on member", "app_instance_id", instance.ID, "member_id", server.ID, "error", reportErr)
		}
		if reportErr := reportInstanceStatus(ctx, client, cfg, instance, "failed", &message, map[string]any{
			"adapter_phase": "server_bootstrap_failed",
			"phase":         "failed",
		}); reportErr != nil {
			log.Warn("could not report server bootstrap failure on instance", "app_instance_id", instance.ID, "error", reportErr)
		}
		return nil, bootstrapErr
	}

	detail, err := fetchServerRuntimeDetail(ctx, cfg, host, user)
	if err != nil {
		message := err.Error()
		// Best effort, as above: log a failed report and return the access error.
		if reportErr := reportInstanceStatus(ctx, client, cfg, instance, "failed", &message, map[string]any{
			"adapter_phase": "server_access_failed",
			"phase":         "failed",
		}); reportErr != nil {
			log.Warn("could not report server access failure on instance", "app_instance_id", instance.ID, "error", reportErr)
		}
		return nil, err
	}
	memberDetail := cloneMap(inProgressDetail)
	memberDetail["phase"] = "rke2_server_ready"
	memberDetail["cluster_name"] = detail.ClusterName
	memberDetail["server_endpoint"] = detail.ServerEndpoint
	memberDetail["kubeconfig_path"] = "/etc/rancher/rke2/rke2.yaml"
	memberDetail["last_runtime_observed_at"] = time.Now().UTC().Format(time.RFC3339)
	if err := reportMemberStatus(ctx, client, cfg, server, "ready", memberDetail); err != nil {
		return nil, err
	}
	runtimeState := buildRKE2RunningRuntimeState(detail, time.Now().UTC().Format(time.RFC3339))
	if err := reportInstanceStatus(ctx, client, cfg, instance, "running", nil, runtimeState); err != nil {
		return nil, err
	}
	return detail, nil
}

// waitForServerRuntimeDetail polls fetchServerRuntimeDetail until it succeeds.
//
// The first probe happens immediately; later probes follow every
// bootstrapRuntimeProbeInterval. It returns ctx.Err() when ctx is done, and
// after bootstrapRuntimeProbeTimeout it returns a timeout error that wraps the
// most recent probe error, so callers can tell a timeout from a single failed
// probe and still inspect the cause with errors.Is / errors.As.
func waitForServerRuntimeDetail(ctx context.Context, cfg config, host, user string) (*serverRuntimeDetail, error) {
	deadline := time.NewTimer(bootstrapRuntimeProbeTimeout)
	defer deadline.Stop()
	tick := time.NewTicker(bootstrapRuntimeProbeInterval)
	defer tick.Stop()
	for {
		detail, err := fetchServerRuntimeDetail(ctx, cfg, host, user)
		if err == nil {
			return detail, nil
		}
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-deadline.C:
			// err is always non-nil here: it is the failure of the probe
			// that ran just before this select.
			return nil, fmt.Errorf("timed out after %s waiting for rke2 runtime detail: %w", bootstrapRuntimeProbeTimeout, err)
		case <-tick.C:
		}
	}
}

// isRKE2InstanceLifecycleStatus reports whether status, ignoring surrounding
// whitespace, is one of the lifecycle transitions "stopping", "starting" or
// "restarting". The comparison is case-sensitive.
func isRKE2InstanceLifecycleStatus(status string) bool {
	normalized := strings.TrimSpace(status)
	return normalized == "stopping" || normalized == "starting" || normalized == "restarting"
}

// reconcileRKE2InstanceLifecycle handles instances whose status is "stopping",
// "starting" or "restarting" by acting on the rke2-server systemd unit of the
// server member over SSH.
//
// Failures of the remote work are not returned directly: they are passed to
// reportRKE2LifecycleFailure, which marks the instance "failed", and this
// function returns whatever that report returns (nil when the report itself
// succeeded). Any other status is a no-op.
func reconcileRKE2InstanceLifecycle(ctx context.Context, log *slog.Logger, client *http.Client, cfg config, instance *appInstance, server *member) error {
	allocationID := strings.TrimSpace(stringAny(server.AdapterDetail["allocation_id"]))
	host, user, err := memberSSHTarget(ctx, client, cfg, server)
	if err != nil {
		return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "resolve_runtime_target", err)
	}
	if err := reconcileBootstrapSSHTrust(ctx, client, cfg, allocationID, user, cfg.SSHPrivateKeyPath, "present"); err != nil {
		return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "ssh_trust", err)
	}

	switch strings.TrimSpace(instance.Status) {
	case "stopping":
		log.Info("stopping rke2 app instance", "app_instance_id", instance.ID, "host", host)
		// The "|| true" suffixes keep the remote command from failing when the
		// unit is already stopped; the separate inactive check below is what
		// actually confirms the stop.
		if _, err := runRemoteSudoCaptureFunc(ctx, cfg.SSHPrivateKeyPath, user, host, "systemctl stop rke2-server || true; systemctl reset-failed rke2-server || true"); err != nil {
			return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "stop", err)
		}
		if err := verifyRemoteSystemdInactive(ctx, cfg.SSHPrivateKeyPath, user, host, "rke2-server"); err != nil {
			return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "stop", err)
		}
		now := time.Now().UTC().Format(time.RFC3339)
		detail := cloneMap(server.AdapterDetail)
		detail["role"] = "server"
		detail["phase"] = "rke2_server_stopped"
		detail["rke2_server_state"] = "stopped"
		detail["host"] = host
		detail["username_on_node"] = user
		detail["last_runtime_observed_at"] = now
		// The member keeps status "ready"; the stopped state is carried in the
		// adapter detail and in the instance status reported next.
		if err := reportMemberStatus(ctx, client, cfg, server, "ready", detail); err != nil {
			return err
		}
		return reportInstanceStatus(ctx, client, cfg, instance, "stopped", nil, map[string]any{
			"adapter_phase":            "rke2_server_stopped",
			"phase":                    "stopped",
			"last_operation":           "stop",
			"last_runtime_observed_at": now,
			"access": map[string]any{
				"status": "unavailable",
				"detail": "RKE2 server service is stopped on the control-plane allocation.",
			},
		})
	case "starting":
		log.Info("starting rke2 app instance", "app_instance_id", instance.ID, "host", host)
		// Start the unit, then poll up to 60 times at 5-second intervals until
		// it is active and the kubeconfig file is non-empty; otherwise print
		// the unit status and exit non-zero.
		command := `systemctl start rke2-server
for i in $(seq 1 60); do
  if systemctl is-active --quiet rke2-server && [ -s /etc/rancher/rke2/rke2.yaml ]; then
    exit 0
  fi
  sleep 5
done
systemctl status rke2-server --no-pager
exit 1`
		if _, err := runRemoteSudoCaptureFunc(ctx, cfg.SSHPrivateKeyPath, user, host, command); err != nil {
			return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "start", err)
		}
		detail, err := fetchServerRuntimeDetail(ctx, cfg, host, user)
		if err != nil {
			return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "start_access", err)
		}
		return reportRKE2ServerRunning(ctx, client, cfg, instance, server, host, user, detail, "start")
	case "restarting":
		// Restart is not implemented; the instance is marked failed with an
		// explanatory message.
		err := errors.New("rke2 restart is not supported yet; use stop/start or the planned reconcile/repair action")
		return reportRKE2LifecycleFailure(ctx, client, cfg, instance, "restart_unsupported", err)
	default:
		return nil
	}
}

// reportRKE2ServerRunning reports the server member as "ready" and the
// instance as "running", using detail for the cluster name, endpoint and
// access data and tagging the runtime state with operation.
//
// For operation "reconcile" nothing is sent when
// rke2ServerRunningAlreadyReported finds that the stored state already
// matches what would be reported.
func reportRKE2ServerRunning(ctx context.Context, client *http.Client, cfg config, instance *appInstance, server *member, host, user string, detail *serverRuntimeDetail, operation string) error {
	if operation == "reconcile" && rke2ServerRunningAlreadyReported(instance, server, host, user, detail) {
		return nil
	}

	// A single timestamp is shared by the member and instance reports.
	observedAt := time.Now().UTC().Format(time.RFC3339)

	readyDetail := cloneMap(server.AdapterDetail)
	for key, value := range map[string]string{
		"role":                     "server",
		"phase":                    "rke2_server_ready",
		"cluster_name":             detail.ClusterName,
		"server_endpoint":          detail.ServerEndpoint,
		"kubeconfig_path":          "/etc/rancher/rke2/rke2.yaml",
		"host":                     host,
		"username_on_node":         user,
		"last_runtime_observed_at": observedAt,
	} {
		readyDetail[key] = value
	}
	if err := reportMemberStatus(ctx, client, cfg, server, "ready", readyDetail); err != nil {
		return err
	}

	state := buildRKE2RunningRuntimeState(detail, observedAt)
	state["last_operation"] = operation
	state["last_runtime_observed_at"] = observedAt
	return reportInstanceStatus(ctx, client, cfg, instance, "running", nil, state)
}

// buildRKE2RunningRuntimeState assembles the runtime state reported for a
// running cluster: fixed "cluster_ready"/"ready" phases plus an "access" object
// holding the cluster name, endpoint, kubeconfig, Headlamp token and
// generatedAt. A "workload_access" object with the trimmed token is added only
// when the Headlamp token is non-blank. detail must not be nil.
func buildRKE2RunningRuntimeState(detail *serverRuntimeDetail, generatedAt string) map[string]any {
	state := map[string]any{
		"adapter_phase": "cluster_ready",
		"phase":         "ready",
		"access": map[string]any{
			"status":          "available",
			"cluster_name":    detail.ClusterName,
			"server_endpoint": detail.ServerEndpoint,
			"kubeconfig":      detail.Kubeconfig,
			"headlamp_token":  detail.HeadlampToken,
			"detail":          "Kubeconfig reported by the self-managed RKE2 controller.",
			"generated_at":    generatedAt,
		},
	}

	trimmedToken := strings.TrimSpace(detail.HeadlampToken)
	if trimmedToken == "" {
		return state
	}
	state["workload_access"] = map[string]any{"token": trimmedToken}
	return state
}

// rke2ServerRunningAlreadyReported reports whether the stored instance and
// member state already reflect a running server that matches detail, host and
// user, so that a repeat report can be skipped.
//
// All comparisons ignore surrounding whitespace. The Headlamp token is only
// compared when the freshly observed token is non-blank; the kubeconfig
// contents are not compared. Any nil argument yields false.
func rke2ServerRunningAlreadyReported(instance *appInstance, server *member, host, user string, detail *serverRuntimeDetail) bool {
	if instance == nil || server == nil || detail == nil {
		return false
	}
	if strings.TrimSpace(instance.Status) != "running" || strings.TrimSpace(server.Status) != "ready" {
		return false
	}

	// same reports whether a stored value equals want once both are trimmed.
	same := func(stored any, want string) bool {
		return strings.TrimSpace(stringAny(stored)) == strings.TrimSpace(want)
	}

	adapter := server.AdapterDetail
	memberMatches := same(adapter["role"], "server") &&
		same(adapter["phase"], "rke2_server_ready") &&
		same(adapter["cluster_name"], detail.ClusterName) &&
		same(adapter["server_endpoint"], detail.ServerEndpoint) &&
		same(adapter["kubeconfig_path"], "/etc/rancher/rke2/rke2.yaml") &&
		same(adapter["host"], host) &&
		same(adapter["username_on_node"], user)
	if !memberMatches {
		return false
	}

	state := instance.RuntimeState
	if !same(state["adapter_phase"], "cluster_ready") || !same(state["phase"], "ready") {
		return false
	}
	// A missing or mistyped "access" object leaves a nil map, whose lookups
	// yield nil values and therefore fail the comparisons below.
	access, _ := state["access"].(map[string]any)
	accessMatches := same(access["status"], "available") &&
		same(access["cluster_name"], detail.ClusterName) &&
		same(access["server_endpoint"], detail.ServerEndpoint)
	if !accessMatches {
		return false
	}

	observedToken := strings.TrimSpace(detail.HeadlampToken)
	if observedToken == "" {
		return true
	}
	workloadAccess, _ := state["workload_access"].(map[string]any)
	return same(access["headlamp_token"], observedToken) && same(workloadAccess["token"], observedToken)
}

// reportRKE2LifecycleFailure marks the instance "failed" after a lifecycle
// operation went wrong, recording the operation name and cause's message in
// the runtime state.
//
// The returned error is the outcome of the status report, not cause: it is nil
// when the failure was reported successfully. cause must not be nil.
func reportRKE2LifecycleFailure(ctx context.Context, client *http.Client, cfg config, instance *appInstance, operation string, cause error) error {
	reason := cause.Error()
	failureState := map[string]any{
		"adapter_phase":      "lifecycle_" + operation + "_failed",
		"phase":              "failed",
		"last_operation":     operation,
		"last_adapter_error": reason,
	}
	return reportInstanceStatus(ctx, client, cfg, instance, "failed", &reason, failureState)
}

// reconcileAgentOperation dispatches one member operation to the matching
// agent handler.
//
// Only operations for the "agent" component in status "accepted" or
// "in_progress" are handled; the action selects add, drain or remove. Anything
// else is ignored and returns nil.
func reconcileAgentOperation(ctx context.Context, log *slog.Logger, client *http.Client, cfg config, instance *appInstance, members []member, server *member, serverDetail *serverRuntimeDetail, op memberOperation) error {
	forAgent := strings.TrimSpace(stringValue(op.ComponentKey)) == "agent"
	actionable := op.Status == "accepted" || op.Status == "in_progress"
	if !forAgent || !actionable {
		return nil
	}

	switch action := strings.TrimSpace(op.Action); action {
	case "add":
		return reconcileAgentAddOperation(ctx, log, client, cfg, instance, members, server, serverDetail, op)
	case "drain":
		return reconcileAgentDrainOperation(ctx, log, client, cfg, op, members, server)
	case "remove":
		return reconcileAgentRemoveOperation(ctx, log, client, cfg, op, members, server)
	}
	return nil
}

// reconcileAgentAddOperation joins a new RKE2 agent to the cluster for an
// "add" operation.
//
// It resolves the target node from the operation, refuses to proceed when
// activeAgentOnNode finds another agent on the same bound node, ensures SSH
// trust for the operation's allocation, marks the operation "in_progress",
// reads the node token from the server over SSH and runs the agent bootstrap
// script with the join endpoint, token, node name and version. On success the
// operation is reported "succeeded" with the member "ready".
//
// Failures of the work itself go through reportAgentOperationFailure, whose
// result is what this function returns; only a failure to report progress is
// returned directly. serverDetail must not be nil.
func reconcileAgentAddOperation(ctx context.Context, log *slog.Logger, client *http.Client, cfg config, instance *appInstance, members []member, server *member, serverDetail *serverRuntimeDetail, op memberOperation) error {
	host, user, boundNodeID, detail, err := targetFromAgentOperation(ctx, client, cfg, op)
	if err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "bootstrap", err, detail, boundNodeID)
	}
	// An operation without a target member gets a freshly generated member ID.
	targetMemberID := strings.TrimSpace(stringValue(op.TargetMemberID))
	if targetMemberID == "" {
		targetMemberID = uuid.NewString()
	}
	if existing := activeAgentOnNode(members, boundNodeID, targetMemberID); existing != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "bootstrap", fmt.Errorf("agent already active on bound node %s", strings.TrimSpace(stringValue(boundNodeID))), detail, boundNodeID)
	}
	if err := reconcileBootstrapSSHTrust(ctx, client, cfg, strings.TrimSpace(stringAny(op.AllocationIntent["allocation_id"])), user, cfg.SSHPrivateKeyPath, "present"); err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "bootstrap", err, detail, boundNodeID)
	}
	if err := reportMemberOperation(ctx, client, cfg, op, "in_progress", targetMemberID, ptrString("reconciling"), boundNodeID, detail.toMap(), nil, nil); err != nil {
		return err
	}

	serverHost, serverUser, err := memberSSHTarget(ctx, client, cfg, server)
	if err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "bootstrap", err, detail, boundNodeID)
	}
	// The join token is read from the server node on every add.
	token, err := readRemoteFile(ctx, cfg.SSHPrivateKeyPath, serverUser, serverHost, "/var/lib/rancher/rke2/server/node-token")
	if err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "bootstrap", err, detail, boundNodeID)
	}
	log.Info("bootstrapping rke2 agent", "app_instance_id", instance.ID, "operation_id", op.ID, "target_member_id", targetMemberID, "host", host, "user", user)
	if err := runBootstrapScriptFunc(ctx, cfg.AgentBootstrapPath, cfg.SSHPrivateKeyPath, user, host, map[string]string{
		"GPUAAS_RKE2_SERVER_URL": rke2AgentJoinEndpoint(serverDetail.ServerEndpoint),
		"GPUAAS_RKE2_TOKEN":      strings.TrimSpace(token),
		"GPUAAS_RKE2_NODE_NAME":  strings.TrimSpace(detail.NodeName),
		"GPUAAS_RKE2_VERSION":    normalizeRKE2InstallVersion(instance.AppVersion),
	}); err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "bootstrap", err, detail, boundNodeID)
	}

	detail.Role = "agent"
	detail.Phase = "rke2_agent_ready"
	detail.ClusterName = serverDetail.ClusterName
	detail.ServerEndpoint = serverDetail.ServerEndpoint
	detail.LastObservedAt = time.Now().UTC().Format(time.RFC3339)
	return reportMemberOperation(ctx, client, cfg, op, "succeeded", targetMemberID, ptrString("ready"), boundNodeID, detail.toMap(), nil, nil)
}

// reconcileAgentDrainOperation drains the Kubernetes node of the agent member
// targeted by a "drain" operation.
//
// It marks the operation "in_progress" with the member "draining", runs
// kubectl drain on the server node over SSH, and reports "succeeded" (member
// still "draining") once the command returns. Failures of the work are routed
// through reportAgentOperationFailure, whose result is returned.
func reconcileAgentDrainOperation(ctx context.Context, log *slog.Logger, client *http.Client, cfg config, op memberOperation, members []member, server *member) error {
	target, detail, err := targetAgentMember(op, members)
	if err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "drain", err, memberRuntimeDetail{}, nil)
	}
	boundNodeID := target.BoundNodeID

	// fail reports the drain failure with the detail as it stands at call time.
	fail := func(cause error) error {
		return reportAgentOperationFailure(ctx, client, cfg, op, "drain", cause, detail, boundNodeID)
	}
	stamp := func() string {
		return time.Now().UTC().Format(time.RFC3339)
	}

	detail.Phase = "rke2_agent_drain_requested"
	detail.LastObservedAt = stamp()
	if err := reportMemberOperation(ctx, client, cfg, op, "in_progress", target.ID, ptrString("draining"), boundNodeID, detail.toMap(), nil, nil); err != nil {
		return err
	}

	serverHost, serverUser, err := memberSSHTarget(ctx, client, cfg, server)
	if err != nil {
		return fail(err)
	}

	nodeName := strings.TrimSpace(detail.NodeName)
	log.Info("draining rke2 agent", "operation_id", op.ID, "target_member_id", target.ID, "node_name", nodeName)
	drainCommand := fmt.Sprintf("/var/lib/rancher/rke2/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml drain %s --ignore-daemonsets --delete-emptydir-data --force --timeout=120s", shellQuote(nodeName))
	_, err = runRemoteSudoCaptureFunc(ctx, cfg.SSHPrivateKeyPath, serverUser, serverHost, drainCommand)
	if err != nil {
		return fail(err)
	}

	detail.Phase = "rke2_agent_drained"
	detail.LastObservedAt = stamp()
	return reportMemberOperation(ctx, client, cfg, op, "succeeded", target.ID, ptrString("draining"), boundNodeID, detail.toMap(), nil, nil)
}

// reconcileAgentRemoveOperation removes the agent member targeted by a
// "remove" operation.
//
// After marking the operation "in_progress" (member "deleting") it, in order,
// deletes the Kubernetes node via kubectl on the server, disables the
// rke2-agent unit on the agent host, and sets the bootstrap SSH trust for the
// agent's allocation to "absent". Success is reported as "succeeded" with the
// member "deleted". Failures of the work are routed through
// reportAgentOperationFailure, whose result is returned.
func reconcileAgentRemoveOperation(ctx context.Context, log *slog.Logger, client *http.Client, cfg config, op memberOperation, members []member, server *member) error {
	target, detail, err := targetAgentMember(op, members)
	if err != nil {
		return reportAgentOperationFailure(ctx, client, cfg, op, "remove", err, memberRuntimeDetail{}, nil)
	}
	boundNodeID := target.BoundNodeID

	// fail reports the removal failure with the detail as it stands at call time.
	fail := func(cause error) error {
		return reportAgentOperationFailure(ctx, client, cfg, op, "remove", cause, detail, boundNodeID)
	}
	stamp := func() string {
		return time.Now().UTC().Format(time.RFC3339)
	}

	detail.Phase = "rke2_agent_remove_requested"
	detail.LastObservedAt = stamp()
	if err := reportMemberOperation(ctx, client, cfg, op, "in_progress", target.ID, ptrString("deleting"), boundNodeID, detail.toMap(), nil, nil); err != nil {
		return err
	}

	agentHost, agentUser, err := memberSSHTarget(ctx, client, cfg, target)
	if err != nil {
		return fail(err)
	}
	serverHost, serverUser, err := memberSSHTarget(ctx, client, cfg, server)
	if err != nil {
		return fail(err)
	}

	nodeName := strings.TrimSpace(detail.NodeName)
	log.Info("removing rke2 agent", "operation_id", op.ID, "target_member_id", target.ID, "node_name", nodeName, "host", agentHost, "user", agentUser)

	// Step 1: drop the node object from the cluster (run on the server).
	deleteNode := fmt.Sprintf("/var/lib/rancher/rke2/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml delete node %s --ignore-not-found=true", shellQuote(nodeName))
	_, err = runRemoteSudoCaptureFunc(ctx, cfg.SSHPrivateKeyPath, serverUser, serverHost, deleteNode)
	if err != nil {
		return fail(err)
	}
	// Step 2: stop and disable the agent service on the agent host itself.
	_, err = runRemoteSudoCaptureFunc(ctx, cfg.SSHPrivateKeyPath, agentUser, agentHost, "systemctl disable --now rke2-agent || true; systemctl reset-failed rke2-agent || true")
	if err != nil {
		return fail(err)
	}
	// Step 3: withdraw the bootstrap SSH trust last, since step 2 needs it.
	err = reconcileBootstrapSSHTrust(ctx, client, cfg, detail.AllocationID, agentUser, cfg.SSHPrivateKeyPath, "absent")
	if err != nil {
		return fail(err)
	}

	detail.Phase = "rke2_agent_removed"
	detail.LastObservedAt = stamp()
	return reportMemberOperation(ctx, client, cfg, op, "succeeded", target.ID, ptrString("deleted"), boundNodeID, detail.toMap(), nil, nil)
}

// serverRuntimeDetail holds the values fetchServerRuntimeDetail gathers from an
// RKE2 server node.
type serverRuntimeDetail struct {
	// ClusterName is derived from the server host passed to fetchServerRuntimeDetail.
	ClusterName    string
	// ServerEndpoint is the https URL of the API server on port 6443.
	ServerEndpoint string
	// Kubeconfig is the content of /etc/rancher/rke2/rke2.yaml with loopback
	// endpoints rewritten to ServerEndpoint, terminated by a single newline.
	Kubeconfig     string
	// HeadlampToken is the trimmed Headlamp token; empty when it could not be read.
	HeadlampToken  string
}

// memberRuntimeDetail is the controller's view of one member's runtime state.
// toMap serializes it into the adapter_detail payload sent to the API.
type memberRuntimeDetail struct {
	// Role is serialized as "role" (e.g. "agent").
	Role           string
	// Phase is serialized as "phase" and tracks the current lifecycle step.
	Phase          string
	// AllocationID is serialized as "allocation_id".
	AllocationID   string
	// Host is serialized as "host".
	Host           string
	// ServerHost is not included in the toMap output.
	ServerHost     string
	// UsernameOnNode is serialized as "username_on_node".
	UsernameOnNode string
	// ServerUser is not included in the toMap output.
	ServerUser     string
	// NodeName is serialized under the "hostname" key.
	NodeName       string
	// ClusterName is serialized as "cluster_name".
	ClusterName    string
	// ServerEndpoint is serialized as "server_endpoint".
	ServerEndpoint string
	// LastObservedAt is serialized as "last_runtime_observed_at"; callers in
	// this file set it to an RFC 3339 UTC timestamp.
	LastObservedAt string
}

// toMap converts the detail into the adapter_detail map reported to the API.
// NodeName is emitted as "hostname" and LastObservedAt as
// "last_runtime_observed_at"; ServerHost and ServerUser are not emitted.
// A fresh map is returned on every call, so callers may mutate it.
func (d memberRuntimeDetail) toMap() map[string]any {
	out := make(map[string]any, 9)
	out["role"] = d.Role
	out["phase"] = d.Phase
	out["allocation_id"] = d.AllocationID
	out["host"] = d.Host
	out["username_on_node"] = d.UsernameOnNode
	out["hostname"] = d.NodeName
	out["cluster_name"] = d.ClusterName
	out["server_endpoint"] = d.ServerEndpoint
	out["last_runtime_observed_at"] = d.LastObservedAt
	return out
}

// fetchServerRuntimeDetail reads the admin kubeconfig (and, best effort, the
// Headlamp token) from an RKE2 server over SSH and derives the API endpoint.
//
// host is the SSH target and may carry an SSH port ("host:2222", the form
// sshEndpoint produces). That port is stripped before the Kubernetes API
// endpoint is built, so the endpoint is always https://<host>:6443 rather than
// the malformed https://<host>:2222:6443. IPv6 literals are bracketed.
// Loopback endpoints inside the kubeconfig are rewritten to that endpoint.
//
// An error reading the kubeconfig is returned to the caller. An error reading
// the Headlamp token is deliberately ignored and yields an empty HeadlampToken.
func fetchServerRuntimeDetail(ctx context.Context, cfg config, host, user string) (*serverRuntimeDetail, error) {
	kubeconfig, err := readRemoteFile(ctx, cfg.SSHPrivateKeyPath, user, host, "/etc/rancher/rke2/rke2.yaml")
	if err != nil {
		return nil, err
	}

	// The SSH port (if any) is not part of the API server address.
	apiHost := strings.TrimSpace(host)
	if hostOnly, _, splitErr := net.SplitHostPort(apiHost); splitErr == nil && hostOnly != "" {
		apiHost = hostOnly
	}
	// Drop brackets from a bare "[v6]" literal so JoinHostPort can re-add them once.
	apiHost = strings.TrimSuffix(strings.TrimPrefix(apiHost, "["), "]")
	endpoint := "https://" + net.JoinHostPort(apiHost, "6443")

	kubeconfig = strings.ReplaceAll(kubeconfig, "https://127.0.0.1:6443", endpoint)
	kubeconfig = strings.ReplaceAll(kubeconfig, "https://localhost:6443", endpoint)

	// SSH calls keep using the original host so a non-default SSH port still applies.
	headlampToken, err := readHeadlampToken(ctx, cfg.SSHPrivateKeyPath, user, host)
	if err != nil {
		// Best effort: the token may not exist yet; report the rest anyway.
		headlampToken = ""
	}
	return &serverRuntimeDetail{
		ClusterName:    apiHost,
		ServerEndpoint: endpoint,
		Kubeconfig:     strings.TrimSpace(kubeconfig) + "\n",
		HeadlampToken:  strings.TrimSpace(headlampToken),
	}, nil
}

// ensureServerHeadlampEndpoint runs a shell script as root on the server node
// (through runRemoteSudoCaptureFunc) that makes Headlamp reachable on TCP 4466.
//
// When nothing is listening on :4466 the script:
//   - exits 42 if free space on / is below GPUAAS_RKE2_MIN_HEADLAMP_ROOT_FREE_KB
//     (default 2097152 KB) or the node reports DiskPressure=True;
//   - applies the Headlamp manifest, creates the gpuaas-headlamp-admin service
//     account, a cluster-admin ClusterRoleBinding and a service-account-token
//     Secret, then waits up to 180s for deploy/headlamp to roll out;
//   - writes and enables the gpuaas-headlamp-portforward systemd unit, which
//     runs kubectl port-forward from 0.0.0.0:4466 to service/headlamp port 80.
//
// The service account, binding and Secret are (re)applied on every call, even
// when the listener already exists. Finally the script polls for the :4466
// listener up to 30 times, 2s apart, and exits 1 if it never appears.
//
// The returned error is whatever runRemoteSudoCaptureFunc reports; the script's
// stdout is discarded.
//
// NOTE(review): the Headlamp manifest is fetched from the upstream "main"
// branch at run time, so the deployed version is not pinned.
func ensureServerHeadlampEndpoint(ctx context.Context, cfg config, host, user string) error {
	_, err := runRemoteSudoCaptureFunc(ctx, cfg.SSHPrivateKeyPath, user, host, `set -eu
KUBECTL=/var/lib/rancher/rke2/bin/kubectl
KUBECONFIG=/etc/rancher/rke2/rke2.yaml
NODE_NAME="$(hostname -s)"
MIN_HEADLAMP_ROOT_FREE_KB="${GPUAAS_RKE2_MIN_HEADLAMP_ROOT_FREE_KB:-2097152}"
if ! ss -ltn | grep -q ':4466 '; then
  root_free_kb="$(df -Pk / | awk 'NR==2 {print $4}')"
  if [ "${root_free_kb:-0}" -lt "$MIN_HEADLAMP_ROOT_FREE_KB" ]; then
    echo "insufficient root disk for Headlamp: free_kb=${root_free_kb:-unknown}, required_kb=${MIN_HEADLAMP_ROOT_FREE_KB}" >&2
    df -h / >&2 || true
    exit 42
  fi
  disk_pressure="$("$KUBECTL" --kubeconfig "$KUBECONFIG" get node "$NODE_NAME" -o jsonpath='{.status.conditions[?(@.type=="DiskPressure")].status}' 2>/dev/null || true)"
  if [ "$disk_pressure" = "True" ]; then
    echo "rke2 node reports DiskPressure; Headlamp cannot be scheduled safely" >&2
    "$KUBECTL" --kubeconfig "$KUBECONFIG" describe node "$NODE_NAME" | sed -n '/Conditions:/,/Addresses:/p' >&2 || true
    df -h / >&2 || true
    exit 42
  fi
  "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f https://raw.githubusercontent.com/kubernetes-sigs/headlamp/main/kubernetes-headlamp.yaml
  "$KUBECTL" --kubeconfig "$KUBECONFIG" -n kube-system create serviceaccount gpuaas-headlamp-admin --dry-run=client -o yaml | "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f -
  "$KUBECTL" --kubeconfig "$KUBECONFIG" create clusterrolebinding gpuaas-headlamp-admin --clusterrole=cluster-admin --serviceaccount=kube-system:gpuaas-headlamp-admin --dry-run=client -o yaml | "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f -
  cat <<'EOF' | "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f -
apiVersion: v1
kind: Secret
metadata:
  name: gpuaas-headlamp-admin-token
  namespace: kube-system
  annotations:
    kubernetes.io/service-account.name: gpuaas-headlamp-admin
type: kubernetes.io/service-account-token
EOF
  "$KUBECTL" --kubeconfig "$KUBECONFIG" -n kube-system rollout status deploy/headlamp --timeout=180s
  cat >/etc/systemd/system/gpuaas-headlamp-portforward.service <<'EOF'
[Unit]
Description=GPUaaS Headlamp managed-ingress port-forward
After=rke2-server.service
Requires=rke2-server.service

[Service]
Type=simple
Restart=always
RestartSec=5
ExecStart=/var/lib/rancher/rke2/bin/kubectl --kubeconfig /etc/rancher/rke2/rke2.yaml -n kube-system port-forward --address 0.0.0.0 service/headlamp 4466:80

[Install]
WantedBy=multi-user.target
EOF
  systemctl daemon-reload
  systemctl enable --now gpuaas-headlamp-portforward.service
fi
"$KUBECTL" --kubeconfig "$KUBECONFIG" -n kube-system create serviceaccount gpuaas-headlamp-admin --dry-run=client -o yaml | "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f -
"$KUBECTL" --kubeconfig "$KUBECONFIG" create clusterrolebinding gpuaas-headlamp-admin --clusterrole=cluster-admin --serviceaccount=kube-system:gpuaas-headlamp-admin --dry-run=client -o yaml | "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f -
cat <<'EOF' | "$KUBECTL" --kubeconfig "$KUBECONFIG" apply -f -
apiVersion: v1
kind: Secret
metadata:
  name: gpuaas-headlamp-admin-token
  namespace: kube-system
  annotations:
    kubernetes.io/service-account.name: gpuaas-headlamp-admin
type: kubernetes.io/service-account-token
EOF
for _ in $(seq 1 30); do
  ss -ltn | grep -q ':4466 ' && exit 0
  sleep 2
done
systemctl status gpuaas-headlamp-portforward.service --no-pager || true
exit 1`)
	return err
}

// readHeadlampToken fetches the decoded token stored in the
// gpuaas-headlamp-admin-token Secret (kube-system) on the given server.
// The remote script retries up to 20 times, one second apart, and exits 1 with
// a message on stderr when the token never becomes available.
func readHeadlampToken(ctx context.Context, keyPath, user, host string) (string, error) {
	const script = `set -eu
KUBECTL=/var/lib/rancher/rke2/bin/kubectl
KUBECONFIG=/etc/rancher/rke2/rke2.yaml
for _ in $(seq 1 20); do
  token_b64="$("$KUBECTL" --kubeconfig "$KUBECONFIG" -n kube-system get secret gpuaas-headlamp-admin-token -o jsonpath='{.data.token}' 2>/dev/null || true)"
  if [ -n "$token_b64" ]; then
    printf '%s' "$token_b64" | base64 -d
    exit 0
  fi
  sleep 1
done
echo "headlamp service-account token not ready" >&2
exit 1`
	token, err := runRemoteSudoCaptureFunc(ctx, keyPath, user, host, script)
	return token, err
}

// rke2AgentJoinEndpoint maps a server API endpoint to the endpoint agents join
// through: a trailing ":6443" is replaced with ":9345". Any other value
// (including the empty string) is returned trimmed but otherwise unchanged.
func rke2AgentJoinEndpoint(serverEndpoint string) string {
	const (
		apiSuffix  = ":6443"
		joinSuffix = ":9345"
	)
	trimmed := strings.TrimSpace(serverEndpoint)
	if !strings.HasSuffix(trimmed, apiSuffix) {
		return trimmed
	}
	return trimmed[:len(trimmed)-len(apiSuffix)] + joinSuffix
}

func readRemoteFile(ctx context.Context, keyPath, user, host, path string) (string, error) {
	return runRemoteSudoCaptureFunc(ctx, keyPath, user, host, fmt.Sprintf("cat %s", shellQuote(path)))
}

// listAppInstances fetches the first page (up to 100) of the project's app
// instances and keeps those whose trimmed slug equals cfg.AppSlug and whose
// status is one of the non-terminal lifecycle states listed below.
func listAppInstances(ctx context.Context, client *http.Client, cfg config) ([]appInstance, error) {
	var page appInstanceListResponse
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances?page_size=100", cfg.APIBaseURL, cfg.ProjectID)
	if err := doJSON(ctx, client, cfg, http.MethodGet, endpoint, nil, &page); err != nil {
		return nil, err
	}
	matched := make([]appInstance, 0, len(page.Items))
	for _, candidate := range page.Items {
		if strings.TrimSpace(candidate.AppSlug) != cfg.AppSlug {
			continue
		}
		switch candidate.Status {
		case "requested", "deploying", "running", "stopping", "starting", "restarting", "upgrading", "rolling_back":
		default:
			// Any other status is not reconciled by this controller.
			continue
		}
		matched = append(matched, candidate)
	}
	return matched, nil
}

// getAppInstance fetches the app instance identified by cfg.ProjectID and
// cfg.AppInstanceID.
func getAppInstance(ctx context.Context, client *http.Client, cfg config) (*appInstance, error) {
	instance := new(appInstance)
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID)
	if err := doJSON(ctx, client, cfg, http.MethodGet, endpoint, nil, instance); err != nil {
		return nil, err
	}
	return instance, nil
}

// listMembers returns the first page (up to 100) of members of the configured
// app instance.
func listMembers(ctx context.Context, client *http.Client, cfg config) ([]member, error) {
	var page memberListResponse
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s/members?page_size=100", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID)
	err := doJSON(ctx, client, cfg, http.MethodGet, endpoint, nil, &page)
	if err != nil {
		return nil, err
	}
	return page.Items, nil
}

// listMemberOperations returns the first page (up to 100) of member operations
// of the configured app instance.
func listMemberOperations(ctx context.Context, client *http.Client, cfg config) ([]memberOperation, error) {
	var page operationListResponse
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s/member-operations?page_size=100", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID)
	err := doJSON(ctx, client, cfg, http.MethodGet, endpoint, nil, &page)
	if err != nil {
		return nil, err
	}
	return page.Items, nil
}

// pickServer returns a pointer to the first element of items whose
// ComponentKey is "server", or an error when there is none. The pointer
// refers into the caller's slice.
func pickServer(items []member) (*member, error) {
	for idx, candidate := range items {
		if candidate.ComponentKey != "server" {
			continue
		}
		return &items[idx], nil
	}
	return nil, errors.New("server member not found")
}

// memberSSHTarget resolves the SSH endpoint and user for a member.
//
// It prefers the member's adapter_detail ("host", falling back to "hostname",
// plus "username_on_node" and optional "port"). When either host or user is
// missing there, it looks up the allocation named by "allocation_id" and uses
// that allocation's connection details. With neither source available an
// error is returned.
func memberSSHTarget(ctx context.Context, client *http.Client, cfg config, item *member) (string, string, error) {
	detail := item.AdapterDetail

	sshHost := stringAny(detail["host"])
	if sshHost == "" {
		sshHost = stringAny(detail["hostname"])
	}
	sshUser := stringAny(detail["username_on_node"])
	sshPort := intAny(detail["port"])
	if sshHost != "" && sshUser != "" {
		return sshEndpoint(sshHost, sshPort), sshUser, nil
	}

	allocationID := stringAny(detail["allocation_id"])
	if allocationID == "" {
		return "", "", errors.New("member missing host or username_on_node in adapter_detail")
	}
	alloc, err := getProjectAllocation(ctx, client, cfg, allocationID)
	if err != nil {
		return "", "", err
	}
	return allocationSSHTarget(alloc)
}

// targetFromAgentOperation resolves where an agent operation should run.
//
// It reads "allocation_id" from the operation's allocation intent, fetches
// that allocation and returns its SSH host and user, the allocation's node ID,
// and an initial memberRuntimeDetail in phase "bootstrap_requested". The
// detail's NodeName is the allocation's connection hostname when set,
// otherwise the SSH host. On any failure the string results are empty and the
// detail is the zero value.
func targetFromAgentOperation(ctx context.Context, client *http.Client, cfg config, op memberOperation) (host string, user string, boundNodeID *string, detail memberRuntimeDetail, err error) {
	var none memberRuntimeDetail

	allocationID := stringAny(op.AllocationIntent["allocation_id"])
	if allocationID == "" {
		return "", "", nil, none, errors.New("agent operation missing allocation_id")
	}
	alloc, lookupErr := getProjectAllocation(ctx, client, cfg, allocationID)
	if lookupErr != nil {
		return "", "", nil, none, lookupErr
	}
	sshHost, sshUser, targetErr := allocationSSHTarget(alloc)
	if targetErr != nil {
		return "", "", nil, none, targetErr
	}

	nodeName := sshHost
	if hostname := alloc.Connection.Hostname; hostname != nil {
		if trimmed := strings.TrimSpace(*hostname); trimmed != "" {
			nodeName = trimmed
		}
	}

	initial := memberRuntimeDetail{
		Role:           "agent",
		Phase:          "bootstrap_requested",
		AllocationID:   allocationID,
		Host:           sshHost,
		ServerHost:     "",
		UsernameOnNode: sshUser,
		ServerUser:     "",
		NodeName:       nodeName,
		LastObservedAt: time.Now().UTC().Format(time.RFC3339),
	}
	return sshHost, sshUser, alloc.NodeID, initial, nil
}

// activeAgentOnNode returns the first agent member bound to nodeID that is
// neither "failed" nor "deleted", skipping the member whose ID equals
// ignoreMemberID. It returns nil when nodeID is nil/blank or no such member
// exists. The returned pointer refers into the caller's slice.
func activeAgentOnNode(items []member, nodeID *string, ignoreMemberID string) *member {
	wantNode := stringValue(nodeID)
	if wantNode == "" {
		return nil
	}
	skipID := strings.TrimSpace(ignoreMemberID)
	for idx := range items {
		candidate := &items[idx]
		switch {
		case strings.TrimSpace(candidate.ID) == skipID:
			// The member being operated on does not conflict with itself.
		case candidate.ComponentKey != "agent":
		case stringValue(candidate.BoundNodeID) != wantNode:
		case candidate.Status == "failed", candidate.Status == "deleted":
		default:
			return candidate
		}
	}
	return nil
}

// targetAgentMember locates the member an agent operation targets.
//
// It fails when the operation has no target_member_id, when no member has
// that ID, when the member is not an "agent", when it is already "deleted",
// or when no node name can be derived for it. In the last case the partially
// filled detail is still returned alongside the error.
func targetAgentMember(op memberOperation, items []member) (*member, memberRuntimeDetail, error) {
	wantID := stringValue(op.TargetMemberID)
	if wantID == "" {
		return nil, memberRuntimeDetail{}, errors.New("agent operation missing target_member_id")
	}

	var found *member
	for idx := range items {
		if strings.TrimSpace(items[idx].ID) == wantID {
			found = &items[idx]
			break
		}
	}

	switch {
	case found == nil:
		return nil, memberRuntimeDetail{}, fmt.Errorf("target agent %s not found", wantID)
	case strings.TrimSpace(found.ComponentKey) != "agent":
		return nil, memberRuntimeDetail{}, fmt.Errorf("target member %s is not an rke2 agent", wantID)
	case found.Status == "deleted":
		return nil, memberRuntimeDetail{}, fmt.Errorf("target agent %s is already deleted", wantID)
	}

	detail := memberRuntimeDetailFromMember(found)
	if strings.TrimSpace(detail.NodeName) == "" {
		return nil, detail, fmt.Errorf("target agent %s missing node name", wantID)
	}
	return found, detail, nil
}

// memberRuntimeDetailFromMember rebuilds a memberRuntimeDetail from a member's
// adapter_detail. The role is always "agent". NodeName is taken from
// "hostname", then the member's BoundNodeResourceName, then "host", using the
// first non-blank value. A nil member yields the zero value.
func memberRuntimeDetailFromMember(item *member) memberRuntimeDetail {
	if item == nil {
		return memberRuntimeDetail{}
	}
	// field reads a trimmed string from adapter_detail; non-strings read as "".
	field := func(key string) string {
		return stringAny(item.AdapterDetail[key])
	}

	nodeName := field("hostname")
	if nodeName == "" {
		nodeName = stringValue(item.BoundNodeResourceName)
	}
	if nodeName == "" {
		nodeName = field("host")
	}

	var out memberRuntimeDetail
	out.Role = "agent"
	out.Phase = field("phase")
	out.AllocationID = field("allocation_id")
	out.Host = field("host")
	out.UsernameOnNode = field("username_on_node")
	out.NodeName = nodeName
	out.ClusterName = field("cluster_name")
	out.ServerEndpoint = field("server_endpoint")
	out.LastObservedAt = field("last_runtime_observed_at")
	return out
}

// clusterNameForInstance derives a cluster name from an app instance.
//
// It uses the trimmed DisplayName, falling back to the trimmed ResourceName,
// lower-cases it and replaces spaces and underscores with hyphens. When the
// instance is nil, or both names are blank, the default "gpuaas-rke2" is
// returned so callers never receive an empty cluster name.
func clusterNameForInstance(item *appInstance) string {
	const fallback = "gpuaas-rke2"
	if item == nil {
		return fallback
	}
	name := strings.TrimSpace(item.DisplayName)
	if name == "" {
		name = strings.TrimSpace(item.ResourceName)
	}
	if name == "" {
		// Same default as the nil case; an empty name is never useful.
		return fallback
	}
	name = strings.ToLower(name)
	name = strings.ReplaceAll(name, " ", "-")
	name = strings.ReplaceAll(name, "_", "-")
	return name
}

// normalizeRKE2InstallVersion trims raw and ensures a non-empty version starts
// with a lower-case "v". Blank input yields the empty string.
func normalizeRKE2InstallVersion(raw string) string {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "":
		return ""
	case trimmed[0] == 'v':
		return trimmed
	default:
		return "v" + trimmed
	}
}

// resolveAccessCredentialID picks the access credential used for SSH.
//
// A non-blank cfg.AccessCredentialID wins outright. Otherwise the project's
// credentials are listed and the first one that is active, of kind "ssh_key",
// delivered as "vault_wrapped_token", and tied to this app instance (either
// directly through its resource type/ID or through one of its bindings) is
// returned. An error is returned when none qualifies.
func resolveAccessCredentialID(ctx context.Context, client *http.Client, cfg config, instance *appInstance) (string, error) {
	if configured := strings.TrimSpace(cfg.AccessCredentialID); configured != "" {
		return configured, nil
	}
	credentials, err := listProjectAccessCredentials(ctx, client, cfg)
	if err != nil {
		return "", err
	}

	// boundToInstance reports whether a (type, id) pair points at this instance.
	boundToInstance := func(resourceType, resourceID string) bool {
		return strings.TrimSpace(resourceType) == "app_instance" &&
			strings.TrimSpace(resourceID) == strings.TrimSpace(instance.ID)
	}

	for _, cred := range credentials {
		usable := cred.Status == "active" && cred.CredentialKind == "ssh_key" && cred.DeliveryMode == "vault_wrapped_token"
		if !usable {
			continue
		}
		if boundToInstance(stringValue(cred.ResourceType), stringValue(cred.ResourceID)) {
			return strings.TrimSpace(cred.ID), nil
		}
		for _, binding := range cred.Bindings {
			if boundToInstance(binding.ResourceType, binding.ResourceID) {
				return strings.TrimSpace(cred.ID), nil
			}
		}
	}
	return "", errors.New("no active app_instance-bound vault_wrapped_token ssh_key access credential found for rke2 instance")
}

// ensureSSHPrivateKeyPath returns the path of the SSH private key to use.
//
// Without an access credential ID the configured cfg.SSHPrivateKeyPath is
// returned (an error if that is blank too). With a credential ID the key is
// cached at <tmp>/rke2-self-managed/<credential-id>.key: an existing non-empty
// regular file is reused, otherwise the credential is delivered, unwrapped and
// written through a temporary file that is renamed into place with mode 0400.
//
// The temporary file is removed on every failure path, including a failed
// write, so partially written key material is not left on disk.
func ensureSSHPrivateKeyPath(ctx context.Context, client *http.Client, cfg config) (string, error) {
	credentialID := strings.TrimSpace(cfg.AccessCredentialID)
	if credentialID == "" {
		keyPath := strings.TrimSpace(cfg.SSHPrivateKeyPath)
		if keyPath == "" {
			return "", errors.New("rke2 controller missing ssh private key path or resolvable access credential")
		}
		return keyPath, nil
	}

	cacheDir := filepath.Join(os.TempDir(), "rke2-self-managed")
	if err := os.MkdirAll(cacheDir, 0o700); err != nil {
		return "", err
	}
	cachePath := filepath.Join(cacheDir, credentialID+".key")

	info, statErr := os.Stat(cachePath)
	switch {
	case statErr == nil:
		if info.Mode().IsRegular() && info.Size() > 0 {
			return cachePath, nil
		}
		// Empty or non-regular entry: fall through and re-materialize the key.
	case !errors.Is(statErr, os.ErrNotExist):
		return "", statErr
	}

	delivered, err := deliverAccessCredential(ctx, client, cfg)
	if err != nil {
		return "", err
	}
	privateKey, err := unwrapSSHPrivateKey(ctx, client, delivered)
	if err != nil {
		return "", err
	}
	if !strings.HasSuffix(privateKey, "\n") {
		privateKey += "\n"
	}

	tmpPath := cachePath + ".tmp"
	// Best effort: clear a stale temp file from an earlier interrupted run.
	_ = os.Remove(tmpPath)
	if err := os.WriteFile(tmpPath, []byte(privateKey), 0o600); err != nil {
		// WriteFile may have created or partially written the file.
		_ = os.Remove(tmpPath)
		return "", err
	}
	if err := os.Chmod(tmpPath, 0o400); err != nil {
		_ = os.Remove(tmpPath)
		return "", err
	}
	if err := os.Rename(tmpPath, cachePath); err != nil {
		_ = os.Remove(tmpPath)
		return "", err
	}
	return cachePath, nil
}

// listProjectAccessCredentials returns the first page (up to 100) of access
// credentials of the configured project.
func listProjectAccessCredentials(ctx context.Context, client *http.Client, cfg config) ([]projectAccessCredential, error) {
	var page projectAccessCredentialListResponse
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/access-credentials?page_size=100", cfg.APIBaseURL, cfg.ProjectID)
	err := doJSON(ctx, client, cfg, http.MethodGet, endpoint, nil, &page)
	if err != nil {
		return nil, err
	}
	return page.Items, nil
}

// deliverAccessCredential asks the API to deliver cfg.AccessCredentialID,
// requesting a wrap TTL of 900 seconds, and returns the decoded response.
func deliverAccessCredential(ctx context.Context, client *http.Client, cfg config) (*deliveredAccessCredentialResponse, error) {
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/access-credentials/%s/deliver", cfg.APIBaseURL, cfg.ProjectID, cfg.AccessCredentialID)
	payload := map[string]any{"wrap_ttl_seconds": 900}
	delivered := new(deliveredAccessCredentialResponse)
	if err := doJSON(ctx, client, cfg, http.MethodPost, endpoint, payload, delivered); err != nil {
		return nil, err
	}
	return delivered, nil
}

// unwrapSSHPrivateKey exchanges a wrapped token for the SSH private key.
//
// It POSTs to the delivery's unwrap URL with the wrapped token in the
// X-Vault-Token header and reads data.credential_data.private_key from the
// JSON response. Errors are returned for a missing token or URL, a non-2xx
// status (the body is included in the message), undecodable JSON, or a blank
// private key. The returned key is trimmed.
func unwrapSSHPrivateKey(ctx context.Context, client *http.Client, delivered *deliveredAccessCredentialResponse) (string, error) {
	if delivered == nil || delivered.Delivery.WrappedToken == nil {
		return "", errors.New("access credential delivery missing wrapped token")
	}
	wrappedToken := strings.TrimSpace(*delivered.Delivery.WrappedToken)
	if wrappedToken == "" {
		return "", errors.New("access credential delivery missing wrapped token")
	}
	if delivered.Delivery.UnwrapURL == nil {
		return "", errors.New("access credential delivery missing unwrap_url")
	}
	unwrapURL := strings.TrimSpace(*delivered.Delivery.UnwrapURL)
	if unwrapURL == "" {
		return "", errors.New("access credential delivery missing unwrap_url")
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, unwrapURL, nil)
	if err != nil {
		return "", err
	}
	req.Header.Set("X-Vault-Token", wrappedToken)

	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer func() { _ = resp.Body.Close() }()

	if succeeded := resp.StatusCode >= 200 && resp.StatusCode < 300; !succeeded {
		// The read error is ignored: the body only enriches the message.
		errBody, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("vault unwrap failed: status=%d body=%s", resp.StatusCode, strings.TrimSpace(string(errBody)))
	}

	var envelope struct {
		Data struct {
			CredentialData map[string]string `json:"credential_data"`
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
		return "", err
	}
	key := strings.TrimSpace(envelope.Data.CredentialData["private_key"])
	if key == "" {
		return "", errors.New("access credential unwrap payload missing private_key")
	}
	return key, nil
}

// reconcileBootstrapSSHTrust asks the API to reconcile bootstrap SSH trust for
// a user on an allocation towards desiredState (trimmed and lower-cased).
//
// It is a no-op when allocationID or usernameOnNode is blank. For the
// "present" state the public key derived from the private key at keyPath is
// included in the request; a failure to derive it is returned before any
// request is made.
func reconcileBootstrapSSHTrust(ctx context.Context, client *http.Client, cfg config, allocationID, usernameOnNode, keyPath, desiredState string) error {
	allocation := strings.TrimSpace(allocationID)
	username := strings.TrimSpace(usernameOnNode)
	if allocation == "" || username == "" {
		return nil
	}
	state := strings.TrimSpace(strings.ToLower(desiredState))

	payload := map[string]any{
		"access_credential_id": normalizedStringValue(cfg.AccessCredentialID),
		"allocation_id":        allocation,
		"username_on_node":     username,
		"desired_state":        state,
	}
	if state == "present" {
		publicKey, err := publicKeyFromPrivateKeyPath(strings.TrimSpace(keyPath))
		if err != nil {
			return err
		}
		payload["public_key"] = publicKey
	}

	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s/bootstrap-ssh/reconcile", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID)
	return doJSON(ctx, client, cfg, http.MethodPost, endpoint, payload, nil)
}

// waitForBootstrapSSHTrust polls the host by running `true` through
// runRemoteSudoCaptureFunc until it succeeds.
//
// Attempts are spaced bootstrapSSHTrustPollInterval apart. Once
// bootstrapSSHTrustWaitTimeout has elapsed, the next failed attempt returns an
// error wrapping that attempt's failure. Context cancellation during a wait
// returns ctx.Err().
func waitForBootstrapSSHTrust(ctx context.Context, keyPath, user, host string) error {
	giveUpAt := time.Now().Add(bootstrapSSHTrustWaitTimeout)
	for {
		_, probeErr := runRemoteSudoCaptureFunc(ctx, keyPath, user, host, "true")
		if probeErr == nil {
			return nil
		}
		if time.Now().After(giveUpAt) {
			return fmt.Errorf("bootstrap ssh trust not usable after %s: %w", bootstrapSSHTrustWaitTimeout, probeErr)
		}
		pause := time.NewTimer(bootstrapSSHTrustPollInterval)
		select {
		case <-pause.C:
			// Interval elapsed; probe again.
		case <-ctx.Done():
			pause.Stop()
			return ctx.Err()
		}
	}
}

// publicKeyFromPrivateKeyPath reads the private key file at path and returns
// the matching public key in authorized_keys format, without the trailing
// newline.
func publicKeyFromPrivateKeyPath(path string) (string, error) {
	keyBytes, readErr := os.ReadFile(path)
	if readErr != nil {
		return "", readErr
	}
	signer, parseErr := ssh.ParsePrivateKey(keyBytes)
	if parseErr != nil {
		return "", parseErr
	}
	authorizedLine := ssh.MarshalAuthorizedKey(signer.PublicKey())
	return strings.TrimSpace(string(authorizedLine)), nil
}

// reportInstanceStatus posts the instance status to the API.
//
// The reported runtime_state is a copy of instance.RuntimeState overlaid with
// runtimeState. When no failure reason is given and the status is not
// "failed", the stale "last_adapter_error" and "bootstrap_log_excerpt" keys
// are dropped from that copy.
func reportInstanceStatus(ctx context.Context, client *http.Client, cfg config, instance *appInstance, status string, failureReason *string, runtimeState map[string]any) error {
	merged := cloneMap(instance.RuntimeState)
	for key, value := range runtimeState {
		merged[key] = value
	}
	if healthy := failureReason == nil && status != "failed"; healthy {
		for _, staleKey := range [...]string{"last_adapter_error", "bootstrap_log_excerpt"} {
			delete(merged, staleKey)
		}
	}

	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s/report", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID)
	payload := map[string]any{
		"status":         status,
		"failure_reason": failureReason,
		"runtime_state":  merged,
	}
	return doJSON(ctx, client, cfg, http.MethodPost, endpoint, payload, nil)
}

// reportMemberStatus posts a member's status, its current bound node ID and
// the given adapter detail to the member report endpoint.
func reportMemberStatus(ctx context.Context, client *http.Client, cfg config, item *member, status string, detail map[string]any) error {
	payload := make(map[string]any, 3)
	payload["status"] = status
	payload["bound_node_id"] = item.BoundNodeID
	payload["adapter_detail"] = detail

	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s/members/%s/report", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID, item.ID)
	return doJSON(ctx, client, cfg, http.MethodPost, endpoint, payload, nil)
}

// reportMemberOperation posts the outcome (or progress) of a member operation.
// Nil pointer arguments are sent as JSON null.
func reportMemberOperation(ctx context.Context, client *http.Client, cfg config, op memberOperation, status string, targetMemberID string, memberStatus *string, boundNodeID *string, detail map[string]any, lastErrorCode *string, lastErrorMessage *string) error {
	payload := make(map[string]any, 7)
	payload["status"] = status
	payload["target_member_id"] = targetMemberID
	payload["member_status"] = memberStatus
	payload["bound_node_id"] = boundNodeID
	payload["adapter_detail"] = detail
	payload["last_error_code"] = lastErrorCode
	payload["last_error_message"] = lastErrorMessage

	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/app-instances/%s/member-operations/%s/report", cfg.APIBaseURL, cfg.ProjectID, cfg.AppInstanceID, op.ID)
	return doJSON(ctx, client, cfg, http.MethodPost, endpoint, payload, nil)
}

// reportAgentOperationFailure reports an agent operation as failed.
//
// The error code is "<operation>_failed", where the operation label is the
// first non-blank of: the operation argument, op.Action, "operation". The
// code also becomes the detail's "phase" and the cause's message its
// "last_adapter_error". When the operation has no target member ID a fresh
// UUID is reported instead. cause must be non-nil.
func reportAgentOperationFailure(ctx context.Context, client *http.Client, cfg config, op memberOperation, operation string, cause error, detail memberRuntimeDetail, boundNodeID *string) error {
	memberID := stringValue(op.TargetMemberID)
	if memberID == "" {
		memberID = uuid.NewString()
	}

	label := "operation"
	for _, candidate := range []string{operation, op.Action} {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			label = trimmed
			break
		}
	}

	errCode := label + "_failed"
	errMessage := cause.Error()

	payload := detail.toMap()
	payload["phase"] = errCode
	payload["last_adapter_error"] = errMessage
	return reportMemberOperation(ctx, client, cfg, op, "failed", memberID, ptrString("failed"), boundNodeID, payload, &errCode, &errMessage)
}

// buildHTTPClient returns an HTTP client with a 45-second overall timeout and
// a transport that requires TLS 1.2 or newer and uses cfg.TLSServerName as the
// TLS server name.
func buildHTTPClient(cfg config) *http.Client {
	tlsConfig := &tls.Config{
		MinVersion: tls.VersionTLS12,
		ServerName: cfg.TLSServerName,
	}
	transport := &http.Transport{TLSClientConfig: tlsConfig}
	return &http.Client{
		Timeout:   45 * time.Second,
		Transport: transport,
	}
}

// mintServiceAccountToken mints a service-account access token and discards
// the expiry reported by mintServiceAccountTokenWithExpiry.
func mintServiceAccountToken(ctx context.Context, client *http.Client, cfg config) (string, error) {
	accessToken, _, mintErr := mintServiceAccountTokenWithExpiry(ctx, client, cfg)
	return accessToken, mintErr
}

// mintServiceAccountTokenWithExpiry exchanges the configured service-account
// credentials for an access token. It returns the token together with the
// lifetime in seconds reported by the API, and fails when the response
// carries a blank access token.
func mintServiceAccountTokenWithExpiry(ctx context.Context, client *http.Client, cfg config) (string, int, error) {
	credentials := map[string]any{
		"service_account_id": cfg.ServiceAccountID,
		"key_id":             cfg.ServiceAccountKeyID,
		"client_secret":      cfg.ServiceAccountSecret,
	}
	endpoint := fmt.Sprintf("%s/api/v1/auth/service-account/token", cfg.APIBaseURL)

	var minted controllerToken
	err := doJSON(ctx, client, cfg, http.MethodPost, endpoint, credentials, &minted)
	switch {
	case err != nil:
		return "", 0, err
	case strings.TrimSpace(minted.AccessToken) == "":
		return "", 0, errors.New("service-account token response missing access_token")
	}
	return minted.AccessToken, minted.ExpiresInSeconds, nil
}

// doJSON performs one JSON request against the platform API.
//
// A non-nil body is marshalled and sent with Content-Type application/json.
// The request carries the bearer token and X-Project-ID header when
// configured, a per-request X-Correlation-ID, and cfg.HTTPHostOverride as the
// Host when set. Any status outside 2xx is returned as an error that includes
// the response body. On success a non-empty body is unmarshalled into out
// when out is non-nil.
func doJSON(ctx context.Context, client *http.Client, cfg config, method, url string, body any, out any) error {
	hasBody := body != nil

	// Keep this an untyped-nil io.Reader when there is no body.
	var reqBody io.Reader
	if hasBody {
		encoded, err := json.Marshal(body)
		if err != nil {
			return err
		}
		reqBody = bytes.NewReader(encoded)
	}

	req, err := http.NewRequestWithContext(ctx, method, url, reqBody)
	if err != nil {
		return err
	}
	if cfg.HTTPHostOverride != "" {
		req.Host = cfg.HTTPHostOverride
	}

	headers := req.Header
	if cfg.BearerToken != "" {
		headers.Set("Authorization", "Bearer "+cfg.BearerToken)
	}
	if strings.TrimSpace(cfg.ProjectID) != "" {
		headers.Set("X-Project-ID", cfg.ProjectID)
	}
	headers.Set("X-Correlation-ID", fmt.Sprintf("rke2-self-managed-controller-%d", time.Now().UnixNano()))
	if hasBody {
		headers.Set("Content-Type", "application/json")
	}

	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer func() { _ = resp.Body.Close() }()

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	if succeeded := resp.StatusCode >= 200 && resp.StatusCode < 300; !succeeded {
		return fmt.Errorf("%s %s returned %d: %s", method, url, resp.StatusCode, strings.TrimSpace(string(respBody)))
	}
	if out == nil || len(respBody) == 0 {
		return nil
	}
	return json.Unmarshal(respBody, out)
}

// getProjectAllocation fetches one allocation of the configured project by ID.
func getProjectAllocation(ctx context.Context, client *http.Client, cfg config, allocationID string) (*allocation, error) {
	fetched := new(allocation)
	endpoint := fmt.Sprintf("%s/api/v1/projects/%s/allocations/%s", cfg.APIBaseURL, cfg.ProjectID, allocationID)
	if err := doJSON(ctx, client, cfg, http.MethodGet, endpoint, nil, fetched); err != nil {
		return nil, err
	}
	return fetched, nil
}

// allocationSSHTarget derives the SSH endpoint and user from an allocation's
// connection block. The host is Connection.Host, falling back to
// Connection.Hostname; the optional port is folded in through sshEndpoint.
// An error is returned for a nil allocation or when host or user is blank.
func allocationSSHTarget(item *allocation) (host string, user string, err error) {
	if item == nil {
		return "", "", errors.New("allocation missing")
	}

	sshHost := stringValue(item.Connection.Host)
	if sshHost == "" {
		sshHost = stringValue(item.Connection.Hostname)
	}
	sshUser := stringValue(item.Connection.UsernameOnNode)
	if sshHost == "" || sshUser == "" {
		return "", "", errors.New("allocation missing connection host or username_on_node")
	}

	var sshPort int
	if configured := item.Connection.Port; configured != nil {
		sshPort = *configured
	}
	return sshEndpoint(sshHost, sshPort), sshUser, nil
}

// sshEndpoint builds the SSH target string for a host and port.
//
// A configured scheduler override for the host takes precedence. Otherwise
// the trimmed host is returned as is when it is blank, when the port is
// non-positive or 22, or when the host already parses as host:port; in every
// other case the port is appended with net.JoinHostPort.
func sshEndpoint(host string, port int) string {
	trimmed := strings.TrimSpace(host)
	if target, found := schedulerSSHOverride(trimmed, port); found {
		return target
	}
	if needsPort := trimmed != "" && port > 0 && port != 22; !needsPort {
		return trimmed
	}
	if _, _, splitErr := net.SplitHostPort(trimmed); splitErr == nil {
		// Host already carries a port; keep it.
		return trimmed
	}
	return net.JoinHostPort(trimmed, strconv.Itoa(port))
}

// schedulerSSHOverrideTarget is the object form of one entry in the SSH target
// override map parsed by parseSchedulerSSHOverrideTarget.
type schedulerSSHOverrideTarget struct {
	// Host is the replacement host; an entry with a blank host is rejected.
	Host string `json:"host"`
	// Port is the replacement port; values <= 0 mean "no port".
	Port int    `json:"port"`
}

// schedulerSSHOverride looks up an SSH target override for host.
//
// The override map is JSON read from the first non-blank of the environment
// variables RKE2_CONTROLLER_SSH_TARGET_OVERRIDE_MAP,
// APP_SCHEDULER_SSH_TARGET_OVERRIDE_MAP and SCHEDULER_SSH_TARGET_OVERRIDE_MAP.
// With a positive port the "host:port" key is tried before the bare host key.
// A blank host, missing or invalid map, or no usable entry yields ("", false).
func schedulerSSHOverride(host string, port int) (string, bool) {
	lookupHost := strings.TrimSpace(host)
	if lookupHost == "" {
		return "", false
	}

	encoded := firstNonEmpty(
		os.Getenv("RKE2_CONTROLLER_SSH_TARGET_OVERRIDE_MAP"),
		os.Getenv("APP_SCHEDULER_SSH_TARGET_OVERRIDE_MAP"),
		os.Getenv("SCHEDULER_SSH_TARGET_OVERRIDE_MAP"),
	)
	if strings.TrimSpace(encoded) == "" {
		return "", false
	}
	var overrides map[string]json.RawMessage
	if json.Unmarshal([]byte(encoded), &overrides) != nil {
		return "", false
	}

	// Most specific key first.
	candidates := make([]string, 0, 2)
	if port > 0 {
		candidates = append(candidates, net.JoinHostPort(lookupHost, strconv.Itoa(port)))
	}
	candidates = append(candidates, lookupHost)

	for _, key := range candidates {
		entry, present := overrides[key]
		if !present {
			continue
		}
		if target, usable := parseSchedulerSSHOverrideTarget(entry); usable {
			return target, true
		}
	}
	return "", false
}

// parseSchedulerSSHOverrideTarget decodes one override map entry.
//
// A JSON string is used verbatim after trimming (blank is rejected). Otherwise
// the entry must be an object with "host" and optional "port"; the result is
// the trimmed host, joined with the port when the port is positive.
func parseSchedulerSSHOverrideTarget(raw json.RawMessage) (string, bool) {
	var asString string
	if json.Unmarshal(raw, &asString) == nil {
		asString = strings.TrimSpace(asString)
		return asString, asString != ""
	}

	var asObject schedulerSSHOverrideTarget
	if json.Unmarshal(raw, &asObject) != nil {
		return "", false
	}
	hostOnly := strings.TrimSpace(asObject.Host)
	switch {
	case hostOnly == "":
		return "", false
	case asObject.Port <= 0:
		return hostOnly, true
	default:
		return net.JoinHostPort(hostOnly, strconv.Itoa(asObject.Port)), true
	}
}

// firstNonEmpty returns the first argument that is not blank after trimming.
// The value is returned untrimmed; "" is returned when every argument is blank.
func firstNonEmpty(values ...string) string {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; strings.TrimSpace(candidate) != "" {
			return candidate
		}
	}
	return ""
}

// runBootstrapScript streams the local script at scriptPath to the remote host
// and runs it there as `sudo -n env KEY=VALUE... bash -s` over ssh.
//
// Entries of env with a blank value are skipped; values are shell-quoted.
// Remote stdout is captured and discarded. On failure the returned error wraps
// the ssh error and appends the trimmed stderr when there is any.
func runBootstrapScript(ctx context.Context, scriptPath, keyPath, user, host string, env map[string]string) error {
	scriptBytes, err := os.ReadFile(scriptPath)
	if err != nil {
		return err
	}

	assignments := make([]string, 0, len(env))
	for name, value := range env {
		if strings.TrimSpace(value) != "" {
			assignments = append(assignments, fmt.Sprintf("%s=%s", name, shellQuote(value)))
		}
	}
	sshArgs := append(baseSSHArgs(keyPath, user, host), "sudo -n env "+strings.Join(assignments, " ")+" bash -s")

	var outBuf, errBuf bytes.Buffer
	cmd := exec.CommandContext(ctx, "ssh", sshArgs...)
	cmd.Stdin = bytes.NewReader(scriptBytes)
	cmd.Stdout = &outBuf
	cmd.Stderr = &errBuf

	runErr := cmd.Run()
	if runErr == nil {
		return nil
	}
	if stderrText := strings.TrimSpace(errBuf.String()); stderrText != "" {
		return fmt.Errorf("%w: %s", runErr, stderrText)
	}
	return runErr
}

func runRemoteSudoCapture(ctx context.Context, keyPath, user, host, command string) (string, error) {
	remoteCommand := fmt.Sprintf("sudo -n bash -lc %s", shellQuote(command))
	args := append(baseSSHArgs(keyPath, user, host), remoteCommand)
	cmd := exec.CommandContext(ctx, "ssh", args...)
	var stdout bytes.Buffer
	var stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		errText := strings.TrimSpace(stderr.String())
		if errText == "" {
			return "", err
		}
		return "", fmt.Errorf("%w: %s", err, errText)
	}
	return stdout.String(), nil
}

func verifyRemoteSystemdInactive(ctx context.Context, keyPath, user, host, unit string) error {
	output, err := runRemoteSudoCaptureFunc(ctx, keyPath, user, host, fmt.Sprintf("if systemctl is-active --quiet %s; then echo active; else echo inactive; fi", shellQuote(unit)))
	if err != nil {
		return err
	}
	if strings.TrimSpace(output) != "inactive" {
		return fmt.Errorf("systemd unit %s still active: %s", unit, strings.TrimSpace(output))
	}
	return nil
}

// getenvDefault returns the trimmed value of environment variable key, or
// fallback when the variable is unset or blank.
func getenvDefault(key, fallback string) string {
	if value := strings.TrimSpace(os.Getenv(key)); value != "" {
		return value
	}
	return fallback
}

// cloneMap returns a shallow copy of in. The result is always a non-nil map,
// also for a nil or empty input, so callers can write to it directly.
func cloneMap(in map[string]any) map[string]any {
	copied := make(map[string]any, len(in))
	for key, value := range in {
		copied[key] = value
	}
	return copied
}

// ptrString returns a pointer to a copy of v.
func ptrString(v string) *string {
	value := v
	return &value
}

// stringValue dereferences v and trims surrounding whitespace; a nil pointer
// yields the empty string.
func stringValue(v *string) string {
	if v != nil {
		return strings.TrimSpace(*v)
	}
	return ""
}

// stringAny returns v trimmed when it holds a string, and the empty string for
// any other dynamic type (including nil).
func stringAny(v any) string {
	if text, isString := v.(string); isString {
		return strings.TrimSpace(text)
	}
	return ""
}

// intAny converts a loosely typed value to int.
//
// int, int32, int64 and float64 are converted directly (floats truncate),
// json.Number through Int64, and strings through strconv.Atoi after trimming.
// Parse failures and every other type yield 0.
func intAny(v any) int {
	switch value := v.(type) {
	case string:
		parsed, _ := strconv.Atoi(strings.TrimSpace(value))
		return parsed
	case json.Number:
		parsed, _ := value.Int64()
		return int(parsed)
	case float64:
		return int(value)
	case int64:
		return int(value)
	case int32:
		return int(value)
	case int:
		return value
	}
	return 0
}

// baseSSHArgs builds the common ssh argument list: host-key checking disabled,
// no known-hosts file, the identity file, an optional "-p <port>" when host is
// of the form host:port, and finally the user@host destination.
func baseSSHArgs(keyPath, user, host string) []string {
	destination := strings.TrimSpace(host)
	port := ""
	if h, p, err := net.SplitHostPort(destination); err == nil {
		destination, port = h, p
	}

	args := make([]string, 0, 9)
	args = append(args,
		"-o", "StrictHostKeyChecking=no",
		"-o", "UserKnownHostsFile=/dev/null",
		"-i", keyPath,
	)
	if port != "" {
		args = append(args, "-p", port)
	}
	return append(args, user+"@"+destination)
}

// normalizedStringValue returns v trimmed, or nil when v is blank, so that a
// blank value is encoded as JSON null rather than an empty string.
func normalizedStringValue(v string) any {
	if trimmed := strings.TrimSpace(v); trimmed != "" {
		return trimmed
	}
	return nil
}

// shellQuote wraps v in single quotes for use as one POSIX shell word. Each
// embedded single quote is emitted as '"'"' (close quote, double-quoted
// single quote, reopen quote).
func shellQuote(v string) string {
	const escapedQuote = `'"'"'`
	segments := strings.Split(v, "'")
	return "'" + strings.Join(segments, escapedQuote) + "'"
}
