#!/usr/bin/env bash
# Pylon node installer — served at https://pylond.run/install.sh
#
#   curl -fsSL https://pylond.run/install.sh | sh -s -- --token <ENROLL_TOKEN>
#
# Connects this machine to Pylon: installs Docker if missing, fetches the Pylon
# CA, then runs the node-agent — it dials OUT to the control plane over mutually
# authenticated TLS (verifying the Pylon CA), joins the mesh, holds the reverse
# tunnel, AND holds the workload-control stream so customer servers run HERE (on
# this node's Docker), not on the control box. No inbound ports are opened.
# Keep this POSIX sh despite the bash shebang: the install one-liner pipes the
# script to `sh` (i.e. dash), so no bashisms and no `pipefail`.
set -eu

# Defaults. Endpoints, image, CA URL and TLS domain can each be overridden via
# the matching PYLON_* environment variable; FLEET, MESH and TOKEN can also be
# set from the command line (flags win because they are parsed afterwards).
FLEET="${PYLON_FLEET:-https://fleet.pylon.host:50051}"
MESH="${PYLON_MESH:-https://mesh.pylon.host:50054}"
IMAGE="${PYLON_IMAGE:-registry.pylon.host/pylon:latest}"
CA_URL="${PYLON_CA_URL:-https://pylond.run/ca.crt}"
TLS_DOMAIN="${PYLON_TLS_DOMAIN:-pylon-control}"
INSTALL_DIR=/opt/pylon
TOKEN=""
UNINSTALL=0

# parse_args ARG...
# Applies command-line flags to the globals above.
#   --token VALUE | --token=VALUE   enrollment token            -> TOKEN
#   --fleet VALUE | --fleet=VALUE   fleet endpoint override     -> FLEET
#   --mesh  VALUE | --mesh=VALUE    mesh endpoint override      -> MESH
#   --uninstall | --remove          switch to uninstall mode    -> UNINSTALL=1
# Exits 1 with a message on stderr for an unknown flag or a flag that is
# missing its value.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --token|--fleet|--mesh)
        # Under `set -u` a bare "$2" would abort with a cryptic "parameter not
        # set" error when the flag is the last argument; fail readably instead.
        [ $# -ge 2 ] || { echo "error: $1 requires a value" >&2; exit 1; }
        case "$1" in
          --token) TOKEN="$2" ;;
          --fleet) FLEET="$2" ;;
          --mesh) MESH="$2" ;;
        esac
        shift 2 ;;
      --token=*) TOKEN="${1#*=}"; shift ;;
      --fleet=*) FLEET="${1#*=}"; shift ;;
      --mesh=*) MESH="${1#*=}"; shift ;;
      --uninstall|--remove) UNINSTALL=1; shift ;;
      *) echo "unknown arg: $1" >&2; exit 1 ;;
    esac
  done
}
parse_args "$@"
# Both install and uninstall modify system state (packages, $INSTALL_DIR,
# libvirt images), so refuse to continue unless running as root. The message
# goes to stderr like the other diagnostics in this script.
[ "$(id -u)" = "0" ] || { echo "error: run as root (sudo)" >&2; exit 1; }

# --- Uninstall mode: remove the node so it can be enrolled again from scratch ---
#   curl -fsSL https://pylond.run/install.sh | sh -s -- --uninstall
# Container and VM teardown is best-effort: failures there are silenced so the
# final removal of $INSTALL_DIR always gets a chance to run.
if [ "$UNINSTALL" = "1" ]; then
  echo "==> Pylon node uninstall"
  compose_file="$INSTALL_DIR/docker-compose.yml"

  if command -v docker >/dev/null 2>&1; then
    # Stop the compose project first, when its definition is still on disk.
    if [ -f "$compose_file" ]; then
      docker compose -f "$compose_file" down --remove-orphans 2>/dev/null || true
    fi
    # Then sweep up any container still labelled as part of the project.
    docker ps -aq --filter "label=com.docker.compose.project=pylon-node" \
      | xargs -r docker rm -f 2>/dev/null || true
  fi

  # Pylon KVM guests (vm tier): stop and undefine every domain named pylon-*.
  if command -v virsh >/dev/null 2>&1; then
    pylon_domains=$(virsh list --all --name 2>/dev/null | grep '^pylon-' || true)
    for domain in $pylon_domains; do
      virsh destroy "$domain" 2>/dev/null || true
      virsh undefine "$domain" --managed-save --nvram 2>/dev/null || true
    done
  fi

  # Per-VM overlay disks, then the install directory itself.
  rm -rf /var/lib/libvirt/images/pylon-vms 2>/dev/null || true
  rm -rf "$INSTALL_DIR"

  echo "✅ Pylon node removed. Re-enroll with a fresh token:"
  echo "   curl -fsSL https://pylond.run/install.sh | sh -s -- --token <NEW_TOKEN>"
  exit 0
fi

# Install mode needs an enrollment token; report its absence on stderr.
[ -n "$TOKEN" ] || { echo "error: --token <ENROLL_TOKEN> is required (mint one in the Pylon host panel → Nodes), or --uninstall to remove" >&2; exit 1; }

echo "==> Pylon node install"
if ! command -v docker >/dev/null 2>&1; then
  echo "==> installing Docker"
  curl -fsSL https://get.docker.com | sh
  # There is no `pipefail` here: if the download fails, `sh` reads empty input
  # and exits 0, so `set -e` never notices. Confirm Docker actually landed.
  command -v docker >/dev/null 2>&1 || { echo "error: Docker installation failed (could not fetch or run https://get.docker.com)" >&2; exit 1; }
fi

mkdir -p "$INSTALL_DIR"
echo "==> fetching Pylon CA"
# Written in place on purpose: ca.crt is bind-mounted into the agent container
# as a single file, and replacing it by rename would leave a running container
# looking at the old inode.
curl -fsSL "$CA_URL" -o "$INSTALL_DIR/ca.crt"
# A zero-byte certificate would only surface later as an obscure TLS failure
# inside the agent, so reject it here.
[ -s "$INSTALL_DIR/ca.crt" ] || { echo "error: empty CA certificate downloaded from $CA_URL" >&2; exit 1; }

# --- Optional KVM/libvirt support for the `vm` tier (vm-driver) ---
# Switched on automatically when the host exposes /dev/kvm, or forced with
# PYLON_VM=1. libvirtd stays on the HOST (it owns /dev/kvm); the node-agent
# container drives it with virsh/qemu-img/cloud-localds through the mounted
# libvirt socket. State and base-image directories are mounted at the SAME
# paths inside and outside the container, because the qemu process that
# libvirtd starts on the host must open the overlay files the container wrote.
#
# Results consumed by the compose template further down:
#   VM_ENABLED  1 when VM support is on, otherwise 0
#   VM_VOLUMES  extra `volumes:` entries (empty when off)
#   VM_ENV      extra `environment:` entries (empty when off)
VM_ENABLED=0
VM_VOLUMES=""
VM_ENV=""
if [ "${PYLON_VM:-0}" = "1" ] || [ -e /dev/kvm ]; then
  VM_ENABLED=1
fi

if [ "$VM_ENABLED" = "1" ]; then
  echo "==> enabling KVM/libvirt VM support (vm tier)"

  # Install the virtualization stack only when virsh is not already present.
  if command -v virsh >/dev/null 2>&1; then
    :
  elif command -v apt-get >/dev/null 2>&1; then
    echo "==> installing qemu-kvm + libvirt + cloud-image-utils"
    DEBIAN_FRONTEND=noninteractive apt-get update -y
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
      qemu-kvm libvirt-daemon-system libvirt-clients qemu-utils cloud-image-utils
  else
    echo "warn: non-apt distro — install qemu-kvm, libvirt, qemu-img, cloud-localds manually" >&2
  fi

  systemctl enable --now libvirtd 2>/dev/null || true

  # Reachable without /dev/kvm only through PYLON_VM=1.
  if [ ! -e /dev/kvm ]; then
    echo "warn: /dev/kvm absent; VMs would use slow TCG emulation" >&2
  fi

  # Disk images must sit under /var/lib/libvirt/images: libvirt's AppArmor
  # helper (virt-aa-helper) only lets qemu open images there, so overlays, base
  # images and seeds all go below it.
  mkdir -p /var/lib/libvirt/images/pylon-vms /var/lib/libvirt/images/pylon-base

  # VMs use QEMU user-mode networking, so libvirt's default NAT network
  # (virbr0) is not needed, and its nftables rules can clash with Docker.
  virsh -c qemu:///system net-autostart --disable default 2>/dev/null || true
  virsh -c qemu:///system net-destroy default 2>/dev/null || true

  # YAML fragments, already indented for their position in the compose file.
  VM_VOLUMES='      - /var/run/libvirt:/var/run/libvirt
      - /var/lib/libvirt:/var/lib/libvirt'
  VM_ENV='      - NODE_VM_RUNTIME=on
      - LIBVIRT_URI=qemu:///system
      - VM_STATE_DIR=/var/lib/libvirt/images/pylon-vms
      - VM_BASE_DIR=/var/lib/libvirt/images/pylon-base'
fi

# Render the compose project for the node-agent. Unescaped ${...} and $(...)
# below are expanded by this shell NOW and baked into the file; the \${...}
# forms are written literally and resolved by `docker compose` from the
# environment each time the project is brought up.
compose_file="$INSTALL_DIR/docker-compose.yml"
# The file embeds the enrollment token, so keep it readable by root only:
# create it under a restrictive umask, and chmod as well because a file left by
# an earlier install keeps its previous mode when it is merely truncated.
( umask 077; : > "$compose_file" )
chmod 600 "$compose_file"
cat > "$compose_file" <<YAML
name: pylon-node
services:
  node-agent:
    image: ${IMAGE}
    command: ["node-agent"]
    network_mode: host
    restart: unless-stopped
    volumes:
      - ${INSTALL_DIR}/ca.crt:/pylon-tls/ca.crt:ro
      # The agent runs customer workloads ON THIS NODE via the host's Docker —
      # mount the daemon socket so its bollard runtime can pull + run them. The
      # workload containers are siblings on the host, published on 127.0.0.1, and
      # the agent forwards the tunnel to each container's locally-bound port.
      - /var/run/docker.sock:/var/run/docker.sock
${VM_VOLUMES}
    environment:
      - NODE_FLEET_ENDPOINT=${FLEET}
      - NODE_MESH_ENDPOINT=${MESH}
      - NODE_ENROLL_TOKEN=${TOKEN}
      - NODE_CA_CERT=/pylon-tls/ca.crt
      - NODE_TLS_DOMAIN=${TLS_DOMAIN}
      - NODE_HOSTNAME=$(hostname)
      - NODE_WORKLOAD_HOST=127.0.0.1
      # Run real OCI containers on this node (the moat: workload-on-host). Hold
      # the NodeControl stream so the orchestrator can dispatch here. The
      # NodeControl endpoint is co-hosted on fleet (\$NODE_FLEET_ENDPOINT) — no
      # extra port. Set CONTAINER_RUNTIME=sim / NODE_CONTROL=off for a pure relay.
      - CONTAINER_RUNTIME=${PYLON_RUNTIME:-bollard}
      - NODE_CONTROL=on
      # Platform CurseForge key for AUTO_CURSEFORGE modpack installs (optional).
      - CF_API_KEY=\${CF_API_KEY:-}
      # Off-node backups: when set, the node uploads volume snapshots to object
      # storage directly (console/exec/logs/files all work without these). Unset =
      # backups are sized but not stored. Pass via the environment when installing.
      - S3_ENDPOINT=\${S3_ENDPOINT:-}
      - S3_ACCESS_KEY=\${S3_ACCESS_KEY:-}
      - S3_SECRET_KEY=\${S3_SECRET_KEY:-}
      - S3_BUCKET_BACKUPS=\${S3_BUCKET_BACKUPS:-}
      - S3_FORCE_PATH_STYLE=\${S3_FORCE_PATH_STYLE:-true}
${VM_ENV}
YAML

# Pull the agent image explicitly so a registry problem is reported before any
# container is touched, then create or refresh the compose project.
printf '%s\n' "==> pulling ${IMAGE}"
docker pull "$IMAGE"
printf '%s\n' "==> starting node-agent"
docker compose -f "$INSTALL_DIR/docker-compose.yml" up -d --remove-orphans

# Closing summary: a blank separator line, the success notice, and a hint for
# managing the project afterwards.
cat <<EOF

✅ Node connected to Pylon. It will appear in your host panel → Nodes within a few seconds.
   Manage: docker compose -f $INSTALL_DIR/docker-compose.yml [logs|down|pull]
EOF
