Backup RFP Infinity controller state before Resolume changes
Some checks failed
WLED CI / wled_build (push) Has been cancelled
Deploy Nightly / wled_build (push) Has been cancelled
Deploy Nightly / Deploy nightly (push) Has been cancelled

This commit is contained in:
jan
2026-05-14 12:31:13 +02:00
parent ebc4498d89
commit 4bc4e1257e
33 changed files with 3482 additions and 695 deletions

View File

@@ -14,7 +14,7 @@ import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
from typing import Any, Iterable
import requests
@@ -27,12 +27,23 @@ class WledHost:
ip: str
name: str
version: str
release: str
arch: str
@dataclass
class WledInfo(WledHost):
    """A ``WledHost`` extended with runtime details from the device's info response."""

    # Device uptime in seconds, parsed from the "uptime" field of the info payload.
    uptime_s: int
    # The decoded info payload, kept whole so callers can inspect fields
    # that are not modelled as attributes.
    raw: dict[str, Any]
@dataclass
class OtaPreflight:
    """Diagnostics collected for a single target before an OTA upload is attempted."""

    # Result of the info probe; None when the device's info could not be read.
    info: WledInfo | None
    # Short status word for the /update page probe
    # ("ok", "warning", "blocked" or "unreachable").
    update_status: str
    # Human-readable explanation that accompanies update_status.
    update_hint: str
    # Size of the firmware image on disk, in bytes.
    firmware_size: int
    # Human-readable note relating the firmware size to any OTA-space
    # figure found in the info payload.
    ota_space_hint: str
def _run(cmd: list[str]) -> str:
@@ -86,8 +97,9 @@ def probe_wled_info(ip: str, timeout_s: float) -> WledInfo | None:
except (requests.RequestException, ValueError):
return None
# WLED info endpoint typically includes "name", "ver", and "arch"
# WLED info endpoint typically includes "name", "ver", "release"/"rel", and "arch".
ver = str(data.get("ver", "")).strip()
release = str(data.get("release") or data.get("rel") or "").strip()
name = str(data.get("name", "")).strip()
arch = str(data.get("arch", "")).strip()
if not ver:
@@ -99,14 +111,14 @@ def probe_wled_info(ip: str, timeout_s: float) -> WledInfo | None:
arch = "-"
uptime_s = int(data.get("uptime", 0) or 0)
return WledInfo(ip=ip, name=name, version=ver, arch=arch, uptime_s=uptime_s)
return WledInfo(ip=ip, name=name, version=ver, release=release, arch=arch, uptime_s=uptime_s, raw=data)
def probe_wled(ip: str, timeout_s: float) -> WledHost | None:
    """Probe *ip* and return its identity as a plain ``WledHost``.

    Returns ``None`` when ``probe_wled_info`` cannot obtain usable info from
    the address.
    """
    info = probe_wled_info(ip, timeout_s)
    if info is None:
        return None
    # Copy only the identity fields; uptime and the raw payload are dropped.
    # ``release`` is a required WledHost field and must be forwarded.
    return WledHost(
        ip=info.ip,
        name=info.name,
        version=info.version,
        release=info.release,
        arch=info.arch,
    )
def discover_hosts(
@@ -143,8 +155,8 @@ def read_targets_file(path: Path) -> list[str]:
def write_discovery(path: Path, hosts: list[WledHost]) -> None:
    """Write the discovered hosts to *path* as a small text table.

    The parent directory is created if needed. The file starts with a
    ``#`` header line followed by one line per host; a missing release is
    written as ``-`` so every row has the same number of columns.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    header = "# ip name version release arch"
    rows = [f"{h.ip} {h.name} {h.version} {h.release or '-'} {h.arch}" for h in hosts]
    path.write_text("\n".join([header, *rows]) + "\n", encoding="utf-8")
@@ -176,22 +188,198 @@ def wait_for_online_info(ip: str, timeout_s: float, interval_s: float) -> WledIn
return None
def reboot_confirmed(before: WledInfo | None, after: WledInfo, offline_seen: bool) -> tuple[bool, str]:
def reboot_confirmed(
    before: WledInfo | None,
    after: WledInfo,
    offline_seen: bool,
    transport_reset_seen: bool,
    expected_release: str | None,
) -> tuple[bool, str]:
    """Decide whether the device demonstrably rebooted after an upload.

    Evidence is checked from strongest to weakest:

    1. an observed offline transition;
    2. an uptime that dropped by more than 5 seconds relative to *before*
       (only possible when the device was profiled before the upload);
    3. a transport reset during the upload combined with a passing
       ``release_matches`` check on *after*.

    Args:
        before: Info captured before the upload, or ``None`` if unavailable.
        after: Info captured once the device was reachable again.
        offline_seen: Whether the device was seen going offline.
        transport_reset_seen: Whether the upload connection was dropped.
        expected_release: Release name to require, or ``None`` for no check.

    Returns:
        ``(confirmed, reason)`` where *reason* is a human-readable explanation.
    """
    if offline_seen:
        return True, "offline transition observed"

    # release_matches only compares strings, so evaluate it once for both
    # transport-reset branches below.
    release_ok, release_reason = release_matches(after.release, expected_release)
    transport_proof = transport_reset_seen and release_ok

    if before is None:
        if transport_proof:
            return True, f"transport reset during upload and {release_reason}"
        return False, "device was not profiled before upload, and no offline transition was observed"

    if after.uptime_s + 5 < before.uptime_s:
        return True, f"uptime reset from {before.uptime_s}s to {after.uptime_s}s"

    if transport_proof:
        # Report the actual release-check outcome: when no expected release
        # is configured the check passes trivially, and claiming that the
        # "expected release is present" would be misleading.
        return True, (
            f"transport reset during upload and {release_reason} "
            f"({before.uptime_s}s -> {after.uptime_s}s; weak proof when flashing the same release)"
        )
    return False, f"device stayed reachable and uptime did not reset ({before.uptime_s}s -> {after.uptime_s}s)"
def ota_flash(ip: str, firmware: Path, connect_timeout_s: float, read_timeout_s: float) -> tuple[str, str]:
def release_matches(actual: str, expected: str | None) -> tuple[bool, str]:
if not expected:
return True, "no expected release configured"
if actual == expected:
return True, f"release matches {expected}"
if not actual:
return False, f"release is not exposed; expected {expected}"
return False, f"release mismatch: expected {expected}, got {actual}"
def _iter_numeric_fields(value: Any, prefix: str = "") -> Iterable[tuple[str, int]]:
if isinstance(value, dict):
for key, nested in value.items():
nested_prefix = f"{prefix}.{key}" if prefix else str(key)
yield from _iter_numeric_fields(nested, nested_prefix)
elif isinstance(value, list):
for index, nested in enumerate(value):
yield from _iter_numeric_fields(nested, f"{prefix}[{index}]")
elif isinstance(value, (int, float)) and not isinstance(value, bool):
yield prefix, int(value)
def ota_space_hint(info: WledInfo | None, firmware_size: int) -> str:
    """Describe how the firmware size relates to any OTA-space figure the device exposes.

    Every positive numeric field of the raw info payload whose path looks
    OTA-related is considered, and the largest one is compared against
    *firmware_size*. The result is a human-readable hint, not a verdict.
    """
    if info is None:
        return "unknown, /json/info was not reachable"

    memory_tokens = ("heap", "psram", "ram")
    ota_tokens = ("sketch", "ota", "update")

    def looks_like_ota_slot(path: str) -> bool:
        name = path.lower()
        # Runtime memory fields such as totalheap/freeheap are not OTA slots,
        # yet "totalheap" contains the substring "ota"; rule them out first.
        if any(token in name for token in memory_tokens):
            return False
        return any(token in name for token in ota_tokens)

    # Track the largest candidate; on ties the first one seen wins.
    best: tuple[str, int] | None = None
    for path, number in _iter_numeric_fields(info.raw):
        if number <= 0 or not looks_like_ota_slot(path):
            continue
        if best is None or number > best[1]:
            best = (path, number)

    # WLED does not consistently expose ESP.getFreeSketchSpace() in /json/info.
    if best is None:
        return "not exposed by this firmware; if OTA still fails, USB-clean-flash may be required"

    best_key, best_value = best
    if best_value < firmware_size:
        return f"WARNING: {best_key}={best_value} bytes is smaller than firmware={firmware_size} bytes"
    return f"{best_key}={best_value} bytes, firmware={firmware_size} bytes"
def probe_update_page(ip: str, timeout_s: float) -> tuple[str, str]:
    """Fetch ``http://<ip>/update`` and classify whether OTA looks usable.

    Returns:
        ``(status, hint)`` where *status* is one of ``"unreachable"``,
        ``"blocked"``, ``"warning"`` or ``"ok"`` and *hint* explains why.
    """
    try:
        resp = requests.get(f"http://{ip}/update", timeout=timeout_s)
    except requests.RequestException as exc:
        return "unreachable", f"GET /update failed: {exc}"

    body = (resp.text or "").lower()
    code = resp.status_code

    # HTTP status takes priority over anything found in the page text.
    if code in (401, 403):
        return "blocked", f"HTTP {code}; OTA may be locked or authentication may be required"
    if code >= 400:
        return "warning", f"HTTP {code}"

    lock_markers = ("ota lock", "ota locked", "locked", "forbidden", "incorrect pin")
    form_markers = ("update", "upload", "firmware")

    def mentions(markers: tuple[str, ...]) -> bool:
        return any(marker in body for marker in markers)

    if mentions(lock_markers):
        return "blocked", "update page suggests OTA is locked or PIN/auth is required"
    if mentions(form_markers):
        return "ok", "update page reachable"
    return "warning", "update page reachable, but expected upload form text was not detected"
def preflight(ip: str, firmware: Path, timeout_s: float) -> OtaPreflight:
    """Collect pre-upload diagnostics for one target.

    Reads the firmware size from disk, probes the device's info and its
    /update page, and bundles the findings into an ``OtaPreflight``.
    """
    size = firmware.stat().st_size
    device_info = probe_wled_info(ip, timeout_s=timeout_s)
    status, hint = probe_update_page(ip, timeout_s=timeout_s)
    space = ota_space_hint(device_info, size)
    return OtaPreflight(
        info=device_info,
        update_status=status,
        update_hint=hint,
        firmware_size=size,
        ota_space_hint=space,
    )
def print_preflight(index: int, total: int, ip: str, result: OtaPreflight) -> None:
    """Print the preflight findings for one target, one line per finding.

    Every line is prefixed with ``[index/total] ip`` so output from several
    targets stays attributable.
    """
    tag = f"[{index}/{total}] {ip}"
    info = result.info

    if info is not None:
        summary = (
            f"{tag}: current firmware {info.version}, release '{info.release or '-'}', uptime {info.uptime_s}s, "
            f"name '{info.name}', arch '{info.arch or '-'}'"
        )
    else:
        summary = f"{tag}: /json/info unavailable"

    report = (
        summary,
        f"{tag}: firmware size {result.firmware_size} bytes",
        f"{tag}: OTA space hint: {result.ota_space_hint}",
        f"{tag}: /update preflight: {result.update_status} ({result.update_hint})",
    )
    for line in report:
        print(line)
def request_reboot(ip: str, timeout_s: float) -> tuple[bool, str]:
    """Ask the device to reboot by posting ``{"rb": true}`` to ``/json/state``.

    Returns:
        ``(sent, message)``; *sent* is ``False`` when the request raised or
        the device answered with an HTTP status of 400 or above.
    """
    try:
        resp = requests.post(f"http://{ip}/json/state", json={"rb": True}, timeout=timeout_s)
    except requests.RequestException as exc:
        return False, f"reboot request failed: {exc}"

    if resp.status_code < 400:
        return True, "reboot requested via /json/state"
    return False, f"reboot request returned HTTP {resp.status_code}"
def classify_update_response(text: str) -> tuple[str, str]:
    """Classify the body returned by an OTA upload as ``"ok"`` or ``"failed"``.

    A response counts as failed only when it carries an explicit failure
    marker and no success marker. Everything else, including empty or
    unrecognised bodies, is reported as ``("ok", "ok")``.

    Returns:
        ``(status, message)``; for failures the message embeds up to 180
        characters of the whitespace-normalised response.
    """
    # str.split() without arguments already discards leading/trailing whitespace.
    normalized = " ".join((text or "").split())
    lowered = normalized.lower()

    reports_success = "update successful" in lowered or "rebooting" in lowered
    reports_failure = "update failed" in lowered or "could not activate the firmware" in lowered

    if reports_failure and not reports_success:
        # A failure marker was found, so the snippet can never be empty here.
        return "failed", f"device reported update failure: {normalized[:180]}"

    # WLED message pages contain generic scripts and wording; unknown 200 OK
    # responses are verified by the reboot/release checks instead of string
    # guessing here.
    return "ok", "ok"
def ota_flash(
ip: str,
firmware: Path,
connect_timeout_s: float,
read_timeout_s: float,
skip_validation: bool,
backend: str,
) -> tuple[str, str]:
# Send skipValidation both as query and multipart field. Different WLED-MM
# builds have used different request parameter paths around OTA validation.
url = f"http://{ip}/update"
if skip_validation:
url += "?skipValidation=1"
if backend == "curl":
cmd = [
"curl",
"-sS",
"--connect-timeout",
str(max(1, int(connect_timeout_s))),
"--max-time",
str(max(1, int(read_timeout_s))),
"-F",
f"update=@{firmware}",
]
if skip_validation:
cmd += ["-F", "skipValidation=1"]
cmd.append(url)
try:
proc = subprocess.run(cmd, check=False, capture_output=True, text=True)
except OSError as exc:
return "failed", f"curl unavailable or failed to start: {exc}"
combined = " ".join((proc.stdout + " " + proc.stderr).strip().split())
snippet = combined[:180]
if proc.returncode == 0:
return classify_update_response(combined)
# WLED often closes the socket or reboots before curl receives a clean
# response. Treat transport drops as uncertain and prove success later.
if proc.returncode in (52, 55, 56):
return "transport_reset", f"curl exit {proc.returncode}: {snippet or 'connection dropped during/after upload'}"
if proc.returncode == 28:
return "uncertain", f"curl exit {proc.returncode}: {snippet or 'upload timed out'}"
return "failed", f"curl exit {proc.returncode}: {snippet or 'empty response'}"
try:
with firmware.open("rb") as fh:
resp = requests.post(
url,
data={"skipValidation": "1"} if skip_validation else None,
files={"update": (firmware.name, fh, "application/octet-stream")},
timeout=(connect_timeout_s, read_timeout_s),
)
@@ -200,16 +388,15 @@ def ota_flash(ip: str, firmware: Path, connect_timeout_s: float, read_timeout_s:
return "uncertain", "read timeout after upload"
except requests.ConnectionError:
# Some devices close the socket abruptly when rebooting after successful OTA.
return "uncertain", "connection dropped during/after upload"
return "transport_reset", "connection dropped during/after upload"
except requests.RequestException as exc:
return "failed", f"request failed: {exc}"
text = (resp.text or "").lower()
text = resp.text or ""
snippet = " ".join(text.strip().split())[:180]
if resp.status_code >= 400:
return "failed", f"http {resp.status_code}"
if "fail" in text or "error" in text:
return "failed", "device reported update failure"
return "ok", "ok"
return "failed", f"http {resp.status_code}: {snippet or 'empty response'}"
return classify_update_response(text)
def print_hosts(hosts: list[WledHost]) -> None:
@@ -217,10 +404,11 @@ def print_hosts(hosts: list[WledHost]) -> None:
print("No WLED devices found.")
return
print(f"Found {len(hosts)} WLED device(s):")
print(f"{'IP':<16} {'Name':<24} {'Version':<18} {'Arch'}")
print("-" * 80)
print(f"{'IP':<16} {'Name':<24} {'Version':<18} {'Release':<30} {'Arch'}")
print("-" * 112)
for h in hosts:
print(f"{h.ip:<16} {h.name[:24]:<24} {h.version[:18]:<18} {h.arch}")
release = h.release or "-"
print(f"{h.ip:<16} {h.name[:24]:<24} {h.version[:18]:<18} {release[:30]:<30} {h.arch}")
def cmd_discover(args: argparse.Namespace) -> int:
@@ -285,35 +473,64 @@ def cmd_flash(args: argparse.Namespace) -> int:
failures: list[str] = []
for idx, ip in enumerate(targets, start=1):
before = probe_wled_info(ip, timeout_s=args.timeout)
if before is not None:
print(
f"[{idx}/{len(targets)}] {ip}: current firmware {before.version}, "
f"uptime {before.uptime_s}s, name '{before.name}'"
)
check = preflight(ip=ip, firmware=firmware, timeout_s=args.timeout)
print_preflight(idx, len(targets), ip, check)
before = check.info
if args.preflight_only:
if before is not None and args.expect_release:
release_ok, release_reason = release_matches(before.release, args.expect_release)
level = "ok" if release_ok else "warning"
print(f"[{idx}/{len(targets)}] {ip}: current release check: {level} ({release_reason})")
if before is None or check.update_status in ("blocked", "unreachable"):
print(f"[{idx}/{len(targets)}] {ip}: FAILED (preflight did not prove OTA readiness)")
failures.append(ip)
continue
if check.update_status == "blocked" and not args.ignore_preflight_warnings:
print(f"[{idx}/{len(targets)}] {ip}: FAILED (OTA preflight is blocked; use --ignore-preflight-warnings to try anyway)")
failures.append(ip)
continue
if args.skip_if_release_matches and before is not None and args.expect_release:
release_ok, release_reason = release_matches(before.release, args.expect_release)
if release_ok:
print(f"[{idx}/{len(targets)}] {ip}: SKIP ({release_reason}; use --force-same-release to reflash anyway)")
continue
print(f"[{idx}/{len(targets)}] {ip}: uploading...")
status, msg = ota_flash(
ip=ip,
firmware=firmware,
connect_timeout_s=args.connect_timeout,
read_timeout_s=args.upload_timeout,
skip_validation=args.skip_validation,
backend=args.upload_backend,
)
if status == "failed":
print(f"[{idx}/{len(targets)}] {ip}: FAILED ({msg})")
failures.append(ip)
continue
if status == "uncertain":
transport_reset_seen = status == "transport_reset"
if status in ("uncertain", "transport_reset"):
print(f"[{idx}/{len(targets)}] {ip}: upload response uncertain ({msg}), verifying via reboot check...")
else:
print(f"[{idx}/{len(targets)}] {ip}: uploaded, waiting {args.reboot_wait:.1f}s for reboot...")
time.sleep(args.reboot_wait)
forced_reboot_used = False
offline_seen = wait_for_offline(ip=ip, timeout_s=args.offline_timeout, interval_s=0.5)
if offline_seen:
print(f"[{idx}/{len(targets)}] {ip}: reboot detected, device went offline.")
else:
print(f"[{idx}/{len(targets)}] {ip}: warning, no offline transition observed. Checking uptime reset...")
if args.force_reboot_after_upload:
reboot_sent, reboot_msg = request_reboot(ip=ip, timeout_s=args.timeout)
print(f"[{idx}/{len(targets)}] {ip}: forced reboot fallback: {reboot_msg}")
if reboot_sent:
forced_reboot_used = True
time.sleep(args.reboot_wait)
offline_seen = wait_for_offline(ip=ip, timeout_s=args.offline_timeout, interval_s=0.5)
if offline_seen:
print(f"[{idx}/{len(targets)}] {ip}: forced reboot detected, device went offline.")
after = wait_for_online_info(ip=ip, timeout_s=args.online_timeout, interval_s=1.0)
if after is None:
@@ -321,16 +538,30 @@ def cmd_flash(args: argparse.Namespace) -> int:
failures.append(ip)
continue
reboot_ok, reason = reboot_confirmed(before=before, after=after, offline_seen=offline_seen)
reboot_ok, reason = reboot_confirmed(
before=before,
after=after,
offline_seen=offline_seen,
transport_reset_seen=transport_reset_seen,
expected_release=args.expect_release,
)
if not reboot_ok:
print(f"[{idx}/{len(targets)}] {ip}: FAILED (could not prove reboot: {reason})")
failures.append(ip)
continue
release_ok, release_reason = release_matches(after.release, args.expect_release)
if not release_ok:
print(f"[{idx}/{len(targets)}] {ip}: FAILED ({release_reason})")
failures.append(ip)
continue
print(
f"[{idx}/{len(targets)}] {ip}: OK "
f"(now {after.version}, uptime {after.uptime_s}s, {reason})"
f"(now {after.version}, release '{after.release or '-'}', uptime {after.uptime_s}s, {reason}, {release_reason})"
)
if forced_reboot_used:
print(f"[{idx}/{len(targets)}] {ip}: warning, reboot was forced after an uncertain upload; verify the firmware manually in /json/info")
if failures:
print("\nFailed targets:")
@@ -338,7 +569,10 @@ def cmd_flash(args: argparse.Namespace) -> int:
print(f"- {ip}")
return 1
print("\nAll targets flashed successfully.")
if args.preflight_only:
print("\nAll targets passed preflight.")
else:
print("\nAll targets flashed successfully.")
return 0
@@ -367,6 +601,15 @@ def build_parser() -> argparse.ArgumentParser:
p_flash.add_argument("--reboot-wait", type=float, default=10.0, help="Sleep after upload before online check")
p_flash.add_argument("--offline-timeout", type=float, default=20.0, help="How long to wait for the device to disappear during reboot")
p_flash.add_argument("--online-timeout", type=float, default=60.0, help="How long to wait for device to come back")
p_flash.add_argument("--preflight-only", action="store_true", help="Only print /json/info and /update diagnostics, do not upload")
p_flash.add_argument("--ignore-preflight-warnings", action="store_true", help="Try upload even if /update preflight looks blocked")
p_flash.add_argument("--force-reboot-after-upload", action="store_true", help="After uncertain upload with no offline transition, request reboot via /json/state and verify uptime")
p_flash.add_argument("--expect-release", help="Require /json/info release/rel to match this value after OTA")
p_flash.add_argument("--skip-if-release-matches", action="store_true", help="Skip a target when its current release already matches --expect-release")
p_flash.add_argument("--force-same-release", dest="skip_if_release_matches", action="store_false", help="Reflash even when the current release already matches --expect-release")
p_flash.add_argument("--skip-validation", action="store_true", help="Send WLED skipValidation=1 for controlled migrations between release names")
p_flash.add_argument("--upload-backend", choices=("curl", "requests"), default="curl", help="HTTP upload implementation (default: curl, matching WLED helper scripts)")
p_flash.set_defaults(skip_if_release_matches=False)
p_flash.set_defaults(func=cmd_flash)
return parser