Stabilize discovery lifecycle and rescan summary

This commit is contained in:
Artem Kokos
2026-05-16 10:59:31 +07:00
parent 15529961d6
commit 1ac66ec4ac
8 changed files with 604 additions and 124 deletions

View File

@@ -32,6 +32,8 @@ UI: `http://<host>:8000/`
IGNIS_API_KEY=change-me
APP_TIMEZONE=Asia/Novosibirsk
SCAN_NETWORK=
DISCOVERY_INTERVAL_SECONDS=600
DISCOVERY_BACKGROUND_MISSING_THRESHOLD=2
LOG_LEVEL=INFO
EVENT_LOG_RETENTION_DAYS=30
```
@@ -45,8 +47,12 @@ IGNIS_SYNC_DATABASE_URL=sqlite:///./ignis.db
Замечание по discovery:
- если на хосте есть VPN или несколько интерфейсов, лучше явно задать `SCAN_NETWORK`
- если `SCAN_NETWORK` не задан, сервер сам выбирает private IPv4-подсети обычных интерфейсов и старается не сканировать VPN / docker / tunnel-интерфейсы
- если на хосте есть VPN или несколько интерфейсов, всё равно лучше явно задать `SCAN_NETWORK`
- формат: `192.168.0.0/24` или список через запятую
- startup scan выполняется до старта фонового цикла
- background refresh по умолчанию удаляет устройство только после двух подряд промахов discovery
- manual `POST /devices/rescan` удаляет оффлайн-устройства сразу и возвращает summary (`found`, `added`, `updated`, `removed_offline`, `pending_removal`, `online`)
## Авторизация
@@ -162,7 +168,7 @@ curl -X POST 'http://localhost:8000/schedules/once' \
timeout 120s .venv/bin/python -m unittest discover -s tests -v
```
Сейчас есть 17 тестов. Покрыты:
Сейчас есть 25 тестов. Покрыты:
- auth и роли
- lifecycle API-ключей
@@ -170,6 +176,9 @@ timeout 120s .venv/bin/python -m unittest discover -s tests -v
- валидация scene
- one-shot и cron расписания
- миграция legacy jobs
- auto-subnet selection для discovery
- background offline cleanup threshold
- manual rescan summary и immediate cleanup
- агрегация stats без двойного счёта `*_requested`
## Ограничения

View File

@@ -1,10 +1,12 @@
import logging
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import select
from app.core.state import state_manager, discovery_service
from app.core.database import async_session
from app.models.device import GroupModel, GroupCreateSchema
from app.api.schemas import RescanResponse
from app.api.deps import verify_token, require_admin
from app.core.database import async_session
from app.core.state import state_manager, discovery_service
from app.models.device import GroupModel, GroupCreateSchema
from app.drivers.wiz import WizDriver
logger = logging.getLogger(__name__)
@@ -61,25 +63,14 @@ async def delete_group(group_id: str):
return {"status": "deleted", "id": group_id}
@router.post("/rescan", dependencies=[Depends(require_admin)])
@router.post(
"/rescan",
dependencies=[Depends(require_admin)],
response_model=RescanResponse,
)
async def rescan_network():
found_devices = await discovery_service.scan_network()
# MAC-адреса найденных ламп
found_macs = {dev["mac"] for dev in found_devices}
# Удаляем устройства, которые не ответили (оффлайн)
offline_macs = [mac for mac in state_manager.devices if mac not in found_macs]
for mac in offline_macs:
del state_manager.devices[mac]
logger.info(f"Устройство {mac} не ответило -- убрано из списка")
# Обновляем/добавляем найденные
for dev_data in found_devices:
state_manager.update_device(dev_data)
summary = await discovery_service.manual_refresh(state_manager)
return {
"status": "ok",
"found": len(found_macs),
"removed_offline": len(offline_macs),
**summary.to_dict(),
}

View File

@@ -168,3 +168,13 @@ class ScheduleTasksResponse(BaseModel):
class DeleteStatusResponse(BaseModel):
status: Literal["deleted"]
class RescanResponse(BaseModel):
    """Response body of the manual rescan endpoint: a status flag plus discovery counters."""

    # Always the literal "ok"; the route handler sets it next to the summary fields.
    status: Literal["ok"]
    # Number of distinct devices (keyed by MAC) that answered the scan.
    found: int
    # Found devices that were not in the in-memory state before this scan.
    added: int
    # Found devices that were already present in the in-memory state.
    updated: int
    # Devices dropped from the state during this scan because they did not answer.
    removed_offline: int
    # Size of the missed-scan counter map when the summary was built.
    pending_removal: int
    # Number of devices held in the in-memory state after the scan was applied.
    online: int

View File

@@ -1,64 +1,224 @@
import asyncio
import ipaddress
import json
import socket
import logging
import os
import ipaddress
from typing import List, Dict
import socket
import struct
from dataclasses import dataclass
from typing import Dict, List
try:
import fcntl
except ImportError: # pragma: no cover - не на Linux
fcntl = None
logger = logging.getLogger(__name__)
# Минимальный допустимый prefixlen (больше число = меньше сеть)
# /16 = 65534 хоста, /8 = 16M хостов -- слишком много
MIN_PREFIX_LEN = 16
ENV_MIN_PREFIX_LEN = 16
AUTO_MIN_PREFIX_LEN = 24
DEFAULT_DISCOVERY_INTERVAL_SECONDS = 600
DEFAULT_BACKGROUND_MISSING_THRESHOLD = 2
EXCLUDED_INTERFACE_PREFIXES = (
"lo",
"docker",
"br-",
"veth",
"virbr",
"tun",
"tap",
"wg",
"tailscale",
"zt",
"utun",
"ppp",
)
SIOCGIFADDR = 0x8915
SIOCGIFNETMASK = 0x891B
@dataclass(frozen=True)
class InterfaceSubnet:
    """Immutable record of one local interface's IPv4 address and network.

    Instances are the candidates that automatic subnet selection chooses from.
    """

    # Interface name as reported by socket.if_nameindex().
    name: str
    # IPv4 address assigned to the interface.
    address: ipaddress.IPv4Address
    # Network built from the interface address and its netmask (strict=False).
    network: ipaddress.IPv4Network
class DiscoveryService:
def __init__(self, port: int = 38899):
self.port = port
self.discover_msg = {"method": "getPilot", "params": {}}
self._scan_lock = asyncio.Lock()
def _env_int(
    self,
    name: str,
    default: int,
    *,
    minimum: int,
    maximum: int | None = None,
) -> int:
    """Read an integer setting from the environment without raising.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank or not an integer.
        minimum: Smallest accepted value; lower values are raised to it.
        maximum: Largest accepted value, or ``None`` for no upper bound.

    Returns:
        The parsed value clamped into ``[minimum, maximum]``, or ``default``.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        # A blank entry (``NAME=`` in an env file) means "not configured".
        return default
    try:
        value = int(raw)
    except ValueError:
        logger.warning(
            "Некорректное значение %s=%r, использую значение по умолчанию %s",
            name,
            raw,
            default,
        )
        return default

    bounded = max(value, minimum)
    if maximum is not None:
        bounded = min(bounded, maximum)
    if bounded != value:
        logger.warning(
            "Значение %s=%s вне допустимого диапазона, использую %s",
            name,
            value,
            bounded,
        )
    return bounded

def _env_min_prefix_len(self) -> int:
    """Return the minimum prefix length allowed for SCAN_NETWORK entries.

    Read from ``DISCOVERY_ENV_MIN_PREFIX_LEN``; kept inside 0..32 so the value
    is always a valid IPv4 prefix length.
    """
    return self._env_int(
        "DISCOVERY_ENV_MIN_PREFIX_LEN",
        ENV_MIN_PREFIX_LEN,
        minimum=0,
        maximum=32,
    )

def _auto_min_prefix_len(self) -> int:
    """Return the minimum prefix length used for auto-selected subnets.

    Read from ``DISCOVERY_AUTO_MIN_PREFIX_LEN``; kept inside 0..32 so the value
    is always a valid IPv4 prefix length.
    """
    return self._env_int(
        "DISCOVERY_AUTO_MIN_PREFIX_LEN",
        AUTO_MIN_PREFIX_LEN,
        minimum=0,
        maximum=32,
    )

def _background_interval_seconds(self) -> int:
    """Return the pause between background scans, in seconds.

    Read from ``DISCOVERY_INTERVAL_SECONDS``; never below 1 so a zero or
    negative setting cannot turn the background loop into back-to-back scans.
    """
    return self._env_int(
        "DISCOVERY_INTERVAL_SECONDS",
        DEFAULT_DISCOVERY_INTERVAL_SECONDS,
        minimum=1,
    )

def _background_missing_threshold(self) -> int:
    """Return how many consecutive missed scans remove a device in background mode.

    Read from ``DISCOVERY_BACKGROUND_MISSING_THRESHOLD``; values below 1 are
    treated as 1, which removes a device on its first miss.
    """
    return self._env_int(
        "DISCOVERY_BACKGROUND_MISSING_THRESHOLD",
        DEFAULT_BACKGROUND_MISSING_THRESHOLD,
        minimum=1,
    )
def _parse_env_subnets(self, value: str) -> List[str]:
subnets: list[str] = []
min_prefix_len = self._env_min_prefix_len()
for raw_subnet in value.split(","):
subnet = raw_subnet.strip()
if not subnet:
continue
def _get_target_subnets(self) -> List[str]:
"""
Определяет список подсетей для сканирования.
Приоритет:
1. Переменная окружения SCAN_NETWORK (можно через запятую: "192.168.0.0/24,192.168.1.0/24")
2. Автоопределение по дефолтному шлюзу
"""
env_network = os.getenv("SCAN_NETWORK")
if env_network:
subnets = []
for s in env_network.split(","):
s = s.strip()
try:
net = ipaddress.IPv4Network(s, strict=False)
if net.prefixlen < MIN_PREFIX_LEN:
network = ipaddress.IPv4Network(subnet, strict=False)
except ValueError as exc:
logger.error("Неверный формат подсети %s: %s", subnet, exc)
continue
if network.prefixlen < min_prefix_len:
logger.warning(
f"Подсеть {s} слишком большая (/{net.prefixlen}), "
f"ограничиваю до /{MIN_PREFIX_LEN}"
"Подсеть %s слишком большая (/%s), ограничиваю до /%s",
subnet,
network.prefixlen,
min_prefix_len,
)
net = ipaddress.IPv4Network(
f"{net.network_address}/{MIN_PREFIX_LEN}", strict=False
network = ipaddress.IPv4Network(
f"{network.network_address}/{min_prefix_len}", strict=False
)
subnets.append(str(net))
except ValueError as e:
logger.error(f"Неверный формат подсети {s}: {e}")
return subnets if subnets else ["192.168.1.0/24"]
subnets.append(str(network))
# Автоопределение
return subnets
def _interface_subnets(self) -> list[InterfaceSubnet]:
if fcntl is None:
return []
candidates: list[InterfaceSubnet] = []
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
for _, interface_name in socket.if_nameindex():
ifreq = struct.pack("256s", interface_name.encode("utf-8")[:15])
try:
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
# Коннект не создает трафика, но заставляет ОС выбрать нужный интерфейс
s.connect(("8.8.8.8", 80))
local_ip = s.getsockname()[0]
network = ipaddress.IPv4Network(f"{local_ip}/24", strict=False)
return [str(network)]
except Exception as e:
address = socket.inet_ntoa(
fcntl.ioctl(sock.fileno(), SIOCGIFADDR, ifreq)[20:24]
)
netmask = socket.inet_ntoa(
fcntl.ioctl(sock.fileno(), SIOCGIFNETMASK, ifreq)[20:24]
)
except OSError:
continue
ipv4 = ipaddress.IPv4Address(address)
if ipv4.is_loopback or ipv4.is_link_local:
continue
network = ipaddress.IPv4Network(f"{address}/{netmask}", strict=False)
candidates.append(
InterfaceSubnet(
name=interface_name,
address=ipv4,
network=network,
)
)
return candidates
def _is_excluded_interface(self, interface_name: str) -> bool:
    """Tell whether the interface name starts with one of the excluded prefixes.

    The comparison is case-insensitive: the name is lower-cased before it is
    matched against ``EXCLUDED_INTERFACE_PREFIXES``.
    """
    return interface_name.lower().startswith(EXCLUDED_INTERFACE_PREFIXES)
def _normalize_auto_network(
    self, candidate: InterfaceSubnet
) -> ipaddress.IPv4Network:
    """Return the network to scan for an auto-detected interface.

    A network wider than the configured automatic minimum prefix length is
    narrowed to the segment of that length containing the interface address;
    any other network keeps its own prefix length.
    """
    floor_prefix = self._auto_min_prefix_len()
    own_prefix = candidate.network.prefixlen

    if own_prefix >= floor_prefix:
        scan_prefix = own_prefix
    else:
        # The interface network is wider than allowed: scan only the local segment.
        scan_prefix = floor_prefix
        logger.info(
            "Авто-discovery: подсеть %s (%s) шире /%s, сканирую локальный сегмент /%s",
            candidate.network,
            candidate.name,
            floor_prefix,
            scan_prefix,
        )

    return ipaddress.IPv4Network(f"{candidate.address}/{scan_prefix}", strict=False)
def _collect_auto_subnets(self) -> list[str]:
    """Pick the subnets to scan from the local interfaces.

    Private addresses are preferred over public ones, and interfaces that are
    not on the exclusion list over excluded ones. Each preference is dropped
    if it would leave nothing to scan. The result holds normalized networks
    as strings, without duplicates, in interface order; it is empty when no
    interface information is available.
    """
    interfaces = self._interface_subnets()
    if not interfaces:
        return []

    private_only = [item for item in interfaces if item.address.is_private]
    pool = private_only if private_only else interfaces

    not_excluded = [
        item for item in pool if not self._is_excluded_interface(item.name)
    ]
    chosen = not_excluded if not_excluded else pool

    # dict.fromkeys keeps first-seen order, so it de-duplicates without reordering.
    unique_subnets = list(
        dict.fromkeys(str(self._normalize_auto_network(item)) for item in chosen)
    )

    if unique_subnets:
        logger.info(
            "Авто-discovery: выбраны подсети %s",
            ", ".join(unique_subnets),
        )
    return unique_subnets
def _fallback_subnet(self) -> list[str]:
try:
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
sock.connect(("8.8.8.8", 80))
local_ip = sock.getsockname()[0]
except Exception as exc:
logger.error(
f"Discovery Error: Не удалось определить подсеть автоматически: {e}"
"Discovery Error: Не удалось определить подсеть автоматически: %s",
exc,
)
return ["192.168.1.0/24"]
network = ipaddress.IPv4Network(
f"{local_ip}/{self._auto_min_prefix_len()}",
strict=False,
)
logger.info(
"Авто-discovery fallback: использую локальный сегмент %s", network
)
return [str(network)]
def _get_target_subnets(self) -> List[str]:
env_network = os.getenv("SCAN_NETWORK", "").strip()
if env_network:
subnets = self._parse_env_subnets(env_network)
return subnets if subnets else ["192.168.1.0/24"]
auto_subnets = self._collect_auto_subnets()
if auto_subnets:
return auto_subnets
return self._fallback_subnet()
async def scan_network(self, timeout: float = 2.0) -> List[Dict]:
subnets = self._get_target_subnets()
found_devices = []
@@ -69,9 +229,9 @@ class DiscoveryService:
loop = asyncio.get_running_loop()
message = json.dumps(self.discover_msg).encode()
logger.debug(f"Начинаю сканирование сетей: {', '.join(subnets)}...")
logger.debug("Начинаю сканирование сетей: %s...", ", ".join(subnets))
# Рассылаем запросы по всем целевым сетям
try:
for subnet in subnets:
try:
network = ipaddress.IPv4Network(subnet)
@@ -80,54 +240,108 @@ class DiscoveryService:
sock.sendto(message, (str(ip), self.port))
except Exception:
continue
except ValueError as e:
logger.error(f"Неверный формат подсети {subnet}: {e}")
except ValueError as exc:
logger.error("Неверный формат подсети %s: %s", subnet, exc)
# Собираем ответы
start_time = loop.time()
while (loop.time() - start_time) < timeout:
try:
# Используем небольшой таймаут на чтение, чтобы успевать выходить из цикла
data, addr = await asyncio.wait_for(
loop.run_in_executor(None, sock.recvfrom, 1024), timeout=0.2
)
resp = json.loads(data.decode())
if "result" in resp:
res = resp["result"]
mac = res.get("mac")
if mac:
if "result" not in resp:
continue
result = resp["result"]
mac = result.get("mac")
if not mac:
continue
found_devices.append(
{
"mac": mac,
"ip": addr[0],
"state": {
"on": res.get("state"),
"dimming": res.get("dimming"),
"temp": res.get("temp"),
"on": result.get("state"),
"dimming": result.get("dimming"),
"temp": result.get("temp"),
},
}
)
logger.info(f" [+] Найдена лампа: {addr[0]} | MAC: {mac}")
logger.info(" [+] Найдена лампа: %s | MAC: %s", addr[0], mac)
except (asyncio.TimeoutError, json.JSONDecodeError):
continue
except Exception:
await asyncio.sleep(0.01)
continue
finally:
sock.close()
# Фильтруем дубликаты
return list({d["mac"]: d for d in found_devices}.values())
async def start_background_discovery(self, state_manager, interval=600):
"""Запускает бесконечный цикл сканирования."""
return list({device["mac"]: device for device in found_devices}.values())
async def _refresh_devices(
    self,
    state_manager,
    *,
    mode: str,
    remove_missing: bool,
    missing_threshold: int,
    timeout: float = 2.0,
):
    """Scan the network once and apply the result to ``state_manager``.

    The scan and the state update run under ``self._scan_lock``, so
    overlapping refresh requests are handled one after another.

    Args:
        state_manager: Object exposing ``apply_discovery_snapshot``.
        mode: Label written to the log line ("startup", "manual", "background").
        remove_missing: Forwarded to ``apply_discovery_snapshot``.
        missing_threshold: Forwarded to ``apply_discovery_snapshot``.
        timeout: Forwarded to ``scan_network``.

    Returns:
        The summary object returned by ``apply_discovery_snapshot``.
    """
    async with self._scan_lock:
        snapshot = await self.scan_network(timeout=timeout)
        summary = state_manager.apply_discovery_snapshot(
            snapshot,
            remove_missing=remove_missing,
            missing_threshold=missing_threshold,
        )

    logger.info(
        "Discovery (%s): found=%s added=%s updated=%s removed=%s pending_removal=%s online=%s",
        mode,
        summary.found,
        summary.added,
        summary.updated,
        summary.removed_offline,
        summary.pending_removal,
        summary.online,
    )
    return summary
async def startup_refresh(self, state_manager, timeout: float = 2.0):
    """Run the startup discovery pass.

    Devices that do not answer are removed on the first miss.
    Returns whatever ``_refresh_devices`` returns.
    """
    policy = {"mode": "startup", "remove_missing": True, "missing_threshold": 1}
    return await self._refresh_devices(state_manager, timeout=timeout, **policy)
async def manual_refresh(self, state_manager, timeout: float = 2.0):
    """Run a user-requested discovery pass.

    Devices that do not answer are removed on the first miss.
    Returns whatever ``_refresh_devices`` returns.
    """
    policy = {"mode": "manual", "remove_missing": True, "missing_threshold": 1}
    return await self._refresh_devices(state_manager, timeout=timeout, **policy)
async def background_refresh(self, state_manager, timeout: float = 2.0):
    """Run one periodic discovery pass.

    A device is removed only after the configured number of consecutive
    misses, read from ``_background_missing_threshold``.
    Returns whatever ``_refresh_devices`` returns.
    """
    threshold = self._background_missing_threshold()
    return await self._refresh_devices(
        state_manager,
        mode="background",
        remove_missing=True,
        missing_threshold=threshold,
        timeout=timeout,
    )
async def start_background_discovery(self, state_manager, interval: int | None = None):
interval_seconds = interval or self._background_interval_seconds()
while True:
await asyncio.sleep(interval_seconds)
try:
found_devices = await self.scan_network()
for dev_data in found_devices:
state_manager.update_device(dev_data)
logger.info(f"Discovery: онлайн {len(state_manager.devices)} устройств")
except Exception as e:
logger.error(f"Discovery background error: {e}")
await asyncio.sleep(interval)
await self.background_refresh(state_manager)
except Exception as exc:
logger.error("Discovery background error: %s", exc)

View File

@@ -1,28 +1,96 @@
from dataclasses import asdict, dataclass
import logging
from typing import Dict, List, Optional
from typing import Dict, List
from app.models.device import DeviceSchema, GroupModel
from app.core.discovery import DiscoveryService
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class DiscoveryApplyResult:
    """Immutable summary of applying one discovery scan to the device state."""

    # Number of distinct MAC addresses present in the scan result.
    found: int
    # Scanned devices that were not in the state before the scan.
    added: int
    # Scanned devices that were already in the state.
    updated: int
    # Devices removed from the state during this pass.
    removed_offline: int
    # Number of entries in the missed-scan counter map after this pass.
    pending_removal: int
    # Number of devices in the state after this pass.
    online: int

    def to_dict(self) -> dict[str, int]:
        """Return the counters as a plain dict keyed by field name."""
        return asdict(self)
class StateManager:
def __init__(self):
# Храним устройства как Pydantic объекты
self.devices: Dict[str, DeviceSchema] = {}
# Группы как модели SQLAlchemy
self.groups: Dict[str, GroupModel] = {}
# Сколько подряд циклов discovery устройство не видно
self._missing_scan_counts: Dict[str, int] = {}
def update_device(self, device_data: dict):
"""Обновляет или добавляет устройство в состояние."""
mac = device_data["mac"]
# Используем DeviceSchema вместо Device
current = self.devices.get(mac)
device = DeviceSchema(
id=mac, ip=device_data["ip"], name=f"WiZ {mac[-4:]}", room="Default"
id=mac,
ip=device_data["ip"],
name=current.name if current else f"WiZ {mac[-4:]}",
room=current.room if current else "Default",
)
self.devices[mac] = device
self._missing_scan_counts.pop(mac, None)
def apply_discovery_snapshot(
    self,
    found_devices: list[dict],
    *,
    remove_missing: bool,
    missing_threshold: int = 1,
) -> DiscoveryApplyResult:
    """Merge one scan result into the device state and report what changed.

    Args:
        found_devices: Scan entries, each carrying at least a ``"mac"`` key;
            when a MAC repeats, the last entry wins.
        remove_missing: When true, devices absent from the scan get their
            miss counter incremented and may be removed.
        missing_threshold: Consecutive misses at which an absent device is
            removed from the state.

    Returns:
        A ``DiscoveryApplyResult`` with the counters for this pass.
    """
    snapshot = {entry["mac"]: entry for entry in found_devices}

    added = 0
    updated = 0
    for mac, payload in snapshot.items():
        # Classify before update_device() inserts the MAC into self.devices.
        already_known = mac in self.devices
        self.update_device(payload)
        if already_known:
            updated += 1
        else:
            added += 1

    removed_offline = 0
    if remove_missing:
        # Materialize the list first: self.devices is mutated inside the loop.
        absent = [mac for mac in self.devices if mac not in snapshot]
        for mac in absent:
            misses = self._missing_scan_counts.get(mac, 0) + 1
            self._missing_scan_counts[mac] = misses

            if misses >= missing_threshold:
                self.devices.pop(mac, None)
                self._missing_scan_counts.pop(mac, None)
                removed_offline += 1
                logger.info("Устройство %s не ответило -- убрано из списка", mac)
            else:
                logger.info(
                    "Устройство %s не ответило (%s/%s), оставляю до следующего цикла",
                    mac,
                    misses,
                    missing_threshold,
                )

    return DiscoveryApplyResult(
        found=len(snapshot),
        added=added,
        updated=updated,
        removed_offline=removed_offline,
        pending_removal=len(self._missing_scan_counts),
        online=len(self.devices),
    )
def get_group_ips(self, group_id: str) -> List[str]:
"""Возвращает список IP всех ламп, входящих в группу."""

View File

@@ -31,10 +31,13 @@ async def lifespan(app: FastAPI):
state_manager.groups[g.id] = g
logger.info(f"📂 Загружена группа: {g.name}")
# 3. Планировщик после загрузки метаданных групп
# 3. Startup discovery до старта фонового цикла
await discovery_service.startup_refresh(state_manager)
# 4. Планировщик после загрузки метаданных групп
await start_scheduler()
# 4. Фоновый Discovery
# 5. Фоновый Discovery
discovery_task = asyncio.create_task(
discovery_service.start_background_discovery(state_manager)
)

View File

@@ -176,7 +176,9 @@
"description": "Successful Response",
"content": {
"application/json": {
"schema": {}
"schema": {
"$ref": "#/components/schemas/RescanResponse"
}
}
}
}
@@ -1307,6 +1309,49 @@
"title": "KeyActionRequest",
"description": "Тело запроса для операций с ключом (чтобы токен не летел в URL)."
},
"RescanResponse": {
"properties": {
"status": {
"const": "ok",
"title": "Status"
},
"found": {
"type": "integer",
"title": "Found"
},
"added": {
"type": "integer",
"title": "Added"
},
"updated": {
"type": "integer",
"title": "Updated"
},
"removed_offline": {
"type": "integer",
"title": "Removed Offline"
},
"pending_removal": {
"type": "integer",
"title": "Pending Removal"
},
"online": {
"type": "integer",
"title": "Online"
}
},
"type": "object",
"required": [
"status",
"found",
"added",
"updated",
"removed_offline",
"pending_removal",
"online"
],
"title": "RescanResponse"
},
"ScheduleCreateResponse": {
"properties": {
"status": {

140
tests/test_p1_discovery.py Normal file
View File

@@ -0,0 +1,140 @@
import ipaddress
import os
import unittest
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch
from httpx import ASGITransport, AsyncClient
from sqlalchemy import delete
TEST_DB_PATH = Path(__file__).with_name("test_ignis_discovery.db")
if TEST_DB_PATH.exists():
TEST_DB_PATH.unlink()
MASTER_KEY = "master-secret-for-discovery-tests"
os.environ["IGNIS_API_KEY"] = MASTER_KEY
os.environ["IGNIS_DATABASE_URL"] = f"sqlite+aiosqlite:///{TEST_DB_PATH}"
os.environ["IGNIS_SYNC_DATABASE_URL"] = f"sqlite:///{TEST_DB_PATH}"
import main # noqa: E402
from app.core.database import async_session, init_db # noqa: E402
from app.core.discovery import DiscoveryService, InterfaceSubnet # noqa: E402
from app.core.state import state_manager # noqa: E402
from app.models.device import GroupModel # noqa: E402
class DiscoveryBehaviorTests(unittest.IsolatedAsyncioTestCase):
async def asyncSetUp(self):
os.environ["IGNIS_API_KEY"] = MASTER_KEY
await init_db()
await self._reset_database()
state_manager.devices.clear()
state_manager.groups.clear()
state_manager._missing_scan_counts.clear()
self.client = AsyncClient(
transport=ASGITransport(app=main.app),
base_url="http://testserver",
)
async def asyncTearDown(self):
await self.client.aclose()
state_manager.devices.clear()
state_manager.groups.clear()
state_manager._missing_scan_counts.clear()
async def _reset_database(self):
async with async_session() as session:
await session.execute(delete(GroupModel))
await session.commit()
def _headers(self) -> dict[str, str]:
return {"X-API-Key": MASTER_KEY}
def test_auto_subnets_prefer_non_vpn_private_interfaces(self):
service = DiscoveryService()
candidates = [
InterfaceSubnet(
name="wg0",
address=ipaddress.IPv4Address("10.8.0.2"),
network=ipaddress.IPv4Network("10.8.0.0/24"),
),
InterfaceSubnet(
name="wlan0",
address=ipaddress.IPv4Address("192.168.0.25"),
network=ipaddress.IPv4Network("192.168.0.0/24"),
),
InterfaceSubnet(
name="docker0",
address=ipaddress.IPv4Address("172.17.0.1"),
network=ipaddress.IPv4Network("172.17.0.0/16"),
),
InterfaceSubnet(
name="enp3s0",
address=ipaddress.IPv4Address("192.168.1.20"),
network=ipaddress.IPv4Network("192.168.0.0/23"),
),
]
with patch.dict(os.environ, {}, clear=True):
with patch.object(service, "_interface_subnets", return_value=candidates):
subnets = service._get_target_subnets()
self.assertEqual(subnets, ["192.168.0.0/24", "192.168.1.0/24"])
async def test_manual_rescan_updates_and_removes_devices_immediately(self):
state_manager.devices["stale-device"] = SimpleNamespace(
id="stale-device",
ip="192.168.0.10",
name="Old Lamp",
room="Office",
)
with patch.object(
main.discovery_service,
"scan_network",
AsyncMock(
return_value=[
{
"mac": "fresh-device",
"ip": "192.168.0.20",
"state": {"on": True, "dimming": 100, "temp": 4100},
}
]
),
):
response = await self.client.post(
"/devices/rescan",
headers=self._headers(),
)
self.assertEqual(response.status_code, 200)
payload = response.json()
self.assertEqual(payload["status"], "ok")
self.assertEqual(payload["found"], 1)
self.assertEqual(payload["added"], 1)
self.assertEqual(payload["updated"], 0)
self.assertEqual(payload["removed_offline"], 1)
self.assertEqual(payload["online"], 1)
self.assertEqual(list(state_manager.devices.keys()), ["fresh-device"])
def test_background_cleanup_requires_multiple_misses(self):
state_manager.update_device({"mac": "dev-1", "ip": "192.168.0.10"})
first = state_manager.apply_discovery_snapshot(
[],
remove_missing=True,
missing_threshold=2,
)
self.assertEqual(first.removed_offline, 0)
self.assertEqual(first.pending_removal, 1)
self.assertIn("dev-1", state_manager.devices)
second = state_manager.apply_discovery_snapshot(
[],
remove_missing=True,
missing_threshold=2,
)
self.assertEqual(second.removed_offline, 1)
self.assertEqual(second.pending_removal, 0)
self.assertNotIn("dev-1", state_manager.devices)