fix(modbus): zadne vecne pending v journalu + flock timeout + EV poll backoff

Zivy incident home-01 (TeltoCharge .16): zapis 15/19-20 koncil failed
s prazdnym error_msg, nebo zustal trvale pending a zablokoval exportni ticky.

- _gateway_exclusive: neblokujici flock s deadline (EMS_MODBUS_FLOCK_TIMEOUT_S,
  default 20 s) -> GatewayLockTimeout misto starvation bez limitu
- execute_modbus_commands: invariant written/failed + neprazdny error_msg
  (str(e) or repr(e)); safety net pres BaseException (CancelledError, chyba DB);
  journal update mimo retry cyklus zarizeni; force_disconnect bez zamku brany
- telemetry poll_ev_chargers: po 3 selhanich backoff 5 min per (host,port,unit)
  - mrtvy unit_id drzi branu 4x8=32 s z kazde minuty
- testy backend/tests/test_modbus_execute_failsafe.py; docs
  modbus-command-journal.md (sekce Robustnost zapisu + konfigurace)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dusan Vojacek
2026-06-13 00:17:04 +02:00
parent fb9d0f107a
commit b08782525e
5 changed files with 499 additions and 72 deletions

View File

@@ -198,6 +198,27 @@ def _modbus_command_contiguous_runs(cmds: list[asyncpg.Record]) -> list[list[asy
return runs
def _modbus_error_text(e: BaseException) -> str:
    """Return a non-empty, human-readable text for *e*, suitable for ``error_msg``.

    ``str(e)`` (stripped) is preferred. Some exceptions stringify to an empty
    string (e.g. a bare ``TimeoutError()``), in which case ``repr(e)`` is used.
    If even the repr is blank (custom ``__repr__``), the exception class name
    is returned so the result is never empty.
    """
    text = str(e).strip() or repr(e)
    # Last resort: a blank repr would otherwise break the "never empty"
    # guarantee that the journal's error_msg column relies on.
    return text if text.strip() else type(e).__name__
async def _mark_commands_failed(
    db: asyncpg.Connection, cmd_ids: list[int], error_msg: str
) -> None:
    """Mark the given ``ems.modbus_command`` rows as failed.

    One UPDATE is issued per id, in the order given: it sets ``status`` to
    ``'failed'``, stores *error_msg* and increments ``attempt_count``.
    Nothing is executed for an empty *cmd_ids*. Any error raised by
    ``db.execute`` propagates to the caller.
    """
    mark_failed_sql = """
        UPDATE ems.modbus_command
        SET status='failed', error_msg=$1,
            attempt_count=attempt_count+1
        WHERE id=$2
        """
    for command_id in cmd_ids:
        await db.execute(mark_failed_sql, error_msg, command_id)
async def execute_modbus_commands(
command_ids: list[int],
db: asyncpg.Connection,
@@ -205,6 +226,10 @@ async def execute_modbus_commands(
"""
Zapíše příkazy z modbus_command do zařízení (FC 0x10 po souvislých blocích).
Aktualizuje status na 'written' nebo 'failed'.
Invariant: žádný z předaných příkazů nesmí zůstat 'pending' — i při
CancelledError / GatewayLockTimeout / chybě DB se zbylé řádky označí
failed s neprázdným error_msg (safety net níže) a výjimka se propaguje.
"""
max_retries = 3
retry_delay = 0.5
@@ -226,67 +251,99 @@ async def execute_modbus_commands(
(cmd["device_host"], int(cmd["device_port"]), int(cmd["device_unit_id"]))
].append(cmd)
#: Ještě nerozhodnuté příkazy (pro safety net při výjimce mimo retry cyklus).
unresolved: set[int] = {int(c["id"]) for c in rows}
all_ok = True
for (host, port, unit), group in by_gw.items():
client = await get_modbus_client(host, port)
for run in _modbus_command_contiguous_runs(group):
start_reg = int(run[0]["register"])
values = [int(c["value_to_write"]) for c in run]
for attempt in range(max_retries):
try:
await client.write_registers(start_reg, values, unit)
for cmd, val in zip(run, values):
cid = int(cmd["id"])
await db.execute(
"""
UPDATE ems.modbus_command
SET status='written', value_written=$1, written_at=now(),
attempt_count=attempt_count+1, error_msg=NULL
WHERE id=$2
""",
val,
cid,
)
logger.info(
"[cmd %s] %s 0x%04X=%s OK batch@%s (attempt %s)",
cid,
cmd["asset_code"],
int(cmd["register"]),
val,
start_reg,
attempt + 1,
)
break
except Exception as e:
if attempt < max_retries - 1:
logger.warning(
"Modbus batch write 0x%04X count=%s attempt %s failed: %s, retrying...",
start_reg,
len(values),
attempt + 1,
e,
)
await asyncio.sleep(retry_delay)
await client.force_disconnect()
else:
for cmd in run:
await db.execute(
"""
UPDATE ems.modbus_command
SET status='failed', error_msg=$1,
attempt_count=attempt_count+1
WHERE id=$2
""",
str(e),
int(cmd["id"]),
try:
for (host, port, unit), group in by_gw.items():
client = await get_modbus_client(host, port)
for run in _modbus_command_contiguous_runs(group):
start_reg = int(run[0]["register"])
values = [int(c["value_to_write"]) for c in run]
write_err: Exception | None = None
attempts_used = 0
for attempt in range(max_retries):
attempts_used = attempt + 1
try:
await client.write_registers(start_reg, values, unit)
write_err = None
break
except Exception as e:
write_err = e
if attempt < max_retries - 1:
logger.warning(
"Modbus batch write 0x%04X count=%s attempt %s failed: %s, retrying...",
start_reg,
len(values),
attempt + 1,
_modbus_error_text(e),
)
logger.error(
"Modbus batch 0x%04X count=%s all %s attempts failed: %s",
start_reg,
len(values),
max_retries,
e,
)
all_ok = False
await asyncio.sleep(retry_delay)
try:
await client.force_disconnect()
except Exception as de:
logger.warning(
"Modbus force_disconnect %s:%s failed: %s",
host,
port,
_modbus_error_text(de),
)
if write_err is not None:
err = _modbus_error_text(write_err)
await _mark_commands_failed(db, [int(c["id"]) for c in run], err)
for c in run:
unresolved.discard(int(c["id"]))
logger.error(
"Modbus batch 0x%04X count=%s all %s attempts failed: %s",
start_reg,
len(values),
max_retries,
err,
)
all_ok = False
continue
# Journal update mimo retry cyklus — chyba DB nesmí vyvolat
# další zápis do zařízení; spadne do safety netu níže.
for cmd, val in zip(run, values):
cid = int(cmd["id"])
await db.execute(
"""
UPDATE ems.modbus_command
SET status='written', value_written=$1, written_at=now(),
attempt_count=attempt_count+1, error_msg=NULL
WHERE id=$2
""",
val,
cid,
)
unresolved.discard(cid)
logger.info(
"[cmd %s] %s 0x%04X=%s OK batch@%s (attempt %s)",
cid,
cmd["asset_code"],
int(cmd["register"]),
val,
start_reg,
attempts_used,
)
except BaseException as e:
# Safety net: CancelledError (shutdown / zrušený task), GatewayLockTimeout
# propadlý mimo retry cyklus, chyba DB v success větvi, … — nic nesmí
# zůstat 'pending'. Best effort: označit a výjimku propagovat dál.
err = f"execute aborted: {_modbus_error_text(e)}"
try:
await _mark_commands_failed(db, sorted(unresolved), err)
except Exception as me:
logger.error(
"Modbus journal: nelze označit %s příkazů failed (%s): %s",
len(unresolved),
err,
_modbus_error_text(me),
)
logger.error("execute_modbus_commands aborted: %s", err)
raise
return all_ok