fix(modbus): zadne vecne pending v journalu + flock timeout + EV poll backoff
Zivy incident home-01 (TeltoCharge .16): zapis 15/19-20 koncil failed s prazdnym error_msg, nebo zustal trvale pending a zablokoval exportni ticky.

- _gateway_exclusive: neblokujici flock s deadline (EMS_MODBUS_FLOCK_TIMEOUT_S, default 20 s) -> GatewayLockTimeout misto starvation bez limitu
- execute_modbus_commands: invariant written/failed + neprazdny error_msg (str(e) or repr(e)); safety net pres BaseException (CancelledError, chyba DB); journal update mimo retry cyklus zarizeni; force_disconnect bez zamku brany
- telemetry poll_ev_chargers: po 3 selhanich backoff 5 min per (host,port,unit) - mrtvy unit_id drzi branu 4x8=32 s z kazde minuty
- testy backend/tests/test_modbus_execute_failsafe.py; docs modbus-command-journal.md (sekce Robustnost zapisu + konfigurace)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -198,6 +198,27 @@ def _modbus_command_contiguous_runs(cmds: list[asyncpg.Record]) -> list[list[asy
|
||||
return runs
|
||||
|
||||
|
||||
def _modbus_error_text(e: BaseException) -> str:
|
||||
"""Text chyby pro error_msg — nikdy prázdný (TimeoutError() apod. má str '')."""
|
||||
return str(e).strip() or repr(e)
|
||||
|
||||
|
||||
async def _mark_commands_failed(
|
||||
db: asyncpg.Connection, cmd_ids: list[int], error_msg: str
|
||||
) -> None:
|
||||
for cid in cmd_ids:
|
||||
await db.execute(
|
||||
"""
|
||||
UPDATE ems.modbus_command
|
||||
SET status='failed', error_msg=$1,
|
||||
attempt_count=attempt_count+1
|
||||
WHERE id=$2
|
||||
""",
|
||||
error_msg,
|
||||
cid,
|
||||
)
|
||||
|
||||
|
||||
async def execute_modbus_commands(
|
||||
command_ids: list[int],
|
||||
db: asyncpg.Connection,
|
||||
@@ -205,6 +226,10 @@ async def execute_modbus_commands(
|
||||
"""
|
||||
Zapíše příkazy z modbus_command do zařízení (FC 0x10 po souvislých blocích).
|
||||
Aktualizuje status na 'written' nebo 'failed'.
|
||||
|
||||
Invariant: žádný z předaných příkazů nesmí zůstat 'pending' — i při
|
||||
CancelledError / GatewayLockTimeout / chybě DB se zbylé řádky označí
|
||||
failed s neprázdným error_msg (safety net níže) a výjimka se propaguje.
|
||||
"""
|
||||
max_retries = 3
|
||||
retry_delay = 0.5
|
||||
@@ -226,67 +251,99 @@ async def execute_modbus_commands(
|
||||
(cmd["device_host"], int(cmd["device_port"]), int(cmd["device_unit_id"]))
|
||||
].append(cmd)
|
||||
|
||||
#: Ještě nerozhodnuté příkazy (pro safety net při výjimce mimo retry cyklus).
|
||||
unresolved: set[int] = {int(c["id"]) for c in rows}
|
||||
|
||||
all_ok = True
|
||||
for (host, port, unit), group in by_gw.items():
|
||||
client = await get_modbus_client(host, port)
|
||||
for run in _modbus_command_contiguous_runs(group):
|
||||
start_reg = int(run[0]["register"])
|
||||
values = [int(c["value_to_write"]) for c in run]
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
await client.write_registers(start_reg, values, unit)
|
||||
for cmd, val in zip(run, values):
|
||||
cid = int(cmd["id"])
|
||||
await db.execute(
|
||||
"""
|
||||
UPDATE ems.modbus_command
|
||||
SET status='written', value_written=$1, written_at=now(),
|
||||
attempt_count=attempt_count+1, error_msg=NULL
|
||||
WHERE id=$2
|
||||
""",
|
||||
val,
|
||||
cid,
|
||||
)
|
||||
logger.info(
|
||||
"[cmd %s] %s 0x%04X=%s OK batch@%s (attempt %s)",
|
||||
cid,
|
||||
cmd["asset_code"],
|
||||
int(cmd["register"]),
|
||||
val,
|
||||
start_reg,
|
||||
attempt + 1,
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
if attempt < max_retries - 1:
|
||||
logger.warning(
|
||||
"Modbus batch write 0x%04X count=%s attempt %s failed: %s, retrying...",
|
||||
start_reg,
|
||||
len(values),
|
||||
attempt + 1,
|
||||
e,
|
||||
)
|
||||
await asyncio.sleep(retry_delay)
|
||||
await client.force_disconnect()
|
||||
else:
|
||||
for cmd in run:
|
||||
await db.execute(
|
||||
"""
|
||||
UPDATE ems.modbus_command
|
||||
SET status='failed', error_msg=$1,
|
||||
attempt_count=attempt_count+1
|
||||
WHERE id=$2
|
||||
""",
|
||||
str(e),
|
||||
int(cmd["id"]),
|
||||
try:
|
||||
for (host, port, unit), group in by_gw.items():
|
||||
client = await get_modbus_client(host, port)
|
||||
for run in _modbus_command_contiguous_runs(group):
|
||||
start_reg = int(run[0]["register"])
|
||||
values = [int(c["value_to_write"]) for c in run]
|
||||
write_err: Exception | None = None
|
||||
attempts_used = 0
|
||||
for attempt in range(max_retries):
|
||||
attempts_used = attempt + 1
|
||||
try:
|
||||
await client.write_registers(start_reg, values, unit)
|
||||
write_err = None
|
||||
break
|
||||
except Exception as e:
|
||||
write_err = e
|
||||
if attempt < max_retries - 1:
|
||||
logger.warning(
|
||||
"Modbus batch write 0x%04X count=%s attempt %s failed: %s, retrying...",
|
||||
start_reg,
|
||||
len(values),
|
||||
attempt + 1,
|
||||
_modbus_error_text(e),
|
||||
)
|
||||
logger.error(
|
||||
"Modbus batch 0x%04X count=%s all %s attempts failed: %s",
|
||||
start_reg,
|
||||
len(values),
|
||||
max_retries,
|
||||
e,
|
||||
)
|
||||
all_ok = False
|
||||
await asyncio.sleep(retry_delay)
|
||||
try:
|
||||
await client.force_disconnect()
|
||||
except Exception as de:
|
||||
logger.warning(
|
||||
"Modbus force_disconnect %s:%s failed: %s",
|
||||
host,
|
||||
port,
|
||||
_modbus_error_text(de),
|
||||
)
|
||||
|
||||
if write_err is not None:
|
||||
err = _modbus_error_text(write_err)
|
||||
await _mark_commands_failed(db, [int(c["id"]) for c in run], err)
|
||||
for c in run:
|
||||
unresolved.discard(int(c["id"]))
|
||||
logger.error(
|
||||
"Modbus batch 0x%04X count=%s all %s attempts failed: %s",
|
||||
start_reg,
|
||||
len(values),
|
||||
max_retries,
|
||||
err,
|
||||
)
|
||||
all_ok = False
|
||||
continue
|
||||
|
||||
# Journal update mimo retry cyklus — chyba DB nesmí vyvolat
|
||||
# další zápis do zařízení; spadne do safety netu níže.
|
||||
for cmd, val in zip(run, values):
|
||||
cid = int(cmd["id"])
|
||||
await db.execute(
|
||||
"""
|
||||
UPDATE ems.modbus_command
|
||||
SET status='written', value_written=$1, written_at=now(),
|
||||
attempt_count=attempt_count+1, error_msg=NULL
|
||||
WHERE id=$2
|
||||
""",
|
||||
val,
|
||||
cid,
|
||||
)
|
||||
unresolved.discard(cid)
|
||||
logger.info(
|
||||
"[cmd %s] %s 0x%04X=%s OK batch@%s (attempt %s)",
|
||||
cid,
|
||||
cmd["asset_code"],
|
||||
int(cmd["register"]),
|
||||
val,
|
||||
start_reg,
|
||||
attempts_used,
|
||||
)
|
||||
except BaseException as e:
|
||||
# Safety net: CancelledError (shutdown / zrušený task), GatewayLockTimeout
|
||||
# propadlý mimo retry cyklus, chyba DB v success větvi, … — nic nesmí
|
||||
# zůstat 'pending'. Best effort: označit a výjimku propagovat dál.
|
||||
err = f"execute aborted: {_modbus_error_text(e)}"
|
||||
try:
|
||||
await _mark_commands_failed(db, sorted(unresolved), err)
|
||||
except Exception as me:
|
||||
logger.error(
|
||||
"Modbus journal: nelze označit %s příkazů failed (%s): %s",
|
||||
len(unresolved),
|
||||
err,
|
||||
_modbus_error_text(me),
|
||||
)
|
||||
logger.error("execute_modbus_commands aborted: %s", err)
|
||||
raise
|
||||
|
||||
return all_ok
|
||||
|
||||
Reference in New Issue
Block a user